From 280dd53c88fb35736563ba1a820dfc765d817aa1 Mon Sep 17 00:00:00 2001
From: Matteo Vedovati <matteo.vedovati.77@gmail.com>
Date: Thu, 12 Sep 2024 18:00:59 +0200
Subject: [PATCH 01/27] Code generator graph creation

---
 scrapegraphai/graphs/__init__.py             |   1 +
 scrapegraphai/graphs/code_generator_graph.py | 145 +++++++++++++++++++
 2 files changed, 146 insertions(+)
 create mode 100644 scrapegraphai/graphs/code_generator_graph.py

diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py
index 966f9978..ebe914fb 100644
--- a/scrapegraphai/graphs/__init__.py
+++ b/scrapegraphai/graphs/__init__.py
@@ -26,3 +26,4 @@ from .markdown_scraper_multi_graph import MDScraperMultiGraph
 from .search_link_graph import SearchLinkGraph
 from .screenshot_scraper_graph import ScreenshotScraperGraph
 from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph
+from .code_generator_graph import CodeGeneratorGraph
\ No newline at end of file
diff --git a/scrapegraphai/graphs/code_generator_graph.py b/scrapegraphai/graphs/code_generator_graph.py
new file mode 100644
index 00000000..e6dd50de
--- /dev/null
+++ b/scrapegraphai/graphs/code_generator_graph.py
@@ -0,0 +1,145 @@
+"""
+SmartScraperGraph Module
+"""
+from typing import Optional
+import logging
+from pydantic import BaseModel
+from .base_graph import BaseGraph
+from .abstract_graph import AbstractGraph
+from ..nodes import (
+    FetchNode,
+    ParseNode,
+    GenerateAnswerNode
+)
+
+class CodeGeneratorGraph(AbstractGraph):
+    """
+    ...
+
+    Attributes:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        schema (BaseModel): The schema for the graph output.
+        llm_model: An instance of a language model client, configured for generating answers.
+        embedder_model: An instance of an embedding model client, 
+        configured for generating embeddings.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+        headless (bool): A flag indicating whether to run the graph in headless mode.
+        library (str): The library used for web scraping (beautiful soup).
+
+    Args:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        schema (BaseModel): The schema for the graph output.
+
+    Example:
+        >>> code_gen = CodeGeneratorGraph(
+        ...     "List me all the attractions in Chioggia.",
+        ...     "https://en.wikipedia.org/wiki/Chioggia",
+        ...     {"llm": {"model": "openai/gpt-3.5-turbo"}}
+        ... )
+        >>> result = code_gen.run()
+        )
+    """
+
+    def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):
+        
+        self.library = config['library']
+        
+        super().__init__(prompt, config, source, schema)
+
+        self.input_key = "url" if source.startswith("http") else "local_dir"
+
+    def _create_graph(self) -> BaseGraph:
+        """
+        Creates the graph of nodes representing the workflow for web scraping.
+
+        Returns:
+            BaseGraph: A graph instance representing the web scraping workflow.
+        """
+        
+        fetch_node = FetchNode(
+            input="url| local_dir",
+            output=["doc"],
+            node_config={
+                "llm_model": self.llm_model,
+                "force": self.config.get("force", False),
+                "cut": self.config.get("cut", True),
+                "loader_kwargs": self.config.get("loader_kwargs", {}),
+                "browser_base": self.config.get("browser_base"),
+                "scrape_do": self.config.get("scrape_do")
+            }
+        )
+        parse_node = ParseNode(
+            input="doc",
+            output=["parsed_doc"],
+            node_config={
+                "llm_model": self.llm_model,
+                "chunk_size": self.model_token
+            }
+        )
+
+        generate_validation_answer_node = GenerateAnswerNode(
+            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
+            output=["answer"],
+            node_config={
+                "llm_model": self.llm_model,
+                "additional_info": self.config.get("additional_info"),
+                "schema": self.schema,
+            }
+        )
+        
+        json_descriptor_node = JsonDescriptorNode(
+            input="user_prompt",
+            output=["json_descriptor"],
+            node_config={
+                "llm_model": self.llm_model,
+                "chunk_size": self.model_token,
+                "schema": self.schema
+            }
+        )
+        
+        generate_code_node = GenerateCodeNode(
+            input="user_prompt & json_descriptor & doc & answer",
+            output=["code"],
+            node_config={
+                "llm_model": self.llm_model,
+                "additional_info": self.config.get("additional_info"),
+                "schema": self.schema
+            },
+            library=self.library,
+            website=self.source
+        )
+
+        return BaseGraph(
+            nodes=[
+                fetch_node,
+                parse_node,
+                generate_validation_answer_node,
+                json_descriptor_node,
+                generate_code_node,
+            ],
+            edges=[
+                (fetch_node, parse_node),
+                (parse_node, generate_validation_answer_node),
+                (generate_validation_answer_node, json_descriptor_node),
+                (json_descriptor_node, generate_code_node)
+            ],
+            entry_point=fetch_node,
+            graph_name=self.__class__.__name__
+        )
+
+    def run(self) -> str:
+        """
+        Executes the scraping process and returns the generated code.
+
+        Returns:
+            str: The generated code.
+        """
+
+        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
+        self.final_state, self.execution_info = self.graph.execute(inputs)
+
+        return self.final_state.get("code", "No code created.")

From 9862425fb13e380986877486404d605631b38a4e Mon Sep 17 00:00:00 2001
From: Matteo Vedovati <matteo.vedovati.77@gmail.com>
Date: Thu, 12 Sep 2024 18:04:37 +0200
Subject: [PATCH 02/27] JsonDescriptorNode created

---
 scrapegraphai/nodes/json_descriptor_node.py | 161 ++++++++++++++++++++
 1 file changed, 161 insertions(+)
 create mode 100644 scrapegraphai/nodes/json_descriptor_node.py

diff --git a/scrapegraphai/nodes/json_descriptor_node.py b/scrapegraphai/nodes/json_descriptor_node.py
new file mode 100644
index 00000000..53507edf
--- /dev/null
+++ b/scrapegraphai/nodes/json_descriptor_node.py
@@ -0,0 +1,161 @@
+"""
+JsonDescriptorNode Module
+"""
+from typing import List, Optional
+from langchain.prompts import PromptTemplate
+from langchain_core.output_parsers import JsonOutputParser
+from langchain_core.runnables import RunnableParallel
+from langchain_core.utils.pydantic import is_basemodel_subclass
+from langchain_openai import ChatOpenAI, AzureChatOpenAI
+from langchain_mistralai import ChatMistralAI
+from langchain_community.chat_models import ChatOllama
+from tqdm import tqdm
+from .base_node import BaseNode
+
+
+class JsonDescriptorNode(BaseNode):
+    """
+    A node that generate a json descriptor using a large language model (LLM) based on the user's input and schema.
+
+    Attributes:
+        llm_model: An instance of a language model client, configured for generating answers.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+
+    Args:
+        input (str): Boolean expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "GenerateAnswer".
+    """
+
+    def __init__(
+        self,
+        input: str,
+        output: List[str],
+        node_config: Optional[dict] = None,
+        node_name: str = "GenerateAnswer",
+    ):
+        super().__init__(node_name, "node", input, output, 2, node_config)
+
+        self.llm_model = node_config["llm_model"]
+
+        if isinstance(node_config["llm_model"], ChatOllama):
+            self.llm_model.format="json"
+
+        self.verbose = (
+            True if node_config is None else node_config.get("verbose", False)
+        )
+        self.force = (
+            False if node_config is None else node_config.get("force", False)
+        )
+        self.script_creator = (
+            False if node_config is None else node_config.get("script_creator", False)
+        )
+        self.is_md_scraper = (
+            False if node_config is None else node_config.get("is_md_scraper", False)
+        )
+
+        self.additional_info = node_config.get("additional_info")
+
+    def execute(self, state: dict) -> dict:
+        """
+        Generates an answer by constructing a prompt from the user's input and the scraped
+        content, querying the language model, and parsing its response.
+
+        Args:
+            state (dict): The current state of the graph. The input keys will be used
+                            to fetch the correct data from the state.
+
+        Returns:
+            dict: The updated state with the output key containing the generated answer.
+
+        Raises:
+            KeyError: If the input keys are not found in the state, indicating
+                      that the necessary information for generating an answer is missing.
+        """
+
+        self.logger.info(f"--- Executing {self.node_name} Node ---")
+
+        input_keys = self.get_input_keys(state)
+        
+        input_data = [state[key] for key in input_keys]
+        user_prompt = input_data[0]
+        doc = input_data[1]
+
+        if self.node_config.get("schema", None) is not None:
+
+            if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)):
+                self.llm_model = self.llm_model.with_structured_output(
+                    schema = self.node_config["schema"],
+                    method="function_calling") # json schema works only on specific models
+                
+                # default parser to empty lambda function
+                output_parser = lambda x: x
+                if is_basemodel_subclass(self.node_config["schema"]):
+                    output_parser = dict
+                format_instructions = "NA"
+            else:
+                output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"])
+                format_instructions = output_parser.get_format_instructions()
+
+        else:
+            output_parser = JsonOutputParser()
+            format_instructions = output_parser.get_format_instructions()
+
+        if isinstance(self.llm_model, (ChatOpenAI, AzureChatOpenAI)) \
+            and not self.script_creator \
+            or self.force \
+            and not self.script_creator or self.is_md_scraper:
+
+            template_no_chunks_prompt  = TEMPLATE_NO_CHUNKS_MD
+            template_chunks_prompt  = TEMPLATE_CHUNKS_MD
+            template_merge_prompt  = TEMPLATE_MERGE_MD
+        else:
+            template_no_chunks_prompt  = TEMPLATE_NO_CHUNKS
+            template_chunks_prompt  = TEMPLATE_CHUNKS
+            template_merge_prompt  = TEMPLATE_MERGE
+
+        if self.additional_info is not None:
+            template_no_chunks_prompt  = self.additional_info + template_no_chunks_prompt
+            template_chunks_prompt  = self.additional_info + template_chunks_prompt
+            template_merge_prompt  = self.additional_info + template_merge_prompt 
+
+        if len(doc) == 1:
+            prompt = PromptTemplate(
+                template=template_no_chunks_prompt ,
+                input_variables=["question"],
+                partial_variables={"context": doc,
+                                    "format_instructions": format_instructions})
+            chain =  prompt | self.llm_model | output_parser
+            answer = chain.invoke({"question": user_prompt})
+
+            state.update({self.output[0]: answer})
+            return state
+
+        chains_dict = {}
+        for i, chunk in enumerate(tqdm(doc, desc="Processing chunks", disable=not self.verbose)):
+
+            prompt = PromptTemplate(
+                template=TEMPLATE_CHUNKS,
+                input_variables=["question"],
+                partial_variables={"context": chunk,
+                                "chunk_id": i + 1,
+                                "format_instructions": format_instructions})
+            chain_name = f"chunk{i+1}"
+            chains_dict[chain_name] = prompt | self.llm_model | output_parser
+
+        async_runner = RunnableParallel(**chains_dict)
+
+        batch_results =  async_runner.invoke({"question": user_prompt})
+
+        merge_prompt = PromptTemplate(
+                template = template_merge_prompt ,
+                input_variables=["context", "question"],
+                partial_variables={"format_instructions": format_instructions},
+            )
+
+        merge_chain = merge_prompt | self.llm_model | output_parser
+        answer = merge_chain.invoke({"context": batch_results, "question": user_prompt})
+
+        state.update({self.output[0]: answer})
+        return state

From 42318a144191bb7a6b576ff4752096864fc6cb05 Mon Sep 17 00:00:00 2001
From: Matteo Vedovati <matteo.vedovati.77@gmail.com>
Date: Thu, 12 Sep 2024 18:35:19 +0200
Subject: [PATCH 03/27] Creation of PromptRefiner

---
 scrapegraphai/graphs/code_generator_graph.py          | 11 ++++++-----
 scrapegraphai/nodes/__init__.py                       |  1 +
 ...json_descriptor_node.py => prompt_refiner_node.py} | 10 ++++++----
 3 files changed, 13 insertions(+), 9 deletions(-)
 rename scrapegraphai/nodes/{json_descriptor_node.py => prompt_refiner_node.py} (94%)

diff --git a/scrapegraphai/graphs/code_generator_graph.py b/scrapegraphai/graphs/code_generator_graph.py
index e6dd50de..58d7f2c0 100644
--- a/scrapegraphai/graphs/code_generator_graph.py
+++ b/scrapegraphai/graphs/code_generator_graph.py
@@ -9,7 +9,8 @@ from .abstract_graph import AbstractGraph
 from ..nodes import (
     FetchNode,
     ParseNode,
-    GenerateAnswerNode
+    GenerateAnswerNode,
+    PromptRefinerNode,
 )
 
 class CodeGeneratorGraph(AbstractGraph):
@@ -91,7 +92,7 @@ class CodeGeneratorGraph(AbstractGraph):
             }
         )
         
-        json_descriptor_node = JsonDescriptorNode(
+        prompt_refier_node = PromptRefinerNode(
             input="user_prompt",
             output=["json_descriptor"],
             node_config={
@@ -118,14 +119,14 @@ class CodeGeneratorGraph(AbstractGraph):
                 fetch_node,
                 parse_node,
                 generate_validation_answer_node,
-                json_descriptor_node,
+                prompt_refier_node,
                 generate_code_node,
             ],
             edges=[
                 (fetch_node, parse_node),
                 (parse_node, generate_validation_answer_node),
-                (generate_validation_answer_node, json_descriptor_node),
-                (json_descriptor_node, generate_code_node)
+                (generate_validation_answer_node, prompt_refier_node),
+                (prompt_refier_node, generate_code_node)
             ],
             entry_point=fetch_node,
             graph_name=self.__class__.__name__
diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py
index 1e990400..27d6883f 100644
--- a/scrapegraphai/nodes/__init__.py
+++ b/scrapegraphai/nodes/__init__.py
@@ -23,3 +23,4 @@ from .merge_generated_scripts import MergeGeneratedScriptsNode
 from .fetch_screen_node import FetchScreenNode
 from .generate_answer_from_image_node import GenerateAnswerFromImageNode
 from .concat_answers_node import ConcatAnswersNode
+from .prompt_refiner_node import PromptRefinerNode
\ No newline at end of file
diff --git a/scrapegraphai/nodes/json_descriptor_node.py b/scrapegraphai/nodes/prompt_refiner_node.py
similarity index 94%
rename from scrapegraphai/nodes/json_descriptor_node.py
rename to scrapegraphai/nodes/prompt_refiner_node.py
index 53507edf..f824134d 100644
--- a/scrapegraphai/nodes/json_descriptor_node.py
+++ b/scrapegraphai/nodes/prompt_refiner_node.py
@@ -1,5 +1,5 @@
 """
-JsonDescriptorNode Module
+PromptRefinerNode Module
 """
 from typing import List, Optional
 from langchain.prompts import PromptTemplate
@@ -13,9 +13,11 @@ from tqdm import tqdm
 from .base_node import BaseNode
 
 
-class JsonDescriptorNode(BaseNode):
+class PromptRefinerNode(BaseNode):
     """
-    A node that generate a json descriptor using a large language model (LLM) based on the user's input and schema.
+    A node that refine the user prompt with the use of the schema and additional context and
+    create a precise prompt in subsequent steps that explicitly link elements in the user's 
+    original input to their corresponding representations in the JSON schema.
 
     Attributes:
         llm_model: An instance of a language model client, configured for generating answers.
@@ -33,7 +35,7 @@ class JsonDescriptorNode(BaseNode):
         input: str,
         output: List[str],
         node_config: Optional[dict] = None,
-        node_name: str = "GenerateAnswer",
+        node_name: str = "JsonDescriptor",
     ):
         super().__init__(node_name, "node", input, output, 2, node_config)
 

From 2a760a1c6f755ea29e297a8c11e3af29c38bb69b Mon Sep 17 00:00:00 2001
From: Matteo Vedovati <matteo.vedovati.77@gmail.com>
Date: Thu, 12 Sep 2024 21:17:46 +0200
Subject: [PATCH 04/27] Initial PromptRefiner prompt

---
 scrapegraphai/graphs/code_generator_graph.py |  4 +-
 scrapegraphai/nodes/prompt_refiner_node.py   | 60 +++++++-------------
 2 files changed, 24 insertions(+), 40 deletions(-)

diff --git a/scrapegraphai/graphs/code_generator_graph.py b/scrapegraphai/graphs/code_generator_graph.py
index 58d7f2c0..174b112d 100644
--- a/scrapegraphai/graphs/code_generator_graph.py
+++ b/scrapegraphai/graphs/code_generator_graph.py
@@ -94,7 +94,7 @@ class CodeGeneratorGraph(AbstractGraph):
         
         prompt_refier_node = PromptRefinerNode(
             input="user_prompt",
-            output=["json_descriptor"],
+            output=["refined_prompt"],
             node_config={
                 "llm_model": self.llm_model,
                 "chunk_size": self.model_token,
@@ -103,7 +103,7 @@ class CodeGeneratorGraph(AbstractGraph):
         )
         
         generate_code_node = GenerateCodeNode(
-            input="user_prompt & json_descriptor & doc & answer",
+            input="refined_prompt & doc & answer",
             output=["code"],
             node_config={
                 "llm_model": self.llm_model,
diff --git a/scrapegraphai/nodes/prompt_refiner_node.py b/scrapegraphai/nodes/prompt_refiner_node.py
index f824134d..3bd219ed 100644
--- a/scrapegraphai/nodes/prompt_refiner_node.py
+++ b/scrapegraphai/nodes/prompt_refiner_node.py
@@ -35,7 +35,7 @@ class PromptRefinerNode(BaseNode):
         input: str,
         output: List[str],
         node_config: Optional[dict] = None,
-        node_name: str = "JsonDescriptor",
+        node_name: str = "PromptRefiner",
     ):
         super().__init__(node_name, "node", input, output, 2, node_config)
 
@@ -82,47 +82,31 @@ class PromptRefinerNode(BaseNode):
         
         input_data = [state[key] for key in input_keys]
         user_prompt = input_data[0]
-        doc = input_data[1]
 
         if self.node_config.get("schema", None) is not None:
 
-            if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)):
-                self.llm_model = self.llm_model.with_structured_output(
-                    schema = self.node_config["schema"],
-                    method="function_calling") # json schema works only on specific models
+            self.schema = self.node_config["schema"]
+            
+            if self.additional_info is not None: # add context to the prompt
+                pass
+
+            template_prompt_builder = """
+            You are tasked with generating a prompt that will guide an LLM in reasoning about how to identify specific elements within an HTML page for data extraction.
+            **Input:**
+            
+            * **User Prompt:** The user's natural language description of the data they want to extract from the HTML page.
+            * **JSON Schema:** A JSON schema representing the desired output structure of the extracted data.
+            * **Additional Information (Optional):** Any supplementary details provided by the user, such as specific HTML patterns they've observed, known challenges in identifying certain elements, or preferences for particular scraping strategies.
+
+            **Output:**
+            """
+            
+            example_prompts = [
+                """
                 
-                # default parser to empty lambda function
-                output_parser = lambda x: x
-                if is_basemodel_subclass(self.node_config["schema"]):
-                    output_parser = dict
-                format_instructions = "NA"
-            else:
-                output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"])
-                format_instructions = output_parser.get_format_instructions()
-
-        else:
-            output_parser = JsonOutputParser()
-            format_instructions = output_parser.get_format_instructions()
-
-        if isinstance(self.llm_model, (ChatOpenAI, AzureChatOpenAI)) \
-            and not self.script_creator \
-            or self.force \
-            and not self.script_creator or self.is_md_scraper:
-
-            template_no_chunks_prompt  = TEMPLATE_NO_CHUNKS_MD
-            template_chunks_prompt  = TEMPLATE_CHUNKS_MD
-            template_merge_prompt  = TEMPLATE_MERGE_MD
-        else:
-            template_no_chunks_prompt  = TEMPLATE_NO_CHUNKS
-            template_chunks_prompt  = TEMPLATE_CHUNKS
-            template_merge_prompt  = TEMPLATE_MERGE
-
-        if self.additional_info is not None:
-            template_no_chunks_prompt  = self.additional_info + template_no_chunks_prompt
-            template_chunks_prompt  = self.additional_info + template_chunks_prompt
-            template_merge_prompt  = self.additional_info + template_merge_prompt 
-
-        if len(doc) == 1:
+                """
+            ]
+            
             prompt = PromptTemplate(
                 template=template_no_chunks_prompt ,
                 input_variables=["question"],

From 545970ce542183e783609f2620866da2577f08ee Mon Sep 17 00:00:00 2001
From: Matteo Vedovati <matteo.vedovati.77@gmail.com>
Date: Fri, 13 Sep 2024 18:17:37 +0200
Subject: [PATCH 05/27] html_reduction script

---
 scrapegraphai/code_gen/html_reduce.py      | 85 ++++++++++++++++++++++
 scrapegraphai/code_gen/script_genarated.py | 64 ++++++++++++++++
 scrapegraphai/code_gen/script_runner.py    |  0
 3 files changed, 149 insertions(+)
 create mode 100644 scrapegraphai/code_gen/html_reduce.py
 create mode 100644 scrapegraphai/code_gen/script_genarated.py
 create mode 100644 scrapegraphai/code_gen/script_runner.py

diff --git a/scrapegraphai/code_gen/html_reduce.py b/scrapegraphai/code_gen/html_reduce.py
new file mode 100644
index 00000000..a25cd5d8
--- /dev/null
+++ b/scrapegraphai/code_gen/html_reduce.py
@@ -0,0 +1,85 @@
+import re
+from bs4 import BeautifulSoup, Comment
+
+
+def minify_html(html):
+    # Remove comments
+    html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)
+    
+    # Remove whitespace between tags
+    html = re.sub(r'>\s+<', '><', html)
+    
+    # Remove whitespace at the beginning and end of tags
+    html = re.sub(r'\s+>', '>', html)
+    html = re.sub(r'<\s+', '<', html)
+    
+    # Collapse multiple whitespace characters into a single space
+    html = re.sub(r'\s+', ' ', html)
+    
+    # Remove spaces around equals signs in attributes
+    html = re.sub(r'\s*=\s*', '=', html)
+    
+    return html.strip()
+
+def reduce_html(html, reduction):
+    """
+    html: str, the HTML content to reduce
+    reduction: 0: minification only,
+               1: minification and removig unnecessary tags and attributes,
+               2: minification, removig unnecessary tags and attributes, simplifying text content, removing of the head tag
+    
+    
+    """
+    if reduction == 0:
+        return minify_html(html)
+    
+    soup = BeautifulSoup(html, 'html.parser')
+    
+    # Remove comments
+    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
+        comment.extract()
+    
+    # Remove script and style tag contents, but keep the tags
+    for tag in soup(['script', 'style']):
+        tag.string = ""
+    
+    # Remove unnecessary attributes, but keep class and id
+    attrs_to_keep = ['class', 'id', 'href', 'src']
+    for tag in soup.find_all(True):
+        for attr in list(tag.attrs):
+            if attr not in attrs_to_keep:
+                del tag[attr]
+                
+    if reduction == 1:
+        return minify_html(str(soup))
+    
+    # Remove script and style tags completely
+    for tag in soup(['script', 'style']):
+        tag.decompose()
+    
+    # Focus only on the body
+    body = soup.body
+    if not body:
+        return "No <body> tag found in the HTML"
+    
+    # Simplify text content
+    for tag in body.find_all(string=True):
+        if tag.parent.name not in ['script', 'style']:
+            tag.replace_with(re.sub(r'\s+', ' ', tag.strip())[:20])
+    
+    # Generate reduced HTML
+    reduced_html = str(body)
+    
+    # Apply minification
+    reduced_html = minify_html(reduced_html)
+    
+    return reduced_html
+
+# Get string with html from example.html
+html = open('example_1.html').read()
+
+reduced_html = reduce_html(html, 2)
+
+# Print the reduced html in result.html
+with open('result_1.html', 'w') as f:
+    f.write(reduced_html)
\ No newline at end of file
diff --git a/scrapegraphai/code_gen/script_genarated.py b/scrapegraphai/code_gen/script_genarated.py
new file mode 100644
index 00000000..ee2d9fc3
--- /dev/null
+++ b/scrapegraphai/code_gen/script_genarated.py
@@ -0,0 +1,64 @@
+from bs4 import BeautifulSoup
+import re
+
+def extract_book_info(html_string):
+    """
+    Extracts book information (title, author, publication date, publisher) from an HTML string using BeautifulSoup.
+
+    Args:
+        html_string: The HTML content as a string.
+
+    Returns:
+        A dictionary containing the extracted book information in the desired JSON schema format.
+    """
+
+    soup = BeautifulSoup(html_string, 'html.parser')
+
+    # Find all book listings
+    book_listings = soup.find_all('div', class_='cc-product-list-item')
+
+    books_data = []
+    for listing in book_listings:
+        # Extract title
+        title_elem = listing.find('a', class_='cc-title')
+        title = title_elem.text.strip() if title_elem else None
+
+        # Extract author
+        author_elem = listing.find('div', class_='cc-author').find('a', class_='cc-author-name')
+        author = author_elem.text.strip() if author_elem else None
+
+        # Extract publisher and publication date
+        publisher_info_elem = listing.find('span', class_='cc-publisher')
+        publisher_info_text = publisher_info_elem.text.strip() if publisher_info_elem else None
+
+        if publisher_info_text:
+            # Assuming publisher name is linked and publication date is the remaining text
+            publisher_elem = publisher_info_elem.find('a', class_='cc-publisher-name')
+            publisher = publisher_elem.text.strip() if publisher_elem else None
+
+            # Use regex to extract year (assuming 4-digit year format)
+            publication_date_match = re.search(r'\b(\d{4})\b', publisher_info_text)
+            publication_date = publication_date_match.group(1) if publication_date_match else None
+        else:
+            publisher = None
+            publication_date = None
+
+        # Create a book dictionary and append to the list
+        book_data = {
+            "title": title,
+            "author": author,
+            "publication_date": publication_date,
+            "publisher": publisher
+        }
+        books_data.append(book_data)
+
+    # Structure the output according to the JSON schema
+    output = {
+        "books": books_data
+    }
+
+    return output
+
+html = open('example_1.html').read()
+result = extract_book_info(html)
+print(result)
\ No newline at end of file
diff --git a/scrapegraphai/code_gen/script_runner.py b/scrapegraphai/code_gen/script_runner.py
new file mode 100644
index 00000000..e69de29b

From 330c22fd5e4ed638e28973b300e3ce7a47e08067 Mon Sep 17 00:00:00 2001
From: Matteo Vedovati <matteo.vedovati.77@gmail.com>
Date: Thu, 19 Sep 2024 10:40:18 +0200
Subject: [PATCH 06/27] Update prompt_refiner_node.py

---
 scrapegraphai/nodes/prompt_refiner_node.py | 124 +++++++++++----------
 1 file changed, 67 insertions(+), 57 deletions(-)

diff --git a/scrapegraphai/nodes/prompt_refiner_node.py b/scrapegraphai/nodes/prompt_refiner_node.py
index 3bd219ed..1748aec0 100644
--- a/scrapegraphai/nodes/prompt_refiner_node.py
+++ b/scrapegraphai/nodes/prompt_refiner_node.py
@@ -3,7 +3,7 @@ PromptRefinerNode Module
 """
 from typing import List, Optional
 from langchain.prompts import PromptTemplate
-from langchain_core.output_parsers import JsonOutputParser
+from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnableParallel
 from langchain_core.utils.pydantic import is_basemodel_subclass
 from langchain_openai import ChatOpenAI, AzureChatOpenAI
@@ -61,8 +61,7 @@ class PromptRefinerNode(BaseNode):
 
     def execute(self, state: dict) -> dict:
         """
-        Generates an answer by constructing a prompt from the user's input and the scraped
-        content, querying the language model, and parsing its response.
+        Generate a refined prompt using the user's prompt, the schema, and additional context.
 
         Args:
             state (dict): The current state of the graph. The input keys will be used
@@ -76,72 +75,83 @@ class PromptRefinerNode(BaseNode):
                       that the necessary information for generating an answer is missing.
         """
 
+        template_prompt_builder = """
+        **Task**: Analyze the user's request and the desired output schema to create a structured description for web scraping. Carefully examine both the user's request and the JSON schema to understand the desired data elements and their relationships.
+
+        **User's Request**:
+        {user_input}
+
+        **Desired JSON Output Schema**:
+        ```json
+        {json_schema}
+        ```
+
+        **Analysis Instructions**:
+        Genarate the breakdown of the user request and link the  elements of the user's request with the json schema
+
+        This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.
+        Please generate only the analysis and no other text.
+
+        **Response**:
+        """
+        
+        template_prompt_builder_with_context = """
+        **Task**: Analyze the user's request, the desired output schema, and the additional context the user provided to create a structured description for web scraping. Carefully examine both the user's request and the JSON schema to understand the desired data elements and their relationships.
+
+        **User's Request**:
+        {user_input}
+
+        **Desired JSON Output Schema**:
+        ```json
+        {json_schema}
+        ```
+        
+        **Additional Context**:
+        {additional_context}
+
+        **Analysis Instructions**:
+        Genarate the breakdown of the user request and link the  elements of the user's request with the json schema
+
+        This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.
+        Please generate only the analysis and no other text.
+
+        **Response**:
+        """
+        
         self.logger.info(f"--- Executing {self.node_name} Node ---")
 
         input_keys = self.get_input_keys(state)
         
         input_data = [state[key] for key in input_keys]
-        user_prompt = input_data[0]
+        user_prompt = input_data[0] #                           get user prompt
 
         if self.node_config.get("schema", None) is not None:
 
-            self.schema = self.node_config["schema"]
+            self.schema = self.node_config["schema"] #          get JSON schema
             
-            if self.additional_info is not None: # add context to the prompt
-                pass
+            if self.additional_info is not None: #              use additional context if present
+                prompt = PromptTemplate(
+                    template=template_prompt_builder_with_context,
+                    partial_variables={"user_input": user_prompt,
+                                        "json_schema": self.schema,
+                                        "additional_context": self.additional_info})
+            else:
+                prompt = PromptTemplate(
+                    template=template_prompt_builder,
+                    partial_variables={"user_input": user_prompt,
+                                        "json_schema": self.schema})
 
-            template_prompt_builder = """
-            You are tasked with generating a prompt that will guide an LLM in reasoning about how to identify specific elements within an HTML page for data extraction.
-            **Input:**
-            
-            * **User Prompt:** The user's natural language description of the data they want to extract from the HTML page.
-            * **JSON Schema:** A JSON schema representing the desired output structure of the extracted data.
-            * **Additional Information (Optional):** Any supplementary details provided by the user, such as specific HTML patterns they've observed, known challenges in identifying certain elements, or preferences for particular scraping strategies.
+            output_parser = StrOutputParser()
 
-            **Output:**
-            """
-            
-            example_prompts = [
-                """
-                
-                """
-            ]
-            
-            prompt = PromptTemplate(
-                template=template_no_chunks_prompt ,
-                input_variables=["question"],
-                partial_variables={"context": doc,
-                                    "format_instructions": format_instructions})
             chain =  prompt | self.llm_model | output_parser
-            answer = chain.invoke({"question": user_prompt})
+            refined_prompt = chain.invoke({})
 
-            state.update({self.output[0]: answer})
+            state.update({self.output[0]: refined_prompt})
             return state
 
-        chains_dict = {}
-        for i, chunk in enumerate(tqdm(doc, desc="Processing chunks", disable=not self.verbose)):
-
-            prompt = PromptTemplate(
-                template=TEMPLATE_CHUNKS,
-                input_variables=["question"],
-                partial_variables={"context": chunk,
-                                "chunk_id": i + 1,
-                                "format_instructions": format_instructions})
-            chain_name = f"chunk{i+1}"
-            chains_dict[chain_name] = prompt | self.llm_model | output_parser
-
-        async_runner = RunnableParallel(**chains_dict)
-
-        batch_results =  async_runner.invoke({"question": user_prompt})
-
-        merge_prompt = PromptTemplate(
-                template = template_merge_prompt ,
-                input_variables=["context", "question"],
-                partial_variables={"format_instructions": format_instructions},
-            )
-
-        merge_chain = merge_prompt | self.llm_model | output_parser
-        answer = merge_chain.invoke({"context": batch_results, "question": user_prompt})
-
-        state.update({self.output[0]: answer})
-        return state
+        else: #                                                no schema provided
+            self.logger.error("No schema provided for prompt refinement.")
+            
+            # TODO: Handle the case where no schema is provided => error handling
+            
+            return state

From a2490e370a4d0d621963ce1ab7e64d25f04e6d2e Mon Sep 17 00:00:00 2001
From: Matteo Vedovati <matteo.vedovati.77@gmail.com>
Date: Thu, 19 Sep 2024 11:01:47 +0200
Subject: [PATCH 07/27] html analyzer node added

---
 scrapegraphai/nodes/__init__.py           |   3 +-
 scrapegraphai/nodes/html_analyzer_node.py | 166 ++++++++++++++++++++++
 2 files changed, 168 insertions(+), 1 deletion(-)
 create mode 100644 scrapegraphai/nodes/html_analyzer_node.py

diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py
index 27d6883f..a5ffb757 100644
--- a/scrapegraphai/nodes/__init__.py
+++ b/scrapegraphai/nodes/__init__.py
@@ -23,4 +23,5 @@ from .merge_generated_scripts import MergeGeneratedScriptsNode
 from .fetch_screen_node import FetchScreenNode
 from .generate_answer_from_image_node import GenerateAnswerFromImageNode
 from .concat_answers_node import ConcatAnswersNode
-from .prompt_refiner_node import PromptRefinerNode
\ No newline at end of file
+from .prompt_refiner_node import PromptRefinerNode
+from .html_analyzer_node import HtmlAnalyzerNode
\ No newline at end of file
diff --git a/scrapegraphai/nodes/html_analyzer_node.py b/scrapegraphai/nodes/html_analyzer_node.py
new file mode 100644
index 00000000..26f8fb17
--- /dev/null
+++ b/scrapegraphai/nodes/html_analyzer_node.py
@@ -0,0 +1,166 @@
+"""
+HtmlAnalyzerNode Module
+"""
+from typing import List, Optional
+from langchain.prompts import PromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables import RunnableParallel
+from langchain_core.utils.pydantic import is_basemodel_subclass
+from langchain_openai import ChatOpenAI, AzureChatOpenAI
+from langchain_mistralai import ChatMistralAI
+from langchain_community.chat_models import ChatOllama
+from tqdm import tqdm
+from .base_node import BaseNode
+
+
+class HtmlAnalyzerNode(BaseNode):
+    """
+    ...
+
+    Attributes:
+        llm_model: An instance of a language model client, configured for generating answers.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+
+    Args:
+        input (str): Boolean expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "HtmlAnalyzer".
+    """
+
+    def __init__(
+        self,
+        input: str,
+        output: List[str],
+        node_config: Optional[dict] = None,
+        node_name: str = "HtmlAnalyzer",
+    ):
+        """
+        Set up the node and read its options from `node_config`.
+
+        Args:
+            input (str): Boolean expression defining the input keys needed from the state.
+            output (List[str]): List of output keys to be updated in the state.
+            node_config (dict): Must contain "llm_model"; may contain "verbose",
+                "force", "script_creator", "is_md_scraper" (each defaults to False)
+                and "additional_info" (defaults to None).
+            node_name (str): The unique identifier name for the node.
+
+        Raises:
+            ValueError: If `node_config` is None.
+            KeyError: If `node_config` has no "llm_model" entry.
+        """
+        super().__init__(node_name, "node", input, output, 2, node_config)
+
+        # `node_config` defaults to None in the signature, but the LLM client is
+        # mandatory: fail with an explicit message instead of a TypeError raised
+        # by the subscript below.
+        if node_config is None:
+            raise ValueError(f"{node_name} requires a node_config containing 'llm_model'.")
+
+        self.llm_model = node_config["llm_model"]
+
+        # NOTE(review): this mutates the model object that was passed in, and JSON
+        # mode may not suit a node whose output goes through StrOutputParser;
+        # confirm this is intended.
+        if isinstance(self.llm_model, ChatOllama):
+            self.llm_model.format = "json"
+
+        self.verbose = node_config.get("verbose", False)
+        self.force = node_config.get("force", False)
+        self.script_creator = node_config.get("script_creator", False)
+        self.is_md_scraper = node_config.get("is_md_scraper", False)
+        self.additional_info = node_config.get("additional_info")
+
+    def execute(self, state: dict) -> dict:
+        """
+        Ask the LLM for an analysis of the HTML in light of the refined prompt.
+
+        The first input key supplies the refined prompt and the second the HTML
+        code. When `additional_info` is set, it is added to the prompt as extra
+        context.
+
+        Args:
+            state (dict): The current state of the graph; the node's input keys
+                          are looked up in it.
+
+        Returns:
+            dict: The same state, updated so that the first output key holds the
+                  text produced by the chain.
+
+        Raises:
+            KeyError: If an input key is missing from the state.
+        """
+
+        base_template = """
+        Task: Your job is to analyze the provided HTML code in relation to the initial scraping task analysis and provide all the necessary HTML information useful for implementing a function that extracts data from the given HTML string.
+        
+        **Initial Analysis**:
+        {initial_analysis}
+
+        **HTML Code**:
+        ```html
+        {html_code}
+        ```
+
+        **HTML Analysis Instructions**:
+        1. Examine the HTML code and identify elements, classes, or IDs that correspond to each required data field mentioned in the Initial Analysis.
+        2. Look for patterns or repeated structures that could indicate multiple items (e.g., product listings).
+        3. Note any nested structures or relationships between elements that are relevant to the data extraction task.
+        4. Discuss any additional considerations based on the specific HTML layout that are crucial for accurate data extraction.
+        5. Recommend the specific strategy to use for scraping the content, remeber.
+
+        **Important Notes**:
+        - The function that the code generator is gonig to implement will receive the HTML as a string parameter, not as a live webpage.
+        - No web scraping, automation, or handling of dynamic content is required.
+        - The analysis should focus solely on extracting data from the static HTML provided.
+        - Be precise and specific in your analysis, as the code generator will, possibly, not have access to the full HTML context.
+        
+        This HTML analysis will be used to guide the final code generation process for a function that extracts data from the given HTML string.
+        Please provide only the analysis with relevant, specific information based on this HTML code. Avoid vague statements and focus on exact details needed for accurate data extraction.
+        
+        **Response**:
+        """
+        
+        context_template = """
+        Task: Your job is to analyze the provided HTML code in relation to the initial scraping task analysis and the additional context the user provided and provide all the necessary HTML information useful for implementing a function that extracts data from the given HTML string.
+        
+        **Initial Analysis**:
+        {initial_analysis}
+
+        **HTML Code**:
+        ```html
+        {html_code}
+        ```
+        
+        **Additional Context**:
+        {additional_context}
+
+        **HTML Analysis Instructions**:
+        1. Examine the HTML code and identify elements, classes, or IDs that correspond to each required data field mentioned in the Initial Analysis.
+        2. Look for patterns or repeated structures that could indicate multiple items (e.g., product listings).
+        3. Note any nested structures or relationships between elements that are relevant to the data extraction task.
+        4. Discuss any additional considerations based on the specific HTML layout that are crucial for accurate data extraction.
+        5. Recommend the specific strategy to use for scraping the content, remeber.
+
+        **Important Notes**:
+        - The function that the code generator is gonig to implement will receive the HTML as a string parameter, not as a live webpage.
+        - No web scraping, automation, or handling of dynamic content is required.
+        - The analysis should focus solely on extracting data from the static HTML provided.
+        - Be precise and specific in your analysis, as the code generator will, possibly, not have access to the full HTML context.
+        
+        This HTML analysis will be used to guide the final code generation process for a function that extracts data from the given HTML string.
+        Please provide only the analysis with relevant, specific information based on this HTML code. Avoid vague statements and focus on exact details needed for accurate data extraction.
+        
+        **Response**:
+        """
+        
+        self.logger.info(f"--- Executing {self.node_name} Node ---")
+
+        values = [state[key] for key in self.get_input_keys(state)]
+
+        # Positional contract: refined prompt first, HTML code second.
+        variables = {"initial_analysis": values[0],
+                     "html_code": values[1]}
+
+        if self.additional_info is None:
+            chosen_template = base_template
+        else:
+            # Extra context is present: switch template and supply its variable.
+            chosen_template = context_template
+            variables["additional_context"] = self.additional_info
+
+        prompt = PromptTemplate(template=chosen_template,
+                                partial_variables=variables)
+
+        # Every template variable is pre-filled, so the chain runs with no input.
+        analysis_chain = prompt | self.llm_model | StrOutputParser()
+        html_analysis = analysis_chain.invoke({})
+
+        state.update({self.output[0]: html_analysis})
+        return state
+

From 470e76837256c28044d5ac8f30d8e156b9fcebcc Mon Sep 17 00:00:00 2001
From: Matteo Vedovati <matteo.vedovati.77@gmail.com>
Date: Thu, 19 Sep 2024 14:45:56 +0200
Subject: [PATCH 08/27] Update code generator graph

---
 scrapegraphai/graphs/code_generator_graph.py | 17 +++++++++++++++--
 scrapegraphai/nodes/prompt_refiner_node.py   |  4 ++--
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/scrapegraphai/graphs/code_generator_graph.py b/scrapegraphai/graphs/code_generator_graph.py
index 174b112d..c4fc81e7 100644
--- a/scrapegraphai/graphs/code_generator_graph.py
+++ b/scrapegraphai/graphs/code_generator_graph.py
@@ -11,6 +11,7 @@ from ..nodes import (
     ParseNode,
     GenerateAnswerNode,
     PromptRefinerNode,
+    HtmlAnalyzerNode,
 )
 
 class CodeGeneratorGraph(AbstractGraph):
@@ -102,8 +103,18 @@ class CodeGeneratorGraph(AbstractGraph):
             }
         )
         
+        html_analyzer_node = HtmlAnalyzerNode(
+            input="refined_prompt & doc",
+            output=["html_info"],
+            node_config={
+                "llm_model": self.llm_model,
+                "additional_info": self.config.get("additional_info"),
+                "schema": self.schema
+            }
+        )
+        
         generate_code_node = GenerateCodeNode(
-            input="refined_prompt & doc & answer",
+            input="user_prompt & refined_prompt & html_info & doc & answer",
             output=["code"],
             node_config={
                 "llm_model": self.llm_model,
@@ -120,13 +131,15 @@ class CodeGeneratorGraph(AbstractGraph):
                 parse_node,
                 generate_validation_answer_node,
                 prompt_refier_node,
+                html_analyzer_node,
                 generate_code_node,
             ],
             edges=[
                 (fetch_node, parse_node),
                 (parse_node, generate_validation_answer_node),
                 (generate_validation_answer_node, prompt_refier_node),
-                (prompt_refier_node, generate_code_node)
+                (prompt_refier_node, html_analyzer_node),
+                (html_analyzer_node, generate_code_node)
             ],
             entry_point=fetch_node,
             graph_name=self.__class__.__name__
diff --git a/scrapegraphai/nodes/prompt_refiner_node.py b/scrapegraphai/nodes/prompt_refiner_node.py
index 1748aec0..054ecf10 100644
--- a/scrapegraphai/nodes/prompt_refiner_node.py
+++ b/scrapegraphai/nodes/prompt_refiner_node.py
@@ -133,13 +133,13 @@ class PromptRefinerNode(BaseNode):
                 prompt = PromptTemplate(
                     template=template_prompt_builder_with_context,
                     partial_variables={"user_input": user_prompt,
-                                        "json_schema": self.schema,
+                                        "json_schema": self.schema.schema(),
                                         "additional_context": self.additional_info})
             else:
                 prompt = PromptTemplate(
                     template=template_prompt_builder,
                     partial_variables={"user_input": user_prompt,
-                                        "json_schema": self.schema})
+                                        "json_schema": self.schema.schema()})
 
             output_parser = StrOutputParser()
 

From 0f4b01181478716dffebdbfcbe4c5bdeded2587c Mon Sep 17 00:00:00 2001
From: Matteo Vedovati <matteo.vedovati.77@gmail.com>
Date: Thu, 19 Sep 2024 18:05:28 +0200
Subject: [PATCH 09/27] generate code node added

---
 scrapegraphai/nodes/generate_code_node.py  | 221 +++++++++++++++++++++
 scrapegraphai/nodes/prompt_refiner_node.py |   4 +-
 2 files changed, 223 insertions(+), 2 deletions(-)
 create mode 100644 scrapegraphai/nodes/generate_code_node.py

diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py
new file mode 100644
index 00000000..b3bb8288
--- /dev/null
+++ b/scrapegraphai/nodes/generate_code_node.py
@@ -0,0 +1,221 @@
+"""
+GenerateCodeNode Module
+"""
+from typing import List, Optional
+from langchain.prompts import PromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables import RunnableParallel
+from langchain_core.utils.pydantic import is_basemodel_subclass
+from langchain_openai import ChatOpenAI, AzureChatOpenAI
+from langchain_mistralai import ChatMistralAI
+from langchain_community.chat_models import ChatOllama
+import ast
+import sys
+from io import StringIO
+from bs4 import BeautifulSoup
+import re
+from tqdm import tqdm
+from .base_node import BaseNode
+from pydantic import ValidationError
+
+
+class GenerateCodeNode(BaseNode):
+    """
+    ...
+
+    Attributes:
+        llm_model: An instance of a language model client, configured for generating answers.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+
+    Args:
+        input (str): Boolean expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        node_config (dict): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "GenerateCode".
+    """
+
+    def __init__(
+        self,
+        input: str,
+        output: List[str],
+        node_config: Optional[dict] = None,
+        node_name: str = "GenerateCode",
+    ):
+        """
+        Set up the node and read its options from `node_config`.
+
+        Args:
+            input (str): Boolean expression defining the input keys needed from the state.
+            output (List[str]): List of output keys to be updated in the state.
+            node_config (dict): Must contain "llm_model"; may contain "verbose",
+                "force", "script_creator", "is_md_scraper" (each defaults to False)
+                and "additional_info" (defaults to None).
+            node_name (str): The unique identifier name for the node.
+
+        Raises:
+            ValueError: If `node_config` is None.
+            KeyError: If `node_config` has no "llm_model" entry.
+        """
+        super().__init__(node_name, "node", input, output, 2, node_config)
+
+        # `node_config` defaults to None in the signature, but the LLM client is
+        # mandatory: fail with an explicit message instead of a TypeError raised
+        # by the subscript below.
+        if node_config is None:
+            raise ValueError(f"{node_name} requires a node_config containing 'llm_model'.")
+
+        self.llm_model = node_config["llm_model"]
+
+        # NOTE(review): this mutates the model object that was passed in, and JSON
+        # mode may not suit a node that asks the model for Python source code;
+        # confirm this is intended.
+        if isinstance(self.llm_model, ChatOllama):
+            self.llm_model.format = "json"
+
+        self.verbose = node_config.get("verbose", False)
+        self.force = node_config.get("force", False)
+        self.script_creator = node_config.get("script_creator", False)
+        self.is_md_scraper = node_config.get("is_md_scraper", False)
+        self.additional_info = node_config.get("additional_info")
+
+    def execute(self, state: dict) -> dict:
+        """
+        Generate an `extract_data` function with the LLM and sanity-check it.
+
+        The prompt is filled with the user prompt, the refined prompt, the HTML
+        analysis, the HTML code (the first four input keys, in that order) and
+        the JSON schema of the configured output model. The generated code is
+        then syntax-checked, executed against the HTML and its result validated
+        against the schema. The checks only print diagnostics; the generated
+        code is stored in the state whatever their outcome.
+
+        Args:
+            state (dict): The current state of the graph. The input keys will be used
+                            to fetch the correct data from the state.
+
+        Returns:
+            dict: The updated state with the output key containing the generated code.
+
+        Raises:
+            KeyError: If the input keys are not found in the state.
+            ValueError: If no "schema" entry is present in the node configuration.
+        """
+
+        template_code_generator = """
+        **Task**: Create a Python function named `extract_data(html: str) -> dict()` using BeautifulSoup that extracts relevant information from the given HTML code string and returns it in a dictionary matching the Desired JSON Output Schema.
+
+        **User's Request**:
+        {user_input}
+
+        **Desired JSON Output Schema**:
+        ```json
+        {json_schema}
+        ```
+
+        **Initial Task Analysis**:
+        {initial_analysis}
+
+        **HTML Code**:
+        ```html
+        {html_code}
+        ```
+
+        **HTML Structure Analysis**:
+        {html_analysis}
+
+        Based on the above analyses, generate the `extract_data(html: str) -> dict()` function that:
+        1. Efficiently extracts the required data from the given HTML structure.
+        2. Processes and structures the data according to the specified JSON schema.
+        3. Returns the structured data as a dictionary.
+        
+        Your code should be well-commented, explaining the reasoning behind key decisions and any potential areas for improvement or customization.
+        
+        Use only the following pre-imported libraries:
+        - BeautifulSoup from bs4
+        - re
+        
+        **Output ONLY the Python code of the extract_data function, WITHOUT ANY IMPORTS OR ADDITIONAL TEXT.**
+        
+        **Response**:
+        """
+        
+        self.logger.info(f"--- Executing {self.node_name} Node ---")
+
+        input_keys = self.get_input_keys(state)
+        
+        input_data = [state[key] for key in input_keys]
+        
+        user_prompt = input_data[0] #       get user prompt
+        refined_prompt = input_data[1] #    get refined prompt
+        html_info = input_data[2] #         get html analysis
+        doc = input_data[3] #               get html code
+        # A fifth input (the answer from the generate answer node) is intended
+        # for verification but is not consumed yet.
+        
+        output_schema = self.node_config.get("schema", None)
+        if output_schema is None:
+            # Nothing can be generated without a schema; fail with an explicit
+            # message rather than on an unbound variable further down.
+            raise ValueError("No schema provided for code generation.")
+
+        self.output_schema = output_schema #          get JSON output schema
+
+        prompt = PromptTemplate(
+            template=template_code_generator,
+            partial_variables={
+                "user_input": user_prompt,
+                # `schema` must be called: the bare attribute would put a
+                # bound-method repr into the prompt instead of the JSON schema.
+                "json_schema": self.output_schema.schema(),
+                "initial_analysis": refined_prompt,
+                "html_code": doc,
+                "html_analysis": html_info
+            })
+
+        output_parser = StrOutputParser()
+
+        # Every template variable is pre-filled, so the chain runs with no input.
+        chain =  prompt | self.llm_model | output_parser
+        generated_code = chain.invoke({})
+
+        self._report_code_checks(generated_code, doc)
+
+        state.update({self.output[0]: generated_code})
+        return state
+
+    def _report_code_checks(self, generated_code, doc):
+        """Print the outcome of the syntax, execution and schema checks, stopping at the first failure."""
+        print("\nChecking code syntax...")
+        syntax_valid, syntax_message = self.syntax_check(generated_code)
+        if not syntax_valid:
+            print(f"Syntax not valid: {syntax_message}")
+            return
+
+        print("\nExecuting code in sandbox...")
+        execution_success, execution_result = self.create_sandbox_and_execute(generated_code, doc)
+        if not execution_success:
+            # On failure `execution_result` is an error message, not data, so
+            # there is nothing to validate against the schema.
+            print(f"Execution failed: {execution_result}")
+            return
+
+        print("Code executed successfully.")
+        print(f"Execution result:\n{execution_result}")
+
+        validation, errors = self.validate_dict(execution_result, self.output_schema)
+        if not validation:
+            print(f"Output does not match the schema: {errors}")
+
+    def syntax_check(self, code):
+        """
+        Check whether `code` parses as valid Python.
+
+        Args:
+            code: Source text handed to `ast.parse`.
+
+        Returns:
+            tuple: `(True, "Syntax is correct.")` when parsing succeeds, otherwise
+            `(False, "Syntax error: <details>")`.
+        """
+        try:
+            ast.parse(code)
+        except SyntaxError as parse_error:
+            return False, f"Syntax error: {str(parse_error)}"
+        return True, "Syntax is correct."
+
+    @staticmethod
+    def create_sandbox_and_execute(function_code, html_doc):
+        """
+        Execute generated code and call its `extract_data` function on `html_doc`.
+
+        Declared static because it takes no instance argument while being invoked
+        as `self.create_sandbox_and_execute(code, doc)`.
+
+        NOTE(review): `exec` runs the code in this process with the full
+        `__builtins__`; the dedicated globals dict is not a security boundary.
+        Do not feed it code from an untrusted source.
+
+        Args:
+            function_code (str): Python source expected to define `extract_data`.
+            html_doc: Value passed as the single argument to `extract_data`.
+
+        Returns:
+            tuple: `(True, result)` on success, or `(False, "Error during
+            execution: ...")` if anything raised.
+        """
+        # Names made available to the generated code.
+        sandbox_globals = {
+            'BeautifulSoup': BeautifulSoup,
+            're': re,
+            '__builtins__': __builtins__,
+        }
+
+        # Swallow anything the generated code prints while it runs.
+        old_stdout = sys.stdout
+        sys.stdout = StringIO()
+
+        try:
+            # Defining the function happens as a side effect of exec.
+            exec(function_code, sandbox_globals)
+
+            extract_data = sandbox_globals.get('extract_data')
+            if not callable(extract_data):
+                raise NameError("Function 'extract_data' not found in the generated code.")
+
+            return True, extract_data(html_doc)
+        except Exception as e:
+            # Deliberately broad: any failure of the generated code is reported
+            # to the caller as a message instead of propagating.
+            return False, f"Error during execution: {str(e)}"
+        finally:
+            # Always restore stdout, including on the error path.
+            sys.stdout = old_stdout
+            
+    def validate_dict(self, data: dict, schema):
+        """
+        Validate `data` by instantiating `schema` with it as keyword arguments.
+
+        Args:
+            data (dict): Candidate output, expanded as `schema(**data)`.
+            schema: Callable that raises pydantic's `ValidationError` on bad data
+                (presumably a pydantic model class; confirm against the caller).
+
+        Returns:
+            tuple: `(True, None)` when validation passes, otherwise `(False, errors)`
+            where `errors` is a list of error dicts.
+        """
+        from collections.abc import Mapping
+
+        if not isinstance(data, Mapping):
+            # `schema(**data)` raises TypeError for non-mappings (for example a
+            # string or None returned by generated code); report that as a
+            # validation failure instead of crashing the node.
+            return False, [{"msg": f"Expected a dict, got {type(data).__name__}."}]
+
+        try:
+            schema(**data)  # Use the provided schema directly
+        except ValidationError as e:
+            return False, e.errors()
+        return True, None
\ No newline at end of file
diff --git a/scrapegraphai/nodes/prompt_refiner_node.py b/scrapegraphai/nodes/prompt_refiner_node.py
index 054ecf10..f9006f15 100644
--- a/scrapegraphai/nodes/prompt_refiner_node.py
+++ b/scrapegraphai/nodes/prompt_refiner_node.py
@@ -127,13 +127,13 @@ class PromptRefinerNode(BaseNode):
 
         if self.node_config.get("schema", None) is not None:
 
-            self.schema = self.node_config["schema"] #          get JSON schema
+            self.data_schema = self.node_config["schema"] #          get JSON schema
             
             if self.additional_info is not None: #              use additional context if present
                 prompt = PromptTemplate(
                     template=template_prompt_builder_with_context,
                     partial_variables={"user_input": user_prompt,
-                                        "json_schema": self.schema.schema(),
+                                        "json_schema": self.data_schema.schema(),
                                         "additional_context": self.additional_info})
             else:
                 prompt = PromptTemplate(

From eb9c77c2d53d4572d818c4f392bdbac87e45e862 Mon Sep 17 00:00:00 2001
From: Matteo Vedovati <matteo.vedovati.77@gmail.com>
Date: Thu, 19 Sep 2024 18:09:27 +0200
Subject: [PATCH 10/27] code generator graph fixed

---
 scrapegraphai/graphs/code_generator_graph.py | 3 ++-
 scrapegraphai/nodes/__init__.py              | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/scrapegraphai/graphs/code_generator_graph.py b/scrapegraphai/graphs/code_generator_graph.py
index c4fc81e7..c4fd6d7a 100644
--- a/scrapegraphai/graphs/code_generator_graph.py
+++ b/scrapegraphai/graphs/code_generator_graph.py
@@ -12,6 +12,7 @@ from ..nodes import (
     GenerateAnswerNode,
     PromptRefinerNode,
     HtmlAnalyzerNode,
+    GenerateCodeNode,
 )
 
 class CodeGeneratorGraph(AbstractGraph):
@@ -115,7 +116,7 @@ class CodeGeneratorGraph(AbstractGraph):
         
         generate_code_node = GenerateCodeNode(
             input="user_prompt & refined_prompt & html_info & doc & answer",
-            output=["code"],
+            output=["generated_code"],
             node_config={
                 "llm_model": self.llm_model,
                 "additional_info": self.config.get("additional_info"),
diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py
index a5ffb757..e5427044 100644
--- a/scrapegraphai/nodes/__init__.py
+++ b/scrapegraphai/nodes/__init__.py
@@ -24,4 +24,5 @@ from .fetch_screen_node import FetchScreenNode
 from .generate_answer_from_image_node import GenerateAnswerFromImageNode
 from .concat_answers_node import ConcatAnswersNode
 from .prompt_refiner_node import PromptRefinerNode
-from .html_analyzer_node import HtmlAnalyzerNode
\ No newline at end of file
+from .html_analyzer_node import HtmlAnalyzerNode
+from .generate_code_node import GenerateCodeNode
\ No newline at end of file

From 3ea1f20c8309a7d51cc0e4b458f78bf57377e269 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra <88108002+VinciGit00@users.noreply.github.com>
Date: Fri, 20 Sep 2024 21:48:24 +0200
Subject: [PATCH 11/27] Update fetch_node.py

---
 scrapegraphai/nodes/fetch_node.py | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
index 19d59004..d07735e3 100644
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@@ -316,21 +316,7 @@ class FetchNode(BaseNode):
             compressed_document = [
                 Document(page_content=parsed_content, metadata={"source": "html file"})
             ]
-
-        return self.update_state(state, compressed_document)
-
-    def update_state(self, state, compressed_document):
-        """
-        Updates the state with the output data from the node.
-
-        Args:
-            state (dict): The current state of the graph.
-            compressed_document (List[Document]): The compressed document content fetched
-                                                    by the node.
-
-        Returns:
-            dict: The updated state with the output data.
-        """
-
+        state["original_html"] = document
         state.update({self.output[0]: compressed_document,})
         return state
+    

From 5b579b323fbd0432d86cf26e56d526df0d8c7465 Mon Sep 17 00:00:00 2001
From: Matteo Vedovati <matteo.vedovati.77@gmail.com>
Date: Sat, 21 Sep 2024 11:44:04 +0200
Subject: [PATCH 12/27] code generator v0.1

---
 .../code_generation/simple_with_schema.py     |  53 ++
 requirements-dev.lock                         |  28 +
 requirements.lock                             |  28 +
 scrapegraphai/code_gen/code_exec_test.py      | 526 ++++++++++++++++++
 scrapegraphai/graphs/code_generator_graph.py  |  18 +-
 scrapegraphai/nodes/generate_code_node.py     |  30 +-
 scrapegraphai/nodes/html_analyzer_node.py     |  21 +-
 scrapegraphai/nodes/prompt_refiner_node.py    |  38 +-
 scrapegraphai/utils/__init__.py               |   3 +-
 scrapegraphai/utils/cleanup_html.py           |  82 ++-
 scrapegraphai/utils/schema_trasform.py        |  36 ++
 11 files changed, 824 insertions(+), 39 deletions(-)
 create mode 100644 examples/code_generation/simple_with_schema.py
 create mode 100644 scrapegraphai/code_gen/code_exec_test.py
 create mode 100644 scrapegraphai/utils/schema_trasform.py

diff --git a/examples/code_generation/simple_with_schema.py b/examples/code_generation/simple_with_schema.py
new file mode 100644
index 00000000..c4803c62
--- /dev/null
+++ b/examples/code_generation/simple_with_schema.py
@@ -0,0 +1,53 @@
+""" 
+Basic example of scraping pipeline using Code Generator with schema
+"""
+
+import os, json
+from typing import List
+from dotenv import load_dotenv
+from langchain_core.pydantic_v1 import BaseModel, Field
+from scrapegraphai.graphs import CodeGeneratorGraph
+
+load_dotenv()
+
+# ************************************************
+# Define the output schema for the graph
+# ************************************************
+
+class Project(BaseModel):
+    title: str = Field(description="The title of the project")
+    description: str = Field(description="The description of the project")
+
+class Projects(BaseModel):
+    projects: List[Project]
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+openai_key = os.getenv("OPENAI_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": openai_key,
+        "model": "openai/gpt-4o",
+    },
+    "library": "beautifulsoup",
+    "verbose": True,
+    "headless": False,
+    "reduction": 2,
+}
+
+# ************************************************
+# Create the CodeGeneratorGraph instance and run it
+# ************************************************
+
+code_generator_graph = CodeGeneratorGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io/projects/",
+    schema=Projects,
+    config=graph_config,
+)
+
+result = code_generator_graph.run()
+print(result)
diff --git a/requirements-dev.lock b/requirements-dev.lock
index fd04d800..c271122c 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -6,6 +6,8 @@
 #   features: []
 #   all-features: false
 #   with-sources: false
+#   generate-hashes: false
+#   universal: false
 
 -e file:.
 aiofiles==24.1.0
@@ -71,6 +73,7 @@ cycler==0.12.1
 dataclasses-json==0.6.7
     # via langchain-community
 dill==0.3.8
+    # via multiprocess
     # via pylint
 distro==1.9.0
     # via openai
@@ -87,6 +90,7 @@ fastapi-pagination==0.12.26
     # via burr
 filelock==3.15.4
     # via huggingface-hub
+    # via transformers
 fonttools==4.53.1
     # via matplotlib
 free-proxy==1.1.1
@@ -129,6 +133,7 @@ graphviz==0.20.3
     # via burr
 greenlet==3.0.3
     # via playwright
+    # via sqlalchemy
 grpcio==1.65.4
     # via google-api-core
     # via grpcio-status
@@ -152,6 +157,7 @@ httpx-sse==0.4.0
     # via langchain-mistralai
 huggingface-hub==0.24.5
     # via tokenizers
+    # via transformers
 idna==3.7
     # via anyio
     # via httpx
@@ -235,9 +241,13 @@ mdurl==0.1.2
     # via markdown-it-py
 minify-html==0.15.0
     # via scrapegraphai
+mpire==2.10.2
+    # via semchunk
 multidict==6.0.5
     # via aiohttp
     # via yarl
+multiprocess==0.70.16
+    # via mpire
 mypy-extensions==1.0.0
     # via typing-inspect
 narwhals==1.3.0
@@ -254,6 +264,7 @@ numpy==1.26.4
     # via pydeck
     # via sf-hamilton
     # via streamlit
+    # via transformers
 ollama==0.3.2
     # via langchain-ollama
 openai==1.40.3
@@ -271,6 +282,7 @@ packaging==24.1
     # via pytest
     # via sphinx
     # via streamlit
+    # via transformers
 pandas==2.2.2
     # via scrapegraphai
     # via sf-hamilton
@@ -320,6 +332,7 @@ pyee==11.1.0
     # via playwright
 pygments==2.18.0
     # via furo
+    # via mpire
     # via rich
     # via sphinx
 pylint==3.2.6
@@ -342,11 +355,13 @@ pyyaml==6.0.2
     # via langchain
     # via langchain-community
     # via langchain-core
+    # via transformers
 referencing==0.35.1
     # via jsonschema
     # via jsonschema-specifications
 regex==2024.7.24
     # via tiktoken
+    # via transformers
 requests==2.32.3
     # via burr
     # via free-proxy
@@ -358,6 +373,7 @@ requests==2.32.3
     # via sphinx
     # via streamlit
     # via tiktoken
+    # via transformers
 rich==13.7.1
     # via streamlit
 rpds-py==0.20.0
@@ -367,6 +383,10 @@ rsa==4.9
     # via google-auth
 s3transfer==0.10.2
     # via boto3
+safetensors==0.4.5
+    # via transformers
+semchunk==2.2.0
+    # via scrapegraphai
 sf-hamilton==1.73.1
     # via burr
 six==1.16.0
@@ -416,6 +436,7 @@ tiktoken==0.7.0
     # via scrapegraphai
 tokenizers==0.19.1
     # via langchain-mistralai
+    # via transformers
 toml==0.10.2
     # via streamlit
 tomli==2.0.1
@@ -428,8 +449,13 @@ tornado==6.4.1
 tqdm==4.66.5
     # via google-generativeai
     # via huggingface-hub
+    # via mpire
     # via openai
     # via scrapegraphai
+    # via semchunk
+    # via transformers
+transformers==4.44.2
+    # via scrapegraphai
 typing-extensions==4.12.2
     # via altair
     # via anyio
@@ -464,6 +490,8 @@ urllib3==1.26.19
     # via requests
 uvicorn==0.30.5
     # via burr
+watchdog==4.0.2
+    # via streamlit
 yarl==1.9.4
     # via aiohttp
 zipp==3.20.1
diff --git a/requirements.lock b/requirements.lock
index b34c9290..f05c6db4 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -6,6 +6,8 @@
 #   features: []
 #   all-features: false
 #   with-sources: false
+#   generate-hashes: false
+#   universal: false
 
 -e file:.
 aiohttp==3.9.5
@@ -41,6 +43,8 @@ charset-normalizer==3.3.2
     # via requests
 dataclasses-json==0.6.7
     # via langchain-community
+dill==0.3.8
+    # via multiprocess
 distro==1.9.0
     # via openai
 exceptiongroup==1.2.2
@@ -49,6 +53,7 @@ faiss-cpu==1.8.0.post1
     # via scrapegraphai
 filelock==3.15.4
     # via huggingface-hub
+    # via transformers
 free-proxy==1.1.1
     # via scrapegraphai
 frozenlist==1.4.1
@@ -81,6 +86,7 @@ googleapis-common-protos==1.63.2
     # via grpcio-status
 greenlet==3.0.3
     # via playwright
+    # via sqlalchemy
 grpcio==1.65.1
     # via google-api-core
     # via grpcio-status
@@ -103,6 +109,7 @@ httpx-sse==0.4.0
     # via langchain-mistralai
 huggingface-hub==0.24.1
     # via tokenizers
+    # via transformers
 idna==3.7
     # via anyio
     # via httpx
@@ -153,9 +160,13 @@ marshmallow==3.21.3
     # via dataclasses-json
 minify-html==0.15.0
     # via scrapegraphai
+mpire==2.10.2
+    # via semchunk
 multidict==6.0.5
     # via aiohttp
     # via yarl
+multiprocess==0.70.16
+    # via mpire
 mypy-extensions==1.0.0
     # via typing-inspect
 numpy==1.26.4
@@ -164,6 +175,7 @@ numpy==1.26.4
     # via langchain-aws
     # via langchain-community
     # via pandas
+    # via transformers
 ollama==0.3.2
     # via langchain-ollama
 openai==1.41.0
@@ -175,6 +187,7 @@ packaging==24.1
     # via huggingface-hub
     # via langchain-core
     # via marshmallow
+    # via transformers
 pandas==2.2.2
     # via scrapegraphai
 playwright==1.45.1
@@ -205,6 +218,8 @@ pydantic-core==2.20.1
     # via pydantic
 pyee==11.1.0
     # via playwright
+pygments==2.18.0
+    # via mpire
 pyparsing==3.1.2
     # via httplib2
 python-dateutil==2.9.0.post0
@@ -219,8 +234,10 @@ pyyaml==6.0.1
     # via langchain
     # via langchain-community
     # via langchain-core
+    # via transformers
 regex==2024.5.15
     # via tiktoken
+    # via transformers
 requests==2.32.3
     # via free-proxy
     # via google-api-core
@@ -229,10 +246,15 @@ requests==2.32.3
     # via langchain-community
     # via langsmith
     # via tiktoken
+    # via transformers
 rsa==4.9
     # via google-auth
 s3transfer==0.10.2
     # via boto3
+safetensors==0.4.5
+    # via transformers
+semchunk==2.2.0
+    # via scrapegraphai
 six==1.16.0
     # via python-dateutil
 sniffio==1.3.1
@@ -253,11 +275,17 @@ tiktoken==0.7.0
     # via scrapegraphai
 tokenizers==0.19.1
     # via langchain-mistralai
+    # via transformers
 tqdm==4.66.4
     # via google-generativeai
     # via huggingface-hub
+    # via mpire
     # via openai
     # via scrapegraphai
+    # via semchunk
+    # via transformers
+transformers==4.44.2
+    # via scrapegraphai
 typing-extensions==4.12.2
     # via anyio
     # via google-generativeai
diff --git a/scrapegraphai/code_gen/code_exec_test.py b/scrapegraphai/code_gen/code_exec_test.py
new file mode 100644
index 00000000..95d2eefd
--- /dev/null
+++ b/scrapegraphai/code_gen/code_exec_test.py
@@ -0,0 +1,526 @@
+import ast
+import sys
+from io import StringIO
+from bs4 import BeautifulSoup
+import re
+
+generated_code = "def extract_data(html: str) -> dict:\n    from bs4 import BeautifulSoup\n    import re\n\n    # Parse the HTML content using BeautifulSoup\n    soup = BeautifulSoup(html, 'html.parser')\n\n    # Initialize the projects list\n    projects = []\n\n    # Find all <a> tags that contain project entries\n    project_links = soup.find_all('a', href=True)\n\n    # Iterate through each project link to extract title and description\n    for link in project_links:\n        # Check if the link contains an image and text\n        img_tag = link.find('img')\n        if img_tag and link.string:\n            # Extract the full text and split it into title and description\n            full_text = link.string.strip()\n            # Use regex to separate title and description\n            match = re.match(r'^(.*?)(?:\\s*-\\s*(.*))?$', full_text)\n            if match:\n                title = match.group(1).strip()\n                description = match.group(2).strip() if match.group(2) else ''\n                \n                # Append the project data to the projects list\n                projects.append({\n                    'title': title,\n                    'description': description\n                })\n\n    # Return the structured data as a dictionary\n    return {\n        'projects': projects\n    }\n"
+html = """
+<html lang="en" class="" data-theme="dark"><head> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> <meta charset="utf-8"> <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"> <meta http-equiv="X-UA-Compatible" content="IE=edge"> <title>Projects | Marco Perini</title> <meta name="author" content="Marco Perini"> <meta name="description" content="Personal Porfolio Website "> <meta name="keywords" content="jekyll, jekyll-theme, academic-website, portfolio-website, robotics, machine-learning, computer-vision, artificial-intelligence, deep-learning, data-science, data-analysis, data-visualization, reinforcement-learning, computer-science, computer-graphics, computer-architecture, computer-networks, computer-security, computer-aided-design, computer-algebra, computer-alg, hardware"> <link rel="stylesheet" href="/assets/css/bootstrap.min.css?a4b3f509e79c54a512b890d73235ef04"> <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/mdbootstrap@4.20.0/css/mdb.min.css" integrity="sha256-jpjYvU3G3N6nrrBwXJoVEYI/0zw8htfFnhT9ljN3JJw=" crossorigin="anonymous"> <link defer="" rel="stylesheet" href="https://unpkg.com/bootstrap-table@1.22.1/dist/bootstrap-table.min.css"> <link rel="stylesheet" href="/assets/css/academicons.min.css?f0b7046b84e425c55f3463ac249818f5"> <link rel="stylesheet" type="text/css" href="https://fonts.googleapis.com/css?family=Roboto:300,400,500,700|Roboto+Slab:100,300,400,500,700|Material+Icons"> <link rel="stylesheet" href="/assets/css/jekyll-pygments-themes-github.css?19f3075a2d19613090fe9e16b564e1fe" media="none" id="highlight_theme_light"> <link rel="shortcut icon" href="data:image/svg+xml,<svg%20xmlns=%22http://www.w3.org/2000/svg%22%20viewBox=%220%200%20100%20100%22><text%20y=%22.9em%22%20font-size=%2290%22>%F0%9F%A6%BE</text></svg>"> <link rel="stylesheet" href="/assets/css/main.css?d41d8cd98f00b204e9800998ecf8427e"> <link rel="canonical" href="https://perinim.github.io/projects/"> <link 
rel="stylesheet" href="/assets/css/jekyll-pygments-themes-native.css?e74e74bf055e5729d44a7d031a5ca6a5" media="" id="highlight_theme_dark"> <script src="/assets/js/theme.js?96d6b3e1c3604aca8b6134c7afdd5db6"></script> <script src="/assets/js/dark_mode.js?9b17307bb950ffa2e34be0227f53558f"></script> <style type="text/css">/* Chart.js */
+@-webkit-keyframes chartjs-render-animation{from{opacity:0.99}to{opacity:1}}@keyframes chartjs-render-animation{from{opacity:0.99}to{opacity:1}}.chartjs-render-monitor{-webkit-animation:chartjs-render-animation 0.001s;animation:chartjs-render-animation 0.001s;}</style><script id="altmetric-embed-js" src="https://d1bxh8uas1mnw7.cloudfront.net/assets/altmetric_badges-2f3c1a827c4dee5fa0ff35ec229b9204ae106583cc99636c724152d1f7acea04.js"></script><style type="text/css">.medium-zoom-overlay{position:fixed;top:0;right:0;bottom:0;left:0;opacity:0;transition:opacity .3s;will-change:opacity}.medium-zoom--opened .medium-zoom-overlay{cursor:pointer;cursor:zoom-out;opacity:1}.medium-zoom-image{cursor:pointer;cursor:zoom-in;transition:transform .3s cubic-bezier(.2,0,.2,1)!important}.medium-zoom-image--hidden{visibility:hidden}.medium-zoom-image--opened{position:relative;cursor:pointer;cursor:zoom-out;will-change:transform}</style><style type="text/css">.CtxtMenu_InfoClose {  top:.2em; right:.2em;}
+.CtxtMenu_InfoContent {  overflow:auto; text-align:left; font-size:80%;  padding:.4em .6em; border:1px inset; margin:1em 0px;  max-height:20em; max-width:30em; background-color:#EEEEEE;  white-space:normal;}
+.CtxtMenu_Info.CtxtMenu_MousePost {outline:none;}
+.CtxtMenu_Info {  position:fixed; left:50%; width:auto; text-align:center;  border:3px outset; padding:1em 2em; background-color:#DDDDDD;  color:black;  cursor:default; font-family:message-box; font-size:120%;  font-style:normal; text-indent:0; text-transform:none;  line-height:normal; letter-spacing:normal; word-spacing:normal;  word-wrap:normal; white-space:nowrap; float:none; z-index:201;  border-radius: 15px;                     /* Opera 10.5 and IE9 */  -webkit-border-radius:15px;               /* Safari and Chrome */  -moz-border-radius:15px;                  /* Firefox */  -khtml-border-radius:15px;                /* Konqueror */  box-shadow:0px 10px 20px #808080;         /* Opera 10.5 and IE9 */  -webkit-box-shadow:0px 10px 20px #808080; /* Safari 3 & Chrome */  -moz-box-shadow:0px 10px 20px #808080;    /* Forefox 3.5 */  -khtml-box-shadow:0px 10px 20px #808080;  /* Konqueror */  filter:progid:DXImageTransform.Microsoft.dropshadow(OffX=2, OffY=2, Color="gray", Positive="true"); /* IE */}
+</style><style type="text/css">.CtxtMenu_MenuClose {  position:absolute;  cursor:pointer;  display:inline-block;  border:2px solid #AAA;  border-radius:18px;  -webkit-border-radius: 18px;             /* Safari and Chrome */  -moz-border-radius: 18px;                /* Firefox */  -khtml-border-radius: 18px;              /* Konqueror */  font-family: "Courier New", Courier;  font-size:24px;  color:#F0F0F0}
+.CtxtMenu_MenuClose span {  display:block; background-color:#AAA; border:1.5px solid;  border-radius:18px;  -webkit-border-radius: 18px;             /* Safari and Chrome */  -moz-border-radius: 18px;                /* Firefox */  -khtml-border-radius: 18px;              /* Konqueror */  line-height:0;  padding:8px 0 6px     /* may need to be browser-specific */}
+.CtxtMenu_MenuClose:hover {  color:white!important;  border:2px solid #CCC!important}
+.CtxtMenu_MenuClose:hover span {  background-color:#CCC!important}
+.CtxtMenu_MenuClose:hover:focus {  outline:none}
+</style><style type="text/css">.CtxtMenu_Menu {  position:absolute;  background-color:white;  color:black;  width:auto; padding:5px 0px;  border:1px solid #CCCCCC; margin:0; cursor:default;  font: menu; text-align:left; text-indent:0; text-transform:none;  line-height:normal; letter-spacing:normal; word-spacing:normal;  word-wrap:normal; white-space:nowrap; float:none; z-index:201;  border-radius: 5px;                     /* Opera 10.5 and IE9 */  -webkit-border-radius: 5px;             /* Safari and Chrome */  -moz-border-radius: 5px;                /* Firefox */  -khtml-border-radius: 5px;              /* Konqueror */  box-shadow:0px 10px 20px #808080;         /* Opera 10.5 and IE9 */  -webkit-box-shadow:0px 10px 20px #808080; /* Safari 3 & Chrome */  -moz-box-shadow:0px 10px 20px #808080;    /* Forefox 3.5 */  -khtml-box-shadow:0px 10px 20px #808080;  /* Konqueror */}
+.CtxtMenu_MenuItem {  padding: 1px 2em;  background:transparent;}
+.CtxtMenu_MenuArrow {  position:absolute; right:.5em; padding-top:.25em; color:#666666;  font-family: null; font-size: .75em}
+.CtxtMenu_MenuActive .CtxtMenu_MenuArrow {color:white}
+.CtxtMenu_MenuArrow.CtxtMenu_RTL {left:.5em; right:auto}
+.CtxtMenu_MenuCheck {  position:absolute; left:.7em;  font-family: null}
+.CtxtMenu_MenuCheck.CtxtMenu_RTL { right:.7em; left:auto }
+.CtxtMenu_MenuRadioCheck {  position:absolute; left: .7em;}
+.CtxtMenu_MenuRadioCheck.CtxtMenu_RTL {  right: .7em; left:auto}
+.CtxtMenu_MenuInputBox {  padding-left: 1em; right:.5em; color:#666666;  font-family: null;}
+.CtxtMenu_MenuInputBox.CtxtMenu_RTL {  left: .1em;}
+.CtxtMenu_MenuComboBox {  left:.1em; padding-bottom:.5em;}
+.CtxtMenu_MenuSlider {  left: .1em;}
+.CtxtMenu_SliderValue {  position:absolute; right:.1em; padding-top:.25em; color:#333333;  font-size: .75em}
+.CtxtMenu_SliderBar {  outline: none; background: #d3d3d3}
+.CtxtMenu_MenuLabel {  padding: 1px 2em 3px 1.33em;  font-style:italic}
+.CtxtMenu_MenuRule {  border-top: 1px solid #DDDDDD;  margin: 4px 3px;}
+.CtxtMenu_MenuDisabled {  color:GrayText}
+.CtxtMenu_MenuActive {  background-color: #606872;  color: white;}
+.CtxtMenu_MenuDisabled:focus {  background-color: #E8E8E8}
+.CtxtMenu_MenuLabel:focus {  background-color: #E8E8E8}
+.CtxtMenu_ContextMenu:focus {  outline:none}
+.CtxtMenu_ContextMenu .CtxtMenu_MenuItem:focus {  outline:none}
+.CtxtMenu_SelectionMenu {  position:relative; float:left;  border-bottom: none; -webkit-box-shadow:none; -webkit-border-radius:0px; }
+.CtxtMenu_SelectionItem {  padding-right: 1em;}
+.CtxtMenu_Selection {  right: 40%; width:50%; }
+.CtxtMenu_SelectionBox {  padding: 0em; max-height:20em; max-width: none;  background-color:#FFFFFF;}
+.CtxtMenu_SelectionDivider {  clear: both; border-top: 2px solid #000000;}
+.CtxtMenu_Menu .CtxtMenu_MenuClose {  top:-10px; left:-10px}
+</style><style id="MJX-CHTML-styles">
+mjx-container[jax="CHTML"] {
+  line-height: 0;
+}
+
+mjx-container [space="1"] {
+  margin-left: .111em;
+}
+
+mjx-container [space="2"] {
+  margin-left: .167em;
+}
+
+mjx-container [space="3"] {
+  margin-left: .222em;
+}
+
+mjx-container [space="4"] {
+  margin-left: .278em;
+}
+
+mjx-container [space="5"] {
+  margin-left: .333em;
+}
+
+mjx-container [rspace="1"] {
+  margin-right: .111em;
+}
+
+mjx-container [rspace="2"] {
+  margin-right: .167em;
+}
+
+mjx-container [rspace="3"] {
+  margin-right: .222em;
+}
+
+mjx-container [rspace="4"] {
+  margin-right: .278em;
+}
+
+mjx-container [rspace="5"] {
+  margin-right: .333em;
+}
+
+mjx-container [size="s"] {
+  font-size: 70.7%;
+}
+
+mjx-container [size="ss"] {
+  font-size: 50%;
+}
+
+mjx-container [size="Tn"] {
+  font-size: 60%;
+}
+
+mjx-container [size="sm"] {
+  font-size: 85%;
+}
+
+mjx-container [size="lg"] {
+  font-size: 120%;
+}
+
+mjx-container [size="Lg"] {
+  font-size: 144%;
+}
+
+mjx-container [size="LG"] {
+  font-size: 173%;
+}
+
+mjx-container [size="hg"] {
+  font-size: 207%;
+}
+
+mjx-container [size="HG"] {
+  font-size: 249%;
+}
+
+mjx-container [width="full"] {
+  width: 100%;
+}
+
+mjx-box {
+  display: inline-block;
+}
+
+mjx-block {
+  display: block;
+}
+
+mjx-itable {
+  display: inline-table;
+}
+
+mjx-row {
+  display: table-row;
+}
+
+mjx-row > * {
+  display: table-cell;
+}
+
+mjx-mtext {
+  display: inline-block;
+}
+
+mjx-mstyle {
+  display: inline-block;
+}
+
+mjx-merror {
+  display: inline-block;
+  color: red;
+  background-color: yellow;
+}
+
+mjx-mphantom {
+  visibility: hidden;
+}
+
+_::-webkit-full-page-media, _:future, :root mjx-container {
+  will-change: opacity;
+}
+
+mjx-assistive-mml {
+  position: absolute !important;
+  top: 0px;
+  left: 0px;
+  clip: rect(1px, 1px, 1px, 1px);
+  padding: 1px 0px 0px 0px !important;
+  border: 0px !important;
+  display: block !important;
+  width: auto !important;
+  overflow: hidden !important;
+  -webkit-touch-callout: none;
+  -webkit-user-select: none;
+  -khtml-user-select: none;
+  -moz-user-select: none;
+  -ms-user-select: none;
+  user-select: none;
+}
+
+mjx-assistive-mml[display="block"] {
+  width: 100% !important;
+}
+
+mjx-c::before {
+  display: block;
+  width: 0;
+}
+
+.MJX-TEX {
+  font-family: MJXZERO, MJXTEX;
+}
+
+.TEX-B {
+  font-family: MJXZERO, MJXTEX-B;
+}
+
+.TEX-I {
+  font-family: MJXZERO, MJXTEX-I;
+}
+
+.TEX-MI {
+  font-family: MJXZERO, MJXTEX-MI;
+}
+
+.TEX-BI {
+  font-family: MJXZERO, MJXTEX-BI;
+}
+
+.TEX-S1 {
+  font-family: MJXZERO, MJXTEX-S1;
+}
+
+.TEX-S2 {
+  font-family: MJXZERO, MJXTEX-S2;
+}
+
+.TEX-S3 {
+  font-family: MJXZERO, MJXTEX-S3;
+}
+
+.TEX-S4 {
+  font-family: MJXZERO, MJXTEX-S4;
+}
+
+.TEX-A {
+  font-family: MJXZERO, MJXTEX-A;
+}
+
+.TEX-C {
+  font-family: MJXZERO, MJXTEX-C;
+}
+
+.TEX-CB {
+  font-family: MJXZERO, MJXTEX-CB;
+}
+
+.TEX-FR {
+  font-family: MJXZERO, MJXTEX-FR;
+}
+
+.TEX-FRB {
+  font-family: MJXZERO, MJXTEX-FRB;
+}
+
+.TEX-SS {
+  font-family: MJXZERO, MJXTEX-SS;
+}
+
+.TEX-SSB {
+  font-family: MJXZERO, MJXTEX-SSB;
+}
+
+.TEX-SSI {
+  font-family: MJXZERO, MJXTEX-SSI;
+}
+
+.TEX-SC {
+  font-family: MJXZERO, MJXTEX-SC;
+}
+
+.TEX-T {
+  font-family: MJXZERO, MJXTEX-T;
+}
+
+.TEX-V {
+  font-family: MJXZERO, MJXTEX-V;
+}
+
+.TEX-VB {
+  font-family: MJXZERO, MJXTEX-VB;
+}
+
+mjx-stretchy-v mjx-c, mjx-stretchy-h mjx-c {
+  font-family: MJXZERO, MJXTEX-S1, MJXTEX-S4, MJXTEX, MJXTEX-A ! important;
+}
+
+@font-face /* 0 */ {
+  font-family: MJXZERO;
+  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Zero.woff") format("woff");
+}
+
+@font-face /* 1 */ {
+  font-family: MJXTEX;
+  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Main-Regular.woff") format("woff");
+}
+
+@font-face /* 2 */ {
+  font-family: MJXTEX-B;
+  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Main-Bold.woff") format("woff");
+}
+
+@font-face /* 3 */ {
+  font-family: MJXTEX-I;
+  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Math-Italic.woff") format("woff");
+}
+
+@font-face /* 4 */ {
+  font-family: MJXTEX-MI;
+  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Main-Italic.woff") format("woff");
+}
+
+@font-face /* 5 */ {
+  font-family: MJXTEX-BI;
+  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Math-BoldItalic.woff") format("woff");
+}
+
+@font-face /* 6 */ {
+  font-family: MJXTEX-S1;
+  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Size1-Regular.woff") format("woff");
+}
+
+@font-face /* 7 */ {
+  font-family: MJXTEX-S2;
+  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Size2-Regular.woff") format("woff");
+}
+
+@font-face /* 8 */ {
+  font-family: MJXTEX-S3;
+  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Size3-Regular.woff") format("woff");
+}
+
+@font-face /* 9 */ {
+  font-family: MJXTEX-S4;
+  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Size4-Regular.woff") format("woff");
+}
+
+@font-face /* 10 */ {
+  font-family: MJXTEX-A;
+  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_AMS-Regular.woff") format("woff");
+}
+
+@font-face /* 11 */ {
+  font-family: MJXTEX-C;
+  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Calligraphic-Regular.woff") format("woff");
+}
+
+@font-face /* 12 */ {
+  font-family: MJXTEX-CB;
+  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Calligraphic-Bold.woff") format("woff");
+}
+
+@font-face /* 13 */ {
+  font-family: MJXTEX-FR;
+  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Fraktur-Regular.woff") format("woff");
+}
+
+@font-face /* 14 */ {
+  font-family: MJXTEX-FRB;
+  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Fraktur-Bold.woff") format("woff");
+}
+
+@font-face /* 15 */ {
+  font-family: MJXTEX-SS;
+  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_SansSerif-Regular.woff") format("woff");
+}
+
+@font-face /* 16 */ {
+  font-family: MJXTEX-SSB;
+  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_SansSerif-Bold.woff") format("woff");
+}
+
+@font-face /* 17 */ {
+  font-family: MJXTEX-SSI;
+  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_SansSerif-Italic.woff") format("woff");
+}
+
+@font-face /* 18 */ {
+  font-family: MJXTEX-SC;
+  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Script-Regular.woff") format("woff");
+}
+
+@font-face /* 19 */ {
+  font-family: MJXTEX-T;
+  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Typewriter-Regular.woff") format("woff");
+}
+
+@font-face /* 20 */ {
+  font-family: MJXTEX-V;
+  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Vector-Regular.woff") format("woff");
+}
+
+@font-face /* 21 */ {
+  font-family: MJXTEX-VB;
+  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Vector-Bold.woff") format("woff");
+}
+</style><link rel="stylesheet" href="https://badge.dimensions.ai/badge.css"></head> <body class="fixed-top-nav " data-new-gr-c-s-check-loaded="14.1196.0" data-gr-ext-installed="" style="padding-top: 57px;"> <header> <nav id="navbar" class="navbar navbar-light navbar-expand-sm fixed-top"> <div class="container"> <a class="navbar-brand title font-weight-lighter" href="/"><span class="font-weight-bold">Marco&nbsp;</span>Perini</a> <button class="navbar-toggler collapsed ml-auto" type="button" data-toggle="collapse" data-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation"> <span class="sr-only">Toggle navigation</span> <span class="icon-bar top-bar"></span> <span class="icon-bar middle-bar"></span> <span class="icon-bar bottom-bar"></span> </button> <div class="collapse navbar-collapse text-right" id="navbarNav"> <ul class="navbar-nav ml-auto flex-nowrap"> <li class="nav-item "> <a class="nav-link" href="/">About</a> </li> <li class="nav-item dropdown active"> <a class="nav-link dropdown-toggle" href="#" id="navbarDropdown" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Projects<span class="sr-only">(current)</span></a> <div class="dropdown-menu dropdown-menu-right" aria-labelledby="navbarDropdown"> <a class="dropdown-item" href="/projects/">Projects</a> <div class="dropdown-divider"></div> <a class="dropdown-item" href="/competitions/">Competitions</a> </div> </li> <li class="nav-item "> <a class="nav-link" href="/cv/">CV</a> </li> <li class="toggle-container"> <button id="light-toggle" title="Change theme"> <i class="fa-solid fa-moon"></i> <i class="fa-solid fa-sun"></i> </button> </li> </ul> </div> </div> </nav> <progress id="progress" value="40" max="140" style="top: 57px;"> <div class="progress-container"> <span class="progress-bar"></span> </div> </progress> </header> <div class="container mt-5"> <div class="post"> <header class="post-header"> <h1 class="post-title">Projects</h1> <p 
class="post-description"></p> </header> <article> <div class="projects"> <div class="grid" style="position: relative; height: 803.703px;"> <div class="grid-sizer"></div> <div class="grid-item" style="position: absolute; left: 0px; top: 0px;"> <a href="/projects/rotary-pendulum-rl/"> <div class="card hoverable"> <figure> <picture> <source class="responsive-img-srcset" media="(max-width: 480px)" srcset="/assets/img/rotary_pybullet-480.webp"> <source class="responsive-img-srcset" media="(max-width: 800px)" srcset="/assets/img/rotary_pybullet-800.webp"> <source class="responsive-img-srcset" media="(max-width: 1400px)" srcset="/assets/img/rotary_pybullet-1400.webp"> <img src="/assets/img/rotary_pybullet.jpg" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture> </figure> <div class="card-body"> <h4 class="card-title">Rotary Pendulum RL</h4> <p class="card-text">Open Source project aimed at controlling a real life rotary pendulum using RL algorithms</p> <div class="row ml-1 mr-1 p-0"> </div> </div> </div> </a> </div> <div class="grid-sizer"></div> <div class="grid-item" style="position: absolute; left: 260px; top: 0px;"> <a href="https://github.com/PeriniM/DQN-SwingUp" rel="external nofollow noopener" target="_blank"> <div class="card hoverable"> <figure> <picture> <source class="responsive-img-srcset" media="(max-width: 480px)" srcset="/assets/img/value-policy-heatmaps-480.webp"> <source class="responsive-img-srcset" media="(max-width: 800px)" srcset="/assets/img/value-policy-heatmaps-800.webp"> <source class="responsive-img-srcset" media="(max-width: 1400px)" srcset="/assets/img/value-policy-heatmaps-1400.webp"> <img src="/assets/img/value-policy-heatmaps.jpg" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture> </figure> <div class="card-body"> <h4 class="card-title">DQN Implementation from scratch</h4> <p 
class="card-text">Developed a Deep Q-Network algorithm to train a simple and double pendulum</p> <div class="row ml-1 mr-1 p-0"> </div> </div> </div> </a> </div> <div class="grid-sizer"></div> <div class="grid-item" style="position: absolute; left: 520px; top: 0px;"> <a href="https://github.com/PeriniM/Multi-Agents-HAED" rel="external nofollow noopener" target="_blank"> <div class="card hoverable"> <figure> <picture> <source class="responsive-img-srcset" media="(max-width: 480px)" srcset="/assets/img/multi_agents_haed.gif-480.webp"> <source class="responsive-img-srcset" media="(max-width: 800px)" srcset="/assets/img/multi_agents_haed.gif-800.webp"> <source class="responsive-img-srcset" media="(max-width: 1400px)" srcset="/assets/img/multi_agents_haed.gif-1400.webp"> <img src="/assets/img/multi_agents_haed.gif" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture> </figure> <div class="card-body"> <h4 class="card-title">Multi Agents HAED</h4> <p class="card-text">University project which focuses on simulating a multi-agent system to perform environment mapping. 
Agents, equipped with sensors, explore and record their surroundings, considering uncertainties in their readings.</p> <div class="row ml-1 mr-1 p-0"> </div> </div> </div> </a> </div> <div class="grid-sizer"></div> <div class="grid-item" style="position: absolute; left: 0px; top: 447.453px;"> <a href="/projects/wireless-esc-drone/"> <div class="card hoverable"> <figure> <picture> <source class="responsive-img-srcset" media="(max-width: 480px)" srcset="/assets/img/wireless_esc.gif-480.webp"> <source class="responsive-img-srcset" media="(max-width: 800px)" srcset="/assets/img/wireless_esc.gif-800.webp"> <source class="responsive-img-srcset" media="(max-width: 1400px)" srcset="/assets/img/wireless_esc.gif-1400.webp"> <img src="/assets/img/wireless_esc.gif" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture> </figure> <div class="card-body"> <h4 class="card-title">Wireless ESC for Modular Drones</h4> <p class="card-text">Modular drone architecture proposal and proof of concept. The project received maximum grade.</p> <div class="row ml-1 mr-1 p-0"> </div> </div> </div> </a> </div> </div> </div> </article> </div> </div> <footer class="fixed-bottom"> <div class="container mt-0"> © Copyright 2023 Marco Perini. Powered by <a href="https://jekyllrb.com/" target="_blank" rel="external nofollow noopener">Jekyll</a> with <a href="https://github.com/alshedivat/al-folio" rel="external nofollow noopener" target="_blank">al-folio</a> theme. Hosted by <a href="https://pages.github.com/" target="_blank" rel="external nofollow noopener">GitHub Pages</a>. 
</div> </footer> <script src="https://cdn.jsdelivr.net/npm/jquery@3.6.0/dist/jquery.min.js" integrity="sha256-/xUj+3OJU5yExlq6GSYGSHk7tPXikynS7ogEvDej/m4=" crossorigin="anonymous"></script> <script src="/assets/js/bootstrap.bundle.min.js"></script> <script src="https://cdn.jsdelivr.net/npm/mdbootstrap@4.20.0/js/mdb.min.js" integrity="sha256-NdbiivsvWt7VYCt6hYNT3h/th9vSTL4EDWeGs5SN3DA=" crossorigin="anonymous"></script> <script defer="" src="https://cdn.jsdelivr.net/npm/masonry-layout@4.2.2/dist/masonry.pkgd.min.js" integrity="sha256-Nn1q/fx0H7SNLZMQ5Hw5JLaTRZp0yILA/FRexe19VdI=" crossorigin="anonymous"></script> <script defer="" src="https://cdn.jsdelivr.net/npm/imagesloaded@4/imagesloaded.pkgd.min.js"></script> <script defer="" src="/assets/js/masonry.js" type="text/javascript"></script> <script defer="" src="https://cdn.jsdelivr.net/npm/medium-zoom@1.0.8/dist/medium-zoom.min.js" integrity="sha256-7PhEpEWEW0XXQ0k6kQrPKwuoIomz8R8IYyuU1Qew4P8=" crossorigin="anonymous"></script> <script defer="" src="/assets/js/zoom.js?7b30caa5023af4af8408a472dc4e1ebb"></script> <script defer="" src="https://unpkg.com/bootstrap-table@1.22.1/dist/bootstrap-table.min.js"></script> <script src="/assets/js/no_defer.js?d633890033921b33e0ceb13d22340a9c"></script> <script defer="" src="/assets/js/common.js?acdb9690d7641b2f8d40529018c71a01"></script> <script defer="" src="/assets/js/copy_code.js?07b8786bab9b4abe90d10e61f7d12ff7" type="text/javascript"></script> <script async="" src="https://d1bxh8uas1mnw7.cloudfront.net/assets/embed.js"></script> <script async="" src="https://badge.dimensions.ai/badge.js"></script> <script type="text/javascript">window.MathJax={tex:{tags:"ams"}};</script> <script defer="" type="text/javascript" id="MathJax-script" src="https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/tex-mml-chtml.js"></script> <script defer="" src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script> <script async="" 
src="https://www.googletagmanager.com/gtag/js?id=G-W66CHGTB05"></script> <script>function gtag(){window.dataLayer.push(arguments)}window.dataLayer=window.dataLayer||[],gtag("js",new Date),gtag("config","G-W66CHGTB05");</script> <script type="text/javascript">function progressBarSetup(){"max"in document.createElement("progress")?(initializeProgressElement(),$(document).on("scroll",function(){progressBar.attr({value:getCurrentScrollPosition()})}),$(window).on("resize",initializeProgressElement)):(resizeProgressBar(),$(document).on("scroll",resizeProgressBar),$(window).on("resize",resizeProgressBar))}function getCurrentScrollPosition(){return $(window).scrollTop()}function initializeProgressElement(){let e=$("#navbar").outerHeight(!0);$("body").css({"padding-top":e}),$("progress-container").css({"padding-top":e}),progressBar.css({top:e}),progressBar.attr({max:getDistanceToScroll(),value:getCurrentScrollPosition()})}function getDistanceToScroll(){return $(document).height()-$(window).height()}function resizeProgressBar(){progressBar.css({width:getWidthPercentage()+"%"})}function getWidthPercentage(){return getCurrentScrollPosition()/getDistanceToScroll()*100}const progressBar=$("#progress");window.onload=function(){setTimeout(progressBarSetup,50)};</script>  <div class="hiddendiv common"></div><iframe id="LgIoc6pK" frameborder="0" src="chrome-extension://ekhagklcjbdpajgpjgmbionohlpdbjgc/translateSandbox/translateSandbox.html" style="width: 0px; height: 0px; display: none;"></iframe></body><grammarly-desktop-integration data-grammarly-shadow-root="true"></grammarly-desktop-integration></html>
+"""
+
+def extract_data(html: str) -> dict:
+    from bs4 import BeautifulSoup
+    import re
+
+    # Parse the HTML content using BeautifulSoup
+    soup = BeautifulSoup(html, 'html.parser')
+
+    # Initialize the list to hold project data
+    projects = []
+
+    # Find all anchor tags that contain project information
+    for a_tag in soup.find_all('a', href=True):
+        # Extract the text content of the anchor tag
+        text_content = a_tag.get_text(strip=True)
+        
+        # Use regex to split the text content into title and description
+        match = re.match(r'(.+?)\s(.+)', text_content)
+        if match:
+            title = match.group(1)
+            description = match.group(2)
+            
+            # Append the project data to the list
+            projects.append({
+                'title': title,
+                'description': description
+            })
+
+    # Structure the data according to the specified JSON schema
+    result = {
+        'title': 'Projects',
+        'type': 'object',
+        'properties': {
+            'projects': {
+                'title': 'Projects',
+                'type': 'array',
+                'items': projects
+            }
+        },
+        'required': ['projects'],
+        'definitions': {
+            'Project': {
+                'title': 'Project',
+                'type': 'object',
+                'properties': {
+                    'title': {
+                        'title': 'Title',
+                        'description': 'The title of the project',
+                        'type': 'string'
+                    },
+                    'description': {
+                        'title': 'Description',
+                        'description': 'The description of the project',
+                        'type': 'string'
+                    }
+                },
+                'required': ['title', 'description']
+            }
+        }
+    }
+
+    return result
+
+def create_sandbox_and_execute(function_code, html_doc):
+        # Create a sandbox environment
+        sandbox_globals = {
+            'BeautifulSoup': BeautifulSoup,
+            're': re,
+            '__builtins__': __builtins__,
+        }
+        
+        # Capture stdout
+        old_stdout = sys.stdout
+        sys.stdout = StringIO()
+        
+        try:
+            # Execute the extract_data function with the provided HTML
+            result = extract_data(html_doc)
+            
+            return True, result
+        except Exception as e:
+            return False, f"Error during execution: {str(e)}"
+        finally:
+            # Restore stdout
+            sys.stdout = old_stdout
+            
+            
+#execution_success, execution_result = create_sandbox_and_execute(generated_code, html)
+from langchain_core.pydantic_v1 import BaseModel, Field
+from typing import List
+
+class Project(BaseModel):
+    title: str = Field(description="The title of the project")
+    description: str = Field(description="The description of the project")
+
+class Projects(BaseModel):
+    projects: List[Project]
+
+
+def transform_schema(pydantic_schema):
+    def process_properties(properties):
+        result = {}
+        for key, value in properties.items():
+            if 'type' in value:
+                if value['type'] == 'array':
+                    if '$ref' in value['items']:
+                        ref_key = value['items']['$ref'].split('/')[-1]
+                        result[key] = [process_properties(pydantic_schema['definitions'][ref_key]['properties'])]
+                    else:
+                        result[key] = [value['items']['type']]
+                else:
+                    result[key] = {
+                        "type": value['type'],
+                        "description": value.get('description', '')
+                    }
+            elif '$ref' in value:
+                ref_key = value['$ref'].split('/')[-1]
+                result[key] = process_properties(pydantic_schema['definitions'][ref_key]['properties'])
+        return result
+
+    return process_properties(pydantic_schema['properties'])
+
+
+data_schema = Projects.schema()
+transformed_schema = transform_schema(data_schema)
+print(transformed_schema)
\ No newline at end of file
diff --git a/scrapegraphai/graphs/code_generator_graph.py b/scrapegraphai/graphs/code_generator_graph.py
index c4fd6d7a..6dc769b0 100644
--- a/scrapegraphai/graphs/code_generator_graph.py
+++ b/scrapegraphai/graphs/code_generator_graph.py
@@ -105,25 +105,25 @@ class CodeGeneratorGraph(AbstractGraph):
         )
         
         html_analyzer_node = HtmlAnalyzerNode(
-            input="refined_prompt & doc",
-            output=["html_info"],
+            input="refined_prompt & original_html",
+            output=["html_info", "reduced_html"],
             node_config={
                 "llm_model": self.llm_model,
                 "additional_info": self.config.get("additional_info"),
-                "schema": self.schema
+                "schema": self.schema,
+                "reduction": self.config.get("reduction", 0)
             }
         )
         
         generate_code_node = GenerateCodeNode(
-            input="user_prompt & refined_prompt & html_info & doc & answer",
+            input="user_prompt & refined_prompt & html_info & reduced_html & answer",
             output=["generated_code"],
             node_config={
+                "library": self.library,
                 "llm_model": self.llm_model,
                 "additional_info": self.config.get("additional_info"),
                 "schema": self.schema
-            },
-            library=self.library,
-            website=self.source
+            }
         )
 
         return BaseGraph(
@@ -139,7 +139,7 @@ class CodeGeneratorGraph(AbstractGraph):
                 (fetch_node, parse_node),
                 (parse_node, generate_validation_answer_node),
                 (generate_validation_answer_node, prompt_refier_node),
-                (prompt_refier_node, html_analyzer_node)
+                (prompt_refier_node, html_analyzer_node),
                 (html_analyzer_node, generate_code_node)
             ],
             entry_point=fetch_node,
@@ -157,4 +157,4 @@ class CodeGeneratorGraph(AbstractGraph):
         inputs = {"user_prompt": self.prompt, self.input_key: self.source}
         self.final_state, self.execution_info = self.graph.execute(inputs)
 
-        return self.final_state.get("code", "No code created.")
+        return self.final_state.get("generated_code", "No code created.")
diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py
index b3bb8288..3f73ede2 100644
--- a/scrapegraphai/nodes/generate_code_node.py
+++ b/scrapegraphai/nodes/generate_code_node.py
@@ -17,6 +17,8 @@ import re
 from tqdm import tqdm
 from .base_node import BaseNode
 from pydantic import ValidationError
+from ..utils import transform_schema
+from jsonschema import validate, ValidationError
 
 
 class GenerateCodeNode(BaseNode):
@@ -126,20 +128,21 @@ class GenerateCodeNode(BaseNode):
         user_prompt = input_data[0] #       get user prompt
         refined_prompt = input_data[1] #    get refined prompt
         html_info = input_data[2] #         get html analysis
-        doc = input_data[3] #               get html code
+        reduced_html = input_data[3] #               get html code
         answer = input_data[4] #            get answer generated from the generate answer node for verification
         
         if self.node_config.get("schema", None) is not None:
             
-            self.output_schema = self.node_config["schema"] #          get JSON output schema
+            self.output_schema = self.node_config["schema"].schema() #          get JSON output schema
+            self.simplefied_schema = transform_schema(self.output_schema) #          get JSON output schema
         
             prompt = PromptTemplate(
                 template=template_code_generator,
                 partial_variables={
                     "user_input": user_prompt,
-                    "json_schema": self.output_schema.schema,
+                    "json_schema": str(self.simplefied_schema),
                     "initial_analysis": refined_prompt,
-                    "html_code": doc,
+                    "html_code": reduced_html,
                     "html_analysis": html_info
                 })
 
@@ -150,6 +153,7 @@ class GenerateCodeNode(BaseNode):
             
             # syntax check
             print("\Checking code syntax...")
+            generated_code = self.extract_code(generated_code)
             syntax_valid, syntax_message = self.syntax_check(generated_code)
             
             if not syntax_valid:
@@ -157,7 +161,7 @@ class GenerateCodeNode(BaseNode):
             
             # code execution
             print("\nExecuting code in sandbox...")
-            execution_success, execution_result = self.create_sandbox_and_execute(generated_code, doc)
+            execution_success, execution_result = self.create_sandbox_and_execute(generated_code, reduced_html)
             
             if not execution_success:
                 print(f"Executio failed: {execution_result}")
@@ -180,7 +184,7 @@ class GenerateCodeNode(BaseNode):
         except SyntaxError as e:
             return False, f"Syntax error: {str(e)}"
 
-    def create_sandbox_and_execute(function_code, html_doc):
+    def create_sandbox_and_execute(self, function_code, html_doc):
         # Create a sandbox environment
         sandbox_globals = {
             'BeautifulSoup': BeautifulSoup,
@@ -214,8 +218,18 @@ class GenerateCodeNode(BaseNode):
             
     def validate_dict(self, data: dict, schema):
         try:
-            schema(**data)  # Use the provided schema directly
+            validate(instance=data, schema=schema)
             return True, None
         except ValidationError as e:
             errors = e.errors()
-            return False, errors
\ No newline at end of file
+            return False, errors
+    
+    def extract_code(self, code: str) -> str:
+        # Pattern to match the code inside a code block
+        pattern = r'```(?:python)?\n(.*?)```'
+        
+        # Search for the code block, if present
+        match = re.search(pattern, code, re.DOTALL)
+        
+        # If a code block is found, return the code, otherwise return the entire string
+        return match.group(1) if match else code
\ No newline at end of file
diff --git a/scrapegraphai/nodes/html_analyzer_node.py b/scrapegraphai/nodes/html_analyzer_node.py
index 26f8fb17..cc8b4106 100644
--- a/scrapegraphai/nodes/html_analyzer_node.py
+++ b/scrapegraphai/nodes/html_analyzer_node.py
@@ -11,6 +11,7 @@ from langchain_mistralai import ChatMistralAI
 from langchain_community.chat_models import ChatOllama
 from tqdm import tqdm
 from .base_node import BaseNode
+from ..utils import reduce_html
 
 
 class HtmlAnalyzerNode(BaseNode):
@@ -100,7 +101,9 @@ class HtmlAnalyzerNode(BaseNode):
         This HTML analysis will be used to guide the final code generation process for a function that extracts data from the given HTML string.
         Please provide only the analysis with relevant, specific information based on this HTML code. Avoid vague statements and focus on exact details needed for accurate data extraction.
         
-        **Response**:
+        Focus on providing a concise, step-by-step analysis of the HTML structure and the key elements needed for data extraction. Do not include any code examples or implementation logic. Keep the response focused and avoid general statements.**
+        
+        **HTML Analysis for Data Extraction**:
         """
         
         template_html_analysis_with_context = """
@@ -133,7 +136,9 @@ class HtmlAnalyzerNode(BaseNode):
         This HTML analysis will be used to guide the final code generation process for a function that extracts data from the given HTML string.
         Please provide only the analysis with relevant, specific information based on this HTML code. Avoid vague statements and focus on exact details needed for accurate data extraction.
         
-        **Response**:
+        Focus on providing a concise, step-by-step analysis of the HTML structure and the key elements needed for data extraction. Do not include any code examples or implementation logic. Keep the response focused and avoid general statements.**
+        
+        **HTML Analysis for Data Extraction**:
         """
         
         self.logger.info(f"--- Executing {self.node_name} Node ---")
@@ -142,25 +147,27 @@ class HtmlAnalyzerNode(BaseNode):
         
         input_data = [state[key] for key in input_keys]
         refined_prompt = input_data[0] #                        get refined user prompt
-        doc = input_data[1] #                                   get HTML code
-            
+        html = input_data[1] #                                  get HTML code
+        
+        reduced_html = reduce_html(html[0].page_content, self.node_config.get("reduction", 0)) #                reduce HTML code
+        
         if self.additional_info is not None: #              use additional context if present
             prompt = PromptTemplate(
                 template=template_html_analysis_with_context,
                 partial_variables={"initial_analysis": refined_prompt,
-                                    "html_code": doc,
+                                    "html_code": reduced_html,
                                     "additional_context": self.additional_info})
         else:
             prompt = PromptTemplate(
                 template=template_html_analysis,
                 partial_variables={"initial_analysis": refined_prompt,
-                                    "html_code": doc})
+                                    "html_code": reduced_html})
 
         output_parser = StrOutputParser()
 
         chain =  prompt | self.llm_model | output_parser
         html_analysis = chain.invoke({})
 
-        state.update({self.output[0]: html_analysis})
+        state.update({self.output[0]: html_analysis, self.output[1]: reduced_html})
         return state
 
diff --git a/scrapegraphai/nodes/prompt_refiner_node.py b/scrapegraphai/nodes/prompt_refiner_node.py
index f9006f15..5aa93ba0 100644
--- a/scrapegraphai/nodes/prompt_refiner_node.py
+++ b/scrapegraphai/nodes/prompt_refiner_node.py
@@ -11,6 +11,7 @@ from langchain_mistralai import ChatMistralAI
 from langchain_community.chat_models import ChatOllama
 from tqdm import tqdm
 from .base_node import BaseNode
+from ..utils import transform_schema
 
 
 class PromptRefinerNode(BaseNode):
@@ -76,7 +77,7 @@ class PromptRefinerNode(BaseNode):
         """
 
         template_prompt_builder = """
-        **Task**: Analyze the user's request and the desired output schema to create a structured description for web scraping. Carefully examine both the user's request and the JSON schema to understand the desired data elements and their relationships.
+        **Task**: Analyze the user's request and the provided JSON schema to clearly map the desired data extraction. Break down the user's request into key components, and then explicitly connect these components to the corresponding elements within the JSON schema.
 
         **User's Request**:
         {user_input}
@@ -87,7 +88,14 @@ class PromptRefinerNode(BaseNode):
         ```
 
         **Analysis Instructions**:
-        Genarate the breakdown of the user request and link the  elements of the user's request with the json schema
+        1. **Break Down User Request:** 
+        * Clearly identify the core entities or data types the user is asking for.
+        * Highlight any specific attributes or relationships mentioned in the request.
+
+        2. **Map to JSON Schema**:
+        * For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.
+        * Explain how the schema structure accommodates the user's needs.
+        * If applicable, mention any schema elements that are not directly addressed in the user's request.
 
         This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.
         Please generate only the analysis and no other text.
@@ -96,8 +104,8 @@ class PromptRefinerNode(BaseNode):
         """
         
         template_prompt_builder_with_context = """
-        **Task**: Analyze the user's request, the desired output schema, and the additional context the user provided to create a structured description for web scraping. Carefully examine both the user's request and the JSON schema to understand the desired data elements and their relationships.
-
+        **Task**: Analyze the user's request, the provided JSON schema, and the additional context the user provided to clearly map the desired data extraction. Break down the user's request into key components, and then explicitly connect these components to the corresponding elements within the JSON schema.
+        
         **User's Request**:
         {user_input}
 
@@ -110,7 +118,14 @@ class PromptRefinerNode(BaseNode):
         {additional_context}
 
         **Analysis Instructions**:
-        Genarate the breakdown of the user request and link the  elements of the user's request with the json schema
+        1. **Break Down User Request:** 
+        * Clearly identify the core entities or data types the user is asking for.
+        * Highlight any specific attributes or relationships mentioned in the request.
+
+        2. **Map to JSON Schema**:
+        * For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.
+        * Explain how the schema structure accommodates the user's needs.
+        * If applicable, mention any schema elements that are not directly addressed in the user's request.
 
         This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.
         Please generate only the analysis and no other text.
@@ -120,26 +135,23 @@ class PromptRefinerNode(BaseNode):
         
         self.logger.info(f"--- Executing {self.node_name} Node ---")
 
-        input_keys = self.get_input_keys(state)
-        
-        input_data = [state[key] for key in input_keys]
-        user_prompt = input_data[0] #                           get user prompt
+        user_prompt = state['user_prompt'] #                            get user prompt
 
         if self.node_config.get("schema", None) is not None:
 
-            self.data_schema = self.node_config["schema"] #          get JSON schema
+            self.simplefied_schema = transform_schema(self.node_config["schema"].schema()) #             get JSON schema
             
-            if self.additional_info is not None: #              use additional context if present
+            if self.additional_info is not None: #                      use additional context if present
                 prompt = PromptTemplate(
                     template=template_prompt_builder_with_context,
                     partial_variables={"user_input": user_prompt,
-                                        "json_schema": self.data_schema.schema(),
+                                        "json_schema": str(self.simplefied_schema),
                                         "additional_context": self.additional_info})
             else:
                 prompt = PromptTemplate(
                     template=template_prompt_builder,
                     partial_variables={"user_input": user_prompt,
-                                        "json_schema": self.schema.schema()})
+                                        "json_schema": str(self.simplefied_schema)})
 
             output_parser = StrOutputParser()
 
diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py
index fbd03800..feffa683 100644
--- a/scrapegraphai/utils/__init__.py
+++ b/scrapegraphai/utils/__init__.py
@@ -7,7 +7,7 @@ from .prettify_exec_info import prettify_exec_info
 from .proxy_rotation import Proxy, parse_or_search_proxy, search_proxy_servers
 from .save_audio_from_bytes import save_audio_from_bytes
 from .sys_dynamic_import import dynamic_import, srcfile_import
-from .cleanup_html import cleanup_html
+from .cleanup_html import cleanup_html, reduce_html
 from .logging import *
 from .convert_to_md import convert_to_md
 from .screenshot_scraping.screenshot_preparation import (take_screenshot,
@@ -17,3 +17,4 @@ from .screenshot_scraping.screenshot_preparation import (take_screenshot,
 from .screenshot_scraping.text_detection import detect_text
 from .tokenizer import num_tokens_calculus
 from .split_text_into_chunks import split_text_into_chunks
+from .schema_trasform import transform_schema
\ No newline at end of file
diff --git a/scrapegraphai/utils/cleanup_html.py b/scrapegraphai/utils/cleanup_html.py
index 6c7c3c4c..1521fe01 100644
--- a/scrapegraphai/utils/cleanup_html.py
+++ b/scrapegraphai/utils/cleanup_html.py
@@ -2,7 +2,8 @@
 Module for minimizing the code
 """
 from urllib.parse import urljoin
-from bs4 import BeautifulSoup
+import re
+from bs4 import BeautifulSoup, Comment
 from minify_html import minify
 
 def cleanup_html(html_content: str, base_url: str) -> str:
@@ -53,3 +54,82 @@ def cleanup_html(html_content: str, base_url: str) -> str:
     else:
         raise ValueError(f"""No HTML body content found, please try setting the 'headless'
                          flag to False in the graph configuration. HTML content: {html_content}""")
+
+
+def minify_html(html):
+    # Remove comments
+    html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)
+    
+    # Remove whitespace between tags
+    html = re.sub(r'>\s+<', '><', html)
+    
+    # Remove whitespace at the beginning and end of tags
+    html = re.sub(r'\s+>', '>', html)
+    html = re.sub(r'<\s+', '<', html)
+    
+    # Collapse multiple whitespace characters into a single space
+    html = re.sub(r'\s+', ' ', html)
+    
+    # Remove spaces around equals signs in attributes
+    html = re.sub(r'\s*=\s*', '=', html)
+    
+    return html.strip()
+
+def reduce_html(html, reduction):
+    """
+    Reduces the size of the HTML content based on the specified level of reduction.
+    
+    Args:
+        html (str): The HTML content to reduce.
+        reduction (int): The level of reduction to apply to the HTML content.
+            0: minification only,
+            1: minification and removing unnecessary tags and attributes,
+            2: minification, removing unnecessary tags and attributes, simplifying text content, removing the head tag
+    
+    Returns:
+        str: The reduced HTML content based on the specified reduction level.
+    """
+    if reduction == 0:
+        return minify_html(html)
+    
+    soup = BeautifulSoup(html, 'html.parser')
+    
+    # Remove comments
+    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
+        comment.extract()
+    
+    # Remove script and style tag contents, but keep the tags
+    for tag in soup(['script', 'style']):
+        tag.string = ""
+    
+    # Remove unnecessary attributes, but keep class, id, href and src
+    attrs_to_keep = ['class', 'id', 'href', 'src']
+    for tag in soup.find_all(True):
+        for attr in list(tag.attrs):
+            if attr not in attrs_to_keep:
+                del tag[attr]
+                
+    if reduction == 1:
+        return minify_html(str(soup))
+    
+    # Remove script and style tags completely
+    for tag in soup(['script', 'style']):
+        tag.decompose()
+    
+    # Focus only on the body
+    body = soup.body
+    if not body:
+        return "No <body> tag found in the HTML"
+    
+    # Simplify text content
+    for tag in body.find_all(string=True):
+        if tag.parent.name not in ['script', 'style']:
+            tag.replace_with(re.sub(r'\s+', ' ', tag.strip())[:20])
+    
+    # Generate reduced HTML
+    reduced_html = str(body)
+    
+    # Apply minification
+    reduced_html = minify_html(reduced_html)
+    
+    return reduced_html
\ No newline at end of file
diff --git a/scrapegraphai/utils/schema_trasform.py b/scrapegraphai/utils/schema_trasform.py
new file mode 100644
index 00000000..af752470
--- /dev/null
+++ b/scrapegraphai/utils/schema_trasform.py
@@ -0,0 +1,36 @@
+"""
+This utility function transforms the pydantic schema into a more comprehensible schema.
+"""
+
+def transform_schema(pydantic_schema):
+    """
+    Transform the pydantic schema into a more comprehensible JSON schema.
+    
+    Args:
+        pydantic_schema (dict): The pydantic schema.
+    
+    Returns:
+        dict: The transformed JSON schema.
+    """
+    
+    def process_properties(properties):
+        result = {}
+        for key, value in properties.items():
+            if 'type' in value:
+                if value['type'] == 'array':
+                    if '$ref' in value['items']:
+                        ref_key = value['items']['$ref'].split('/')[-1]
+                        result[key] = [process_properties(pydantic_schema['definitions'][ref_key]['properties'])]
+                    else:
+                        result[key] = [value['items']['type']]
+                else:
+                    result[key] = {
+                        "type": value['type'],
+                        "description": value.get('description', '')
+                    }
+            elif '$ref' in value:
+                ref_key = value['$ref'].split('/')[-1]
+                result[key] = process_properties(pydantic_schema['definitions'][ref_key]['properties'])
+        return result
+
+    return process_properties(pydantic_schema['properties'])
\ No newline at end of file

From afa00d1594b655908ea911e8f2643f79117db84b Mon Sep 17 00:00:00 2001
From: Matteo Vedovati <matteo.vedovati.77@gmail.com>
Date: Sat, 21 Sep 2024 13:02:16 +0200
Subject: [PATCH 13/27] Reasoning loop created

---
 scrapegraphai/nodes/generate_code_node.py | 340 ++++++++++++++++++----
 1 file changed, 286 insertions(+), 54 deletions(-)

diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py
index 3f73ede2..77fc6378 100644
--- a/scrapegraphai/nodes/generate_code_node.py
+++ b/scrapegraphai/nodes/generate_code_node.py
@@ -64,6 +64,15 @@ class GenerateCodeNode(BaseNode):
         )
 
         self.additional_info = node_config.get("additional_info")
+        
+        self.max_iterations = node_config.get("max_iterations", {
+            "overall": 10,
+            "syntax": 3,
+            "execution": 3,
+            "validation": 3
+        })
+        
+        self.output_schema = node_config.get("schema").schema() #          get JSON output schema
 
     def execute(self, state: dict) -> dict:
         """
@@ -80,7 +89,107 @@ class GenerateCodeNode(BaseNode):
             KeyError: If the input keys are not found in the state, indicating
                       that the necessary information for generating an answer is missing.
         """
+        
+        self.logger.info(f"--- Executing {self.node_name} Node ---")
 
+        input_keys = self.get_input_keys(state)
+        
+        input_data = [state[key] for key in input_keys]
+        
+        user_prompt = input_data[0] #       get user prompt
+        refined_prompt = input_data[1] #    get refined prompt
+        html_info = input_data[2] #         get html analysis
+        reduced_html = input_data[3] #      get html code
+        answer = input_data[4] #            get answer generated from the generate answer node for verification
+        
+        self.raw_html = state['original_html'][0].page_content
+        
+        simplefied_schema = str(transform_schema(self.output_schema)) #          get JSON output schema
+        
+        reasoning_state = {
+            "user_input": user_prompt,
+            "json_schema": simplefied_schema,
+            "initial_analysis": refined_prompt,
+            "html_code": reduced_html,
+            "html_analysis": html_info,
+            "generated_code": "",
+            "execution_result": None,
+            "errors": {
+                "syntax": [],
+                "execution": [],
+                "validation": []
+            },
+            "iteration": 0
+        }
+    
+    
+        final_state = self.overall_reasoning_loop(reasoning_state)
+        
+        state.update({self.output[0]: final_state["generated_code"]})
+        return state
+    
+    def overall_reasoning_loop(self, state: dict) -> dict:
+        
+        state["generated_code"] = self.generate_initial_code(state)
+        
+        while state["iteration"] < self.max_iterations["overall"]:
+            state["iteration"] += 1
+            
+            state = self.syntax_reasoning_loop(state)
+            if state["errors"]["syntax"]:
+                continue
+            
+            state = self.execution_reasoning_loop(state)
+            if state["errors"]["execution"]:
+                continue
+            
+            state = self.validation_reasoning_loop(state)
+            if state["errors"]["validation"]:
+                continue
+            
+            # If we've made it here, the code is valid and produces the correct output
+            break
+        
+        return state
+    
+    def syntax_reasoning_loop(self, state: dict) -> dict:
+        for _ in range(self.max_iterations["syntax"]):
+            syntax_valid, syntax_message = self.syntax_check(state["generated_code"])
+            if syntax_valid:
+                state["errors"]["syntax"] = []
+                return state
+            
+            state["errors"]["syntax"] = [syntax_message]
+            analysis = self.syntax_focused_analysis(state)
+            state["generated_code"] = self.syntax_focused_code_generation(state, analysis)
+        return state
+    
+    def execution_reasoning_loop(self, state: dict, raw_html: str) -> dict:
+        for _ in range(self.max_iterations["execution"]):
+            execution_success, execution_result = self.create_sandbox_and_execute(state["generated_code"], raw_html)
+            if execution_success:
+                state["execution_result"] = execution_result
+                state["errors"]["execution"] = []
+                return state
+            
+            state["errors"]["execution"] = [execution_result]
+            analysis = self.execution_focused_analysis(state)
+            state["generated_code"] = self.execution_focused_code_generation(state, analysis)
+        return state
+    
+    def validation_reasoning_loop(self, state: dict) -> dict:
+        for _ in range(self.max_iterations["validation"]):
+            validation, errors = self.validate_dict(state["execution_result"], self.output_schema.schema())
+            if validation:
+                state["errors"]["validation"] = []
+                return state
+            
+            state["errors"]["validation"] = errors
+            analysis = self.validation_focused_analysis(state)
+            state["generated_code"] = self.validation_focused_code_generation(state, analysis)
+        return state
+    
+    def generate_initial_code(self, state: dict) -> str:
         template_code_generator = """
         **Task**: Create a Python function named `extract_data(html: str) -> dict()` using BeautifulSoup that extracts relevant information from the given HTML code string and returns it in a dictionary matching the Desired JSON Output Schema.
 
@@ -119,64 +228,187 @@ class GenerateCodeNode(BaseNode):
         **Response**:
         """
         
-        self.logger.info(f"--- Executing {self.node_name} Node ---")
+        prompt = PromptTemplate(
+            template=template_code_generator,
+            partial_variables={
+                "user_input": state["user_input"],
+                "json_schema": state["json_schema"],
+                "initial_analysis": state["initial_analysis"],
+                "html_code": state["html_code"],
+                "html_analysis": state["html_analysis"]
+            })
 
-        input_keys = self.get_input_keys(state)
-        
-        input_data = [state[key] for key in input_keys]
-        
-        user_prompt = input_data[0] #       get user prompt
-        refined_prompt = input_data[1] #    get refined prompt
-        html_info = input_data[2] #         get html analysis
-        reduced_html = input_data[3] #               get html code
-        answer = input_data[4] #            get answer generated from the generate answer node for verification
-        
-        if self.node_config.get("schema", None) is not None:
-            
-            self.output_schema = self.node_config["schema"].schema() #          get JSON output schema
-            self.simplefied_schema = transform_schema(self.output_schema) #          get JSON output schema
-        
-            prompt = PromptTemplate(
-                template=template_code_generator,
-                partial_variables={
-                    "user_input": user_prompt,
-                    "json_schema": str(self.simplefied_schema),
-                    "initial_analysis": refined_prompt,
-                    "html_code": reduced_html,
-                    "html_analysis": html_info
-                })
+        output_parser = StrOutputParser()
 
-            output_parser = StrOutputParser()
-
-            chain =  prompt | self.llm_model | output_parser
-            generated_code = chain.invoke({})
-            
-            # syntax check
-            print("\Checking code syntax...")
-            generated_code = self.extract_code(generated_code)
-            syntax_valid, syntax_message = self.syntax_check(generated_code)
-            
-            if not syntax_valid:
-                print(f"Syntax not valid: {syntax_message}")
-            
-            # code execution
-            print("\nExecuting code in sandbox...")
-            execution_success, execution_result = self.create_sandbox_and_execute(generated_code, reduced_html)
-            
-            if not execution_success:
-                print(f"Executio failed: {execution_result}")
-                
-            print("Code executed successfully.")
-            print(f"Execution result:\n{execution_result}")
-            
-            validation, errors = self.validate_dict(execution_result, self.output_schema)
-            if not validation:
-                print(f"Output does not match the schema: {errors}")
-            
+        chain =  prompt | self.llm_model | output_parser
+        generated_code = chain.invoke({})
+        return generated_code
+    
+    def syntax_focused_analysis(self, state: dict) -> str:
+        template = """
+        The current code has encountered a syntax error. Here are the details:
         
-        state.update({self.output[0]: generated_code})
-        return state
+        Current Code:
+        ```python
+        {generated_code}
+        ```
+        
+        Syntax Error:
+        {errors}
+        
+        Please analyze in detail the syntax error and suggest a fix. Focus only on correcting the syntax issue while ensuring the code still meets the original requirements.
+        
+        Provide your analysis and suggestions for fixing the error. DO NOT generate any code in your response.
+        """
+        
+        prompt = PromptTemplate(template=template, input_variables=["generated_code", "errors"])
+        chain = prompt | self.llm_model | StrOutputParser()
+        return chain.invoke({
+            "generated_code": state["generated_code"],
+            "errors": state["errors"]["syntax"]
+        })
+    
+    def syntax_focused_code_generation(self, state: dict, analysis: str) -> str:
+        template = """
+        Based on the following analysis of a syntax error, please generate the corrected code, following the suggested fix.:
 
+        Error Analysis:
+        {analysis}
+
+        Original Code:
+        ```python
+        {generated_code}
+        ```
+
+        Generate the corrected code, applying the suggestions from the analysis. Output ONLY the corrected Python code, WITHOUT ANY ADDITIONAL TEXT.
+        """
+
+        prompt = PromptTemplate(template=template, input_variables=["analysis", "generated_code"])
+        chain = prompt | self.llm_model | StrOutputParser()
+        return chain.invoke({
+            "analysis": analysis,
+            "generated_code": state["generated_code"]
+        })
+    
+    def execution_focused_analysis(self, state: dict) -> str:
+        """Ask the LLM for a prose diagnosis of the recorded execution error, given the code and the HTML context."""
+        template = """
+        The current code has encountered an execution error. Here are the details:
+        
+        **Current Code**:
+        ```python
+        {generated_code}
+        ```
+        
+        **Execution Error**:
+        {errors}
+        
+        **HTML Code**:
+        ```html
+        {html_code}
+        ```
+
+        **HTML Structure Analysis**:
+        {html_analysis}
+        
+        Please analyze the execution error and suggest a fix. Focus only on correcting the execution issue while ensuring the code still meets the original requirements and maintains correct syntax.
+        The suggested fix should address the execution error and ensure the function can successfully extract the required data from the provided HTML structure. Be sure to be precise and specific in your analysis.
+        
+        Provide your analysis and suggestions for fixing the error. DO NOT generate any code in your response.
+        """
+        pipeline = PromptTemplate(template=template, input_variables=["generated_code", "errors", "html_code", "html_analysis"]) | self.llm_model | StrOutputParser()
+        template_inputs = {
+            "generated_code": state["generated_code"],
+            "errors": state["errors"]["execution"],
+            "html_code": state["html_code"],
+            "html_analysis": state["html_analysis"]
+        }
+        return pipeline.invoke(template_inputs)
+    
+    def execution_focused_code_generation(self, state: dict, analysis: str) -> str:
+        """Ask the LLM to rewrite the current code following the given execution-error analysis."""
+        template = """
+        Based on the following analysis of an execution error, please generate the corrected code:
+
+        Error Analysis:
+        {analysis}
+
+        Original Code:
+        ```python
+        {generated_code}
+        ```
+
+        Generate the corrected code, applying the suggestions from the analysis. Output ONLY the corrected Python code, WITHOUT ANY ADDITIONAL TEXT.
+        """
+        pipeline = PromptTemplate(template=template, input_variables=["analysis", "generated_code"]) | self.llm_model | StrOutputParser()
+        template_inputs = {
+            "analysis": analysis,
+            "generated_code": state["generated_code"]
+        }
+        return pipeline.invoke(template_inputs)
+    
+    def validation_focused_analysis(self, state: dict) -> str:
+        """Ask the LLM for a prose analysis of why the execution result fails schema validation."""
+        template = """
+        The current code's output does not match the required schema. Here are the details:
+        
+        Current Code:
+        ```python
+        {generated_code}
+        ```
+        
+        Validation Errors:
+        {errors}
+        
+        Required Schema:
+        ```json
+        {json_schema}
+        ```
+        
+        Current Output:
+        {execution_result}
+        
+        Please analyze the validation errors and suggest fixes. Focus only on correcting the output to match the required schema while ensuring the code maintains correct syntax and execution.
+        
+        Provide your analysis and suggestions for fixing the error. DO NOT generate any code in your response.
+        """
+        pipeline = PromptTemplate(template=template, input_variables=["generated_code", "errors", "json_schema", "execution_result"]) | self.llm_model | StrOutputParser()
+        template_inputs = {
+            "generated_code": state["generated_code"],
+            "errors": state["errors"]["validation"],
+            "json_schema": state["json_schema"],
+            "execution_result": state["execution_result"]
+        }
+        return pipeline.invoke(template_inputs)
+    
+    def validation_focused_code_generation(self, state: dict, analysis: str) -> str:
+        template = """
+        Based on the following analysis of a validation error, please generate the corrected code:
+
+        Error Analysis:
+        {analysis}
+
+        Original Code:
+        ```python
+        {generated_code}
+        ```
+
+        Required Schema:
+        ```json
+        {json_schema}
+        ```
+
+        Generate the corrected code, applying the suggestions from the analysis and ensuring the output matches the required schema. Output ONLY the corrected Python code, WITHOUT ANY ADDITIONAL TEXT.
+        """
+
+        prompt = PromptTemplate(template=template, input_variables=["analysis", "generated_code", "json_schema"])
+        chain = prompt | self.llm_model | StrOutputParser()
+        return chain.invoke({
+            "analysis": analysis,
+            "generated_code": state["generated_code"],
+            "json_schema": state["json_schema"]
+        })
+    
     def syntax_check(self, code):
         try:
             ast.parse(code)

From 34590664f1582f7c22037772b1e86f5a92cac0b0 Mon Sep 17 00:00:00 2001
From: Matteo Vedovati <matteo.vedovati.77@gmail.com>
Date: Sat, 21 Sep 2024 16:36:22 +0200
Subject: [PATCH 14/27] Code generator updated version

---
 .../code_generation/simple_with_schema.py     |  10 +-
 scrapegraphai/graphs/code_generator_graph.py  |  12 +-
 scrapegraphai/nodes/generate_code_node.py     | 186 +++++++++++++++++-
 3 files changed, 192 insertions(+), 16 deletions(-)

diff --git a/examples/code_generation/simple_with_schema.py b/examples/code_generation/simple_with_schema.py
index c4803c62..58e12e0e 100644
--- a/examples/code_generation/simple_with_schema.py
+++ b/examples/code_generation/simple_with_schema.py
@@ -30,12 +30,18 @@ openai_key = os.getenv("OPENAI_APIKEY")
 graph_config = {
     "llm": {
         "api_key":openai_key,
-        "model": "openai/gpt-4o",\
+        "model": "openai/gpt-4o-mini",\
     },
-    "library": "beautifulsoup",
     "verbose": True,
     "headless": False,
     "reduction": 2,
+    "max_iterations": {
+        "overall": 10,
+        "syntax": 3,
+        "execution": 3,
+        "validation": 3,
+        "semantic": 3
+    },
 }
 
 # ************************************************
diff --git a/scrapegraphai/graphs/code_generator_graph.py b/scrapegraphai/graphs/code_generator_graph.py
index 6dc769b0..6eaa05af 100644
--- a/scrapegraphai/graphs/code_generator_graph.py
+++ b/scrapegraphai/graphs/code_generator_graph.py
@@ -49,8 +49,6 @@ class CodeGeneratorGraph(AbstractGraph):
 
     def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):
         
-        self.library = config['library']
-        
         super().__init__(prompt, config, source, schema)
 
         self.input_key = "url" if source.startswith("http") else "local_dir"
@@ -119,10 +117,16 @@ class CodeGeneratorGraph(AbstractGraph):
             input="user_prompt & refined_prompt & html_info & reduced_html & answer",
             output=["generated_code"],
             node_config={
-                "library": self.library,
                 "llm_model": self.llm_model,
                 "additional_info": self.config.get("additional_info"),
-                "schema": self.schema
+                "schema": self.schema,
+                "max_iterations": self.config.get("max_iterations", {
+                    "overall": 10,
+                    "syntax": 3,
+                    "execution": 3,
+                    "validation": 3,
+                    "semantic": 3
+                }),
             }
         )
 
diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py
index 77fc6378..1fef3c5c 100644
--- a/scrapegraphai/nodes/generate_code_node.py
+++ b/scrapegraphai/nodes/generate_code_node.py
@@ -1,8 +1,9 @@
 """
 GenerateCodeNode Module
 """
-from typing import List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 from langchain.prompts import PromptTemplate
+from langchain.output_parsers import ResponseSchema, StructuredOutputParser
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnableParallel
 from langchain_core.utils.pydantic import is_basemodel_subclass
@@ -19,6 +20,8 @@ from .base_node import BaseNode
 from pydantic import ValidationError
 from ..utils import transform_schema
 from jsonschema import validate, ValidationError
+import json
+import string
 
 
 class GenerateCodeNode(BaseNode):
@@ -69,10 +72,11 @@ class GenerateCodeNode(BaseNode):
             "overall": 10,
             "syntax": 3,
             "execution": 3,
-            "validation": 3
+            "validation": 3,
+            "semantic": 3
         })
         
-        self.output_schema = node_config.get("schema").schema() #          get JSON output schema
+        self.output_schema = node_config.get("schema") #          get JSON output schema
 
     def execute(self, state: dict) -> dict:
         """
@@ -104,7 +108,7 @@ class GenerateCodeNode(BaseNode):
         
         self.raw_html = state['original_html'][0].page_content
         
-        simplefied_schema = str(transform_schema(self.output_schema)) #          get JSON output schema
+        simplefied_schema = str(transform_schema(self.output_schema.schema())) #          get JSON output schema
         
         reasoning_state = {
             "user_input": user_prompt,
@@ -114,10 +118,12 @@ class GenerateCodeNode(BaseNode):
             "html_analysis": html_info,
             "generated_code": "",
             "execution_result": None,
+            "reference_answer": answer,
             "errors": {
                 "syntax": [],
                 "execution": [],
-                "validation": []
+                "validation": [],
+                "semantic": []
             },
             "iteration": 0
         }
@@ -131,6 +137,7 @@ class GenerateCodeNode(BaseNode):
     def overall_reasoning_loop(self, state: dict) -> dict:
         
         state["generated_code"] = self.generate_initial_code(state)
+        state["generated_code"] = self.extract_code(state["generated_code"])
         
         while state["iteration"] < self.max_iterations["overall"]:
             state["iteration"] += 1
@@ -147,6 +154,10 @@ class GenerateCodeNode(BaseNode):
             if state["errors"]["validation"]:
                 continue
             
+            state = self.semantic_comparison_loop(state)
+            if state["errors"]["semantic"]:
+                continue
+            
             # If we've made it here, the code is valid and produces the correct output
             break
         
@@ -162,11 +173,12 @@ class GenerateCodeNode(BaseNode):
             state["errors"]["syntax"] = [syntax_message]
             analysis = self.syntax_focused_analysis(state)
             state["generated_code"] = self.syntax_focused_code_generation(state, analysis)
+            state["generated_code"] = self.extract_code(state["generated_code"])
         return state
     
-    def execution_reasoning_loop(self, state: dict, raw_html: str) -> dict:
+    def execution_reasoning_loop(self, state: dict) -> dict:
         for _ in range(self.max_iterations["execution"]):
-            execution_success, execution_result = self.create_sandbox_and_execute(state["generated_code"], raw_html)
+            execution_success, execution_result = self.create_sandbox_and_execute(state["generated_code"])
             if execution_success:
                 state["execution_result"] = execution_result
                 state["errors"]["execution"] = []
@@ -175,6 +187,7 @@ class GenerateCodeNode(BaseNode):
             state["errors"]["execution"] = [execution_result]
             analysis = self.execution_focused_analysis(state)
             state["generated_code"] = self.execution_focused_code_generation(state, analysis)
+            state["generated_code"] = self.extract_code(state["generated_code"])
         return state
     
     def validation_reasoning_loop(self, state: dict) -> dict:
@@ -187,6 +200,20 @@ class GenerateCodeNode(BaseNode):
             state["errors"]["validation"] = errors
             analysis = self.validation_focused_analysis(state)
             state["generated_code"] = self.validation_focused_code_generation(state, analysis)
+            state["generated_code"] = self.extract_code(state["generated_code"])
+        return state
+    
+    def semantic_comparison_loop(self, state: dict) -> dict:
+        for _ in range(self.max_iterations["semantic"]):  # bounded retries: compare, then regenerate the code on mismatch
+            comparison_result = self.semantic_comparison(state["execution_result"], state["reference_answer"])
+            if comparison_result["are_semantically_equivalent"]:  # NOTE(review): truthiness test - a non-empty string verdict would also pass; confirm a real bool arrives here
+                state["errors"]["semantic"] = []
+                return state
+            # NOTE(review): state["execution_result"] is not refreshed after the code is regenerated below, so later iterations compare the same output again - confirm this is intended.
+            state["errors"]["semantic"] = comparison_result["differences"]
+            analysis = self.semantic_focused_analysis(state, comparison_result)
+            state["generated_code"] = self.semantic_focused_code_generation(state, analysis)
+            state["generated_code"] = self.extract_code(state["generated_code"])
         return state
     
     def generate_initial_code(self, state: dict) -> str:
@@ -409,6 +436,114 @@ class GenerateCodeNode(BaseNode):
             "json_schema": state["json_schema"]
         })
     
+    def semantic_comparison(self, generated_result: Any, reference_result: Any) -> Dict[str, Any]:
+        """
+        Compare the executed output with the reference answer.
+
+        Returns a dict with the keys ``are_semantically_equivalent`` (coerced to bool), ``differences`` and ``explanation``; the LLM is only queried when the normalized contents differ.
+        """
+        # Instantiate the output schema from the reference answer and dump it back to a plain dict.
+        reference_result_dict = self.output_schema(**reference_result).dict()
+        if are_content_equal(generated_result, reference_result_dict):
+            # Normalized content already matches: skip the LLM call.
+            return {"are_semantically_equivalent": True, "differences": [], "explanation": "The generated result and reference result are exactly equal."}
+
+        response_schemas = [
+            ResponseSchema(name="are_semantically_equivalent", type="boolean", description="Boolean indicating if the results are semantically equivalent"),
+            ResponseSchema(name="differences", description="List of semantic differences between the results, if any"),
+            ResponseSchema(name="explanation", description="Detailed explanation of the comparison and reasoning")
+        ]
+        output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
+
+        template = """
+        Compare the Generated Result with the Reference Result and determine if they are semantically equivalent:
+
+        Generated Result:
+        {generated_result}
+
+        Reference Result (Correct Output):
+        {reference_result}
+
+        Analyze the content, structure, and meaning of both results. They should be considered semantically equivalent if they convey the same information, even if the exact wording or structure differs.
+        If they are not semantically equivalent, identify what are the key differences in the Generated Result. The Reference Result should be considered the correct output, you need to pinpoint the problems in the Generated Result.
+
+        {format_instructions}
+
+        Human: Are the generated result and reference result semantically equivalent? If not, what are the key differences?
+
+        Assistant: Let's analyze the two results carefully:
+
+        """
+
+        prompt = PromptTemplate(template=template, input_variables=["generated_result", "reference_result"], partial_variables={"format_instructions": output_parser.get_format_instructions()})
+        comparison = (prompt | self.llm_model | output_parser).invoke({
+            "generated_result": json.dumps(generated_result, indent=2),
+            "reference_result": json.dumps(reference_result_dict, indent=2)
+        })
+        flag = comparison.get("are_semantically_equivalent")
+        if isinstance(flag, str):
+            # If the model quotes the verdict, a string such as "false" would be truthy for callers; map it to a real bool.
+            comparison["are_semantically_equivalent"] = flag.strip().lower() == "true"
+        return comparison
+    
+    def semantic_focused_analysis(self, state: dict, comparison_result: Dict[str, Any]) -> str:
+        """Ask the LLM for a prose analysis of the reported semantic differences; no code is requested."""
+        template = """
+        The current code's output is semantically different from the reference answer. Here are the details:
+        
+        Current Code:
+        ```python
+        {generated_code}
+        ```
+        
+        Semantic Differences:
+        {differences}
+        
+        Comparison Explanation:
+        {explanation}
+        
+        Please analyze these semantic differences and suggest how to modify the code to produce a result that is semantically equivalent to the reference answer. Focus on addressing the key differences while maintaining the overall structure and functionality of the code.
+        
+        Provide your analysis and suggestions for fixing the semantic differences. DO NOT generate any code in your response.
+        """
+        pipeline = PromptTemplate(template=template, input_variables=["generated_code", "differences", "explanation"]) | self.llm_model | StrOutputParser()
+        template_inputs = {
+            "generated_code": state["generated_code"],
+            "differences": json.dumps(comparison_result["differences"], indent=2),
+            "explanation": comparison_result["explanation"]
+        }
+        return pipeline.invoke(template_inputs)
+    
+    def semantic_focused_code_generation(self, state: dict, analysis: str) -> str:
+        """Ask the LLM to rewrite the current code so its output becomes equivalent to the reference answer."""
+        template = """
+        Based on the following analysis of semantic differences, please generate the corrected code:
+
+        Semantic Analysis:
+        {analysis}
+
+        Original Code:
+        ```python
+        {generated_code}
+        ```
+
+        Generated Result:
+        {generated_result}
+
+        Reference Result:
+        {reference_result}
+
+        Generate the corrected code, applying the suggestions from the analysis to make the output semantically equivalent to the reference result. Output ONLY the corrected Python code, WITHOUT ANY ADDITIONAL TEXT.
+        """
+        pipeline = PromptTemplate(template=template, input_variables=["analysis", "generated_code", "generated_result", "reference_result"]) | self.llm_model | StrOutputParser()
+        template_inputs = {
+            "analysis": analysis,
+            "generated_code": state["generated_code"],
+            "generated_result": json.dumps(state["execution_result"], indent=2),
+            "reference_result": json.dumps(state["reference_answer"], indent=2)
+        }
+        return pipeline.invoke(template_inputs)
+    
     def syntax_check(self, code):
         try:
             ast.parse(code)
@@ -416,7 +551,7 @@ class GenerateCodeNode(BaseNode):
         except SyntaxError as e:
             return False, f"Syntax error: {str(e)}"
 
-    def create_sandbox_and_execute(self, function_code, html_doc):
+    def create_sandbox_and_execute(self, function_code):
         # Create a sandbox environment
         sandbox_globals = {
             'BeautifulSoup': BeautifulSoup,
@@ -439,7 +574,7 @@ class GenerateCodeNode(BaseNode):
                 raise NameError("Function 'extract_data' not found in the generated code.")
             
             # Execute the extract_data function with the provided HTML
-            result = extract_data(html_doc)
+            result = extract_data(self.raw_html)
             
             return True, result
         except Exception as e:
@@ -464,4 +599,35 @@ class GenerateCodeNode(BaseNode):
         match = re.search(pattern, code, re.DOTALL)
         
         # If a code block is found, return the code, otherwise return the entire string
-        return match.group(1) if match else code
\ No newline at end of file
+        return match.group(1) if match else code
+
+def normalize_string(s: str) -> str:
+    """Lowercase *s*, delete ASCII punctuation, then collapse whitespace runs to single spaces and trim the ends."""
+    return ' '.join(s.lower().translate(str.maketrans('', '', string.punctuation)).split())
+
+def normalize_dict(d: dict) -> dict:
+    """
+    Return a canonical copy of ``d`` for order- and formatting-insensitive comparison.
+
+    - String values go through normalize_string.
+    - Nested dictionaries are normalized recursively and every dict is rebuilt in key order.
+    - Lists have their elements normalized (recursively) and are then sorted by ``repr``, because
+      dicts and mixed types do not support ``<`` and a plain sorted() would raise TypeError.
+    - Any other value (numbers, booleans, None) is kept unchanged.
+    """
+    def normalize_value(value):
+        if isinstance(value, str):
+            return normalize_string(value)
+        if isinstance(value, dict):
+            return normalize_dict(value)
+        if isinstance(value, list):
+            # repr is a stable sort key here: nested dicts were already rebuilt in sorted key order.
+            return sorted((normalize_value(item) for item in value), key=repr)
+        return value
+
+    normalized = {key: normalize_value(value) for key, value in d.items()}
+    return dict(sorted(normalized.items(), key=lambda item: str(item[0])))
+
+def are_content_equal(generated_result: dict, reference_result: dict) -> bool:
+    """Return True when both dictionaries compare equal after each has been passed through normalize_dict."""
+    return normalize_dict(generated_result) == normalize_dict(reference_result)
\ No newline at end of file

From f38c5e1d8f7cf9dd4e36ec2a1c4548b3b7e00551 Mon Sep 17 00:00:00 2001
From: Matteo Vedovati <matteo.vedovati.77@gmail.com>
Date: Sat, 21 Sep 2024 18:01:07 +0200
Subject: [PATCH 15/27] removed test code

---
 scrapegraphai/code_gen/code_exec_test.py   | 526 ---------------------
 scrapegraphai/code_gen/html_reduce.py      |  85 ----
 scrapegraphai/code_gen/script_genarated.py |  64 ---
 scrapegraphai/code_gen/script_runner.py    |   0
 4 files changed, 675 deletions(-)
 delete mode 100644 scrapegraphai/code_gen/code_exec_test.py
 delete mode 100644 scrapegraphai/code_gen/html_reduce.py
 delete mode 100644 scrapegraphai/code_gen/script_genarated.py
 delete mode 100644 scrapegraphai/code_gen/script_runner.py

diff --git a/scrapegraphai/code_gen/code_exec_test.py b/scrapegraphai/code_gen/code_exec_test.py
deleted file mode 100644
index 95d2eefd..00000000
--- a/scrapegraphai/code_gen/code_exec_test.py
+++ /dev/null
@@ -1,526 +0,0 @@
-import ast
-import sys
-from io import StringIO
-from bs4 import BeautifulSoup
-import re
-
-generated_code = "def extract_data(html: str) -> dict:\n    from bs4 import BeautifulSoup\n    import re\n\n    # Parse the HTML content using BeautifulSoup\n    soup = BeautifulSoup(html, 'html.parser')\n\n    # Initialize the projects list\n    projects = []\n\n    # Find all <a> tags that contain project entries\n    project_links = soup.find_all('a', href=True)\n\n    # Iterate through each project link to extract title and description\n    for link in project_links:\n        # Check if the link contains an image and text\n        img_tag = link.find('img')\n        if img_tag and link.string:\n            # Extract the full text and split it into title and description\n            full_text = link.string.strip()\n            # Use regex to separate title and description\n            match = re.match(r'^(.*?)(?:\\s*-\\s*(.*))?$', full_text)\n            if match:\n                title = match.group(1).strip()\n                description = match.group(2).strip() if match.group(2) else ''\n                \n                # Append the project data to the projects list\n                projects.append({\n                    'title': title,\n                    'description': description\n                })\n\n    # Return the structured data as a dictionary\n    return {\n        'projects': projects\n    }\n"
-html = """
-<html lang="en" class="" data-theme="dark"><head> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> <meta charset="utf-8"> <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"> <meta http-equiv="X-UA-Compatible" content="IE=edge"> <title>Projects | Marco Perini</title> <meta name="author" content="Marco Perini"> <meta name="description" content="Personal Porfolio Website "> <meta name="keywords" content="jekyll, jekyll-theme, academic-website, portfolio-website, robotics, machine-learning, computer-vision, artificial-intelligence, deep-learning, data-science, data-analysis, data-visualization, reinforcement-learning, computer-science, computer-graphics, computer-architecture, computer-networks, computer-security, computer-aided-design, computer-algebra, computer-alg, hardware"> <link rel="stylesheet" href="/assets/css/bootstrap.min.css?a4b3f509e79c54a512b890d73235ef04"> <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/mdbootstrap@4.20.0/css/mdb.min.css" integrity="sha256-jpjYvU3G3N6nrrBwXJoVEYI/0zw8htfFnhT9ljN3JJw=" crossorigin="anonymous"> <link defer="" rel="stylesheet" href="https://unpkg.com/bootstrap-table@1.22.1/dist/bootstrap-table.min.css"> <link rel="stylesheet" href="/assets/css/academicons.min.css?f0b7046b84e425c55f3463ac249818f5"> <link rel="stylesheet" type="text/css" href="https://fonts.googleapis.com/css?family=Roboto:300,400,500,700|Roboto+Slab:100,300,400,500,700|Material+Icons"> <link rel="stylesheet" href="/assets/css/jekyll-pygments-themes-github.css?19f3075a2d19613090fe9e16b564e1fe" media="none" id="highlight_theme_light"> <link rel="shortcut icon" href="data:image/svg+xml,<svg%20xmlns=%22http://www.w3.org/2000/svg%22%20viewBox=%220%200%20100%20100%22><text%20y=%22.9em%22%20font-size=%2290%22>%F0%9F%A6%BE</text></svg>"> <link rel="stylesheet" href="/assets/css/main.css?d41d8cd98f00b204e9800998ecf8427e"> <link rel="canonical" href="https://perinim.github.io/projects/"> <link 
rel="stylesheet" href="/assets/css/jekyll-pygments-themes-native.css?e74e74bf055e5729d44a7d031a5ca6a5" media="" id="highlight_theme_dark"> <script src="/assets/js/theme.js?96d6b3e1c3604aca8b6134c7afdd5db6"></script> <script src="/assets/js/dark_mode.js?9b17307bb950ffa2e34be0227f53558f"></script> <style type="text/css">/* Chart.js */
-@-webkit-keyframes chartjs-render-animation{from{opacity:0.99}to{opacity:1}}@keyframes chartjs-render-animation{from{opacity:0.99}to{opacity:1}}.chartjs-render-monitor{-webkit-animation:chartjs-render-animation 0.001s;animation:chartjs-render-animation 0.001s;}</style><script id="altmetric-embed-js" src="https://d1bxh8uas1mnw7.cloudfront.net/assets/altmetric_badges-2f3c1a827c4dee5fa0ff35ec229b9204ae106583cc99636c724152d1f7acea04.js"></script><style type="text/css">.medium-zoom-overlay{position:fixed;top:0;right:0;bottom:0;left:0;opacity:0;transition:opacity .3s;will-change:opacity}.medium-zoom--opened .medium-zoom-overlay{cursor:pointer;cursor:zoom-out;opacity:1}.medium-zoom-image{cursor:pointer;cursor:zoom-in;transition:transform .3s cubic-bezier(.2,0,.2,1)!important}.medium-zoom-image--hidden{visibility:hidden}.medium-zoom-image--opened{position:relative;cursor:pointer;cursor:zoom-out;will-change:transform}</style><style type="text/css">.CtxtMenu_InfoClose {  top:.2em; right:.2em;}
-.CtxtMenu_InfoContent {  overflow:auto; text-align:left; font-size:80%;  padding:.4em .6em; border:1px inset; margin:1em 0px;  max-height:20em; max-width:30em; background-color:#EEEEEE;  white-space:normal;}
-.CtxtMenu_Info.CtxtMenu_MousePost {outline:none;}
-.CtxtMenu_Info {  position:fixed; left:50%; width:auto; text-align:center;  border:3px outset; padding:1em 2em; background-color:#DDDDDD;  color:black;  cursor:default; font-family:message-box; font-size:120%;  font-style:normal; text-indent:0; text-transform:none;  line-height:normal; letter-spacing:normal; word-spacing:normal;  word-wrap:normal; white-space:nowrap; float:none; z-index:201;  border-radius: 15px;                     /* Opera 10.5 and IE9 */  -webkit-border-radius:15px;               /* Safari and Chrome */  -moz-border-radius:15px;                  /* Firefox */  -khtml-border-radius:15px;                /* Konqueror */  box-shadow:0px 10px 20px #808080;         /* Opera 10.5 and IE9 */  -webkit-box-shadow:0px 10px 20px #808080; /* Safari 3 & Chrome */  -moz-box-shadow:0px 10px 20px #808080;    /* Forefox 3.5 */  -khtml-box-shadow:0px 10px 20px #808080;  /* Konqueror */  filter:progid:DXImageTransform.Microsoft.dropshadow(OffX=2, OffY=2, Color="gray", Positive="true"); /* IE */}
-</style><style type="text/css">.CtxtMenu_MenuClose {  position:absolute;  cursor:pointer;  display:inline-block;  border:2px solid #AAA;  border-radius:18px;  -webkit-border-radius: 18px;             /* Safari and Chrome */  -moz-border-radius: 18px;                /* Firefox */  -khtml-border-radius: 18px;              /* Konqueror */  font-family: "Courier New", Courier;  font-size:24px;  color:#F0F0F0}
-.CtxtMenu_MenuClose span {  display:block; background-color:#AAA; border:1.5px solid;  border-radius:18px;  -webkit-border-radius: 18px;             /* Safari and Chrome */  -moz-border-radius: 18px;                /* Firefox */  -khtml-border-radius: 18px;              /* Konqueror */  line-height:0;  padding:8px 0 6px     /* may need to be browser-specific */}
-.CtxtMenu_MenuClose:hover {  color:white!important;  border:2px solid #CCC!important}
-.CtxtMenu_MenuClose:hover span {  background-color:#CCC!important}
-.CtxtMenu_MenuClose:hover:focus {  outline:none}
-</style><style type="text/css">.CtxtMenu_Menu {  position:absolute;  background-color:white;  color:black;  width:auto; padding:5px 0px;  border:1px solid #CCCCCC; margin:0; cursor:default;  font: menu; text-align:left; text-indent:0; text-transform:none;  line-height:normal; letter-spacing:normal; word-spacing:normal;  word-wrap:normal; white-space:nowrap; float:none; z-index:201;  border-radius: 5px;                     /* Opera 10.5 and IE9 */  -webkit-border-radius: 5px;             /* Safari and Chrome */  -moz-border-radius: 5px;                /* Firefox */  -khtml-border-radius: 5px;              /* Konqueror */  box-shadow:0px 10px 20px #808080;         /* Opera 10.5 and IE9 */  -webkit-box-shadow:0px 10px 20px #808080; /* Safari 3 & Chrome */  -moz-box-shadow:0px 10px 20px #808080;    /* Forefox 3.5 */  -khtml-box-shadow:0px 10px 20px #808080;  /* Konqueror */}
-.CtxtMenu_MenuItem {  padding: 1px 2em;  background:transparent;}
-.CtxtMenu_MenuArrow {  position:absolute; right:.5em; padding-top:.25em; color:#666666;  font-family: null; font-size: .75em}
-.CtxtMenu_MenuActive .CtxtMenu_MenuArrow {color:white}
-.CtxtMenu_MenuArrow.CtxtMenu_RTL {left:.5em; right:auto}
-.CtxtMenu_MenuCheck {  position:absolute; left:.7em;  font-family: null}
-.CtxtMenu_MenuCheck.CtxtMenu_RTL { right:.7em; left:auto }
-.CtxtMenu_MenuRadioCheck {  position:absolute; left: .7em;}
-.CtxtMenu_MenuRadioCheck.CtxtMenu_RTL {  right: .7em; left:auto}
-.CtxtMenu_MenuInputBox {  padding-left: 1em; right:.5em; color:#666666;  font-family: null;}
-.CtxtMenu_MenuInputBox.CtxtMenu_RTL {  left: .1em;}
-.CtxtMenu_MenuComboBox {  left:.1em; padding-bottom:.5em;}
-.CtxtMenu_MenuSlider {  left: .1em;}
-.CtxtMenu_SliderValue {  position:absolute; right:.1em; padding-top:.25em; color:#333333;  font-size: .75em}
-.CtxtMenu_SliderBar {  outline: none; background: #d3d3d3}
-.CtxtMenu_MenuLabel {  padding: 1px 2em 3px 1.33em;  font-style:italic}
-.CtxtMenu_MenuRule {  border-top: 1px solid #DDDDDD;  margin: 4px 3px;}
-.CtxtMenu_MenuDisabled {  color:GrayText}
-.CtxtMenu_MenuActive {  background-color: #606872;  color: white;}
-.CtxtMenu_MenuDisabled:focus {  background-color: #E8E8E8}
-.CtxtMenu_MenuLabel:focus {  background-color: #E8E8E8}
-.CtxtMenu_ContextMenu:focus {  outline:none}
-.CtxtMenu_ContextMenu .CtxtMenu_MenuItem:focus {  outline:none}
-.CtxtMenu_SelectionMenu {  position:relative; float:left;  border-bottom: none; -webkit-box-shadow:none; -webkit-border-radius:0px; }
-.CtxtMenu_SelectionItem {  padding-right: 1em;}
-.CtxtMenu_Selection {  right: 40%; width:50%; }
-.CtxtMenu_SelectionBox {  padding: 0em; max-height:20em; max-width: none;  background-color:#FFFFFF;}
-.CtxtMenu_SelectionDivider {  clear: both; border-top: 2px solid #000000;}
-.CtxtMenu_Menu .CtxtMenu_MenuClose {  top:-10px; left:-10px}
-</style><style id="MJX-CHTML-styles">
-mjx-container[jax="CHTML"] {
-  line-height: 0;
-}
-
-mjx-container [space="1"] {
-  margin-left: .111em;
-}
-
-mjx-container [space="2"] {
-  margin-left: .167em;
-}
-
-mjx-container [space="3"] {
-  margin-left: .222em;
-}
-
-mjx-container [space="4"] {
-  margin-left: .278em;
-}
-
-mjx-container [space="5"] {
-  margin-left: .333em;
-}
-
-mjx-container [rspace="1"] {
-  margin-right: .111em;
-}
-
-mjx-container [rspace="2"] {
-  margin-right: .167em;
-}
-
-mjx-container [rspace="3"] {
-  margin-right: .222em;
-}
-
-mjx-container [rspace="4"] {
-  margin-right: .278em;
-}
-
-mjx-container [rspace="5"] {
-  margin-right: .333em;
-}
-
-mjx-container [size="s"] {
-  font-size: 70.7%;
-}
-
-mjx-container [size="ss"] {
-  font-size: 50%;
-}
-
-mjx-container [size="Tn"] {
-  font-size: 60%;
-}
-
-mjx-container [size="sm"] {
-  font-size: 85%;
-}
-
-mjx-container [size="lg"] {
-  font-size: 120%;
-}
-
-mjx-container [size="Lg"] {
-  font-size: 144%;
-}
-
-mjx-container [size="LG"] {
-  font-size: 173%;
-}
-
-mjx-container [size="hg"] {
-  font-size: 207%;
-}
-
-mjx-container [size="HG"] {
-  font-size: 249%;
-}
-
-mjx-container [width="full"] {
-  width: 100%;
-}
-
-mjx-box {
-  display: inline-block;
-}
-
-mjx-block {
-  display: block;
-}
-
-mjx-itable {
-  display: inline-table;
-}
-
-mjx-row {
-  display: table-row;
-}
-
-mjx-row > * {
-  display: table-cell;
-}
-
-mjx-mtext {
-  display: inline-block;
-}
-
-mjx-mstyle {
-  display: inline-block;
-}
-
-mjx-merror {
-  display: inline-block;
-  color: red;
-  background-color: yellow;
-}
-
-mjx-mphantom {
-  visibility: hidden;
-}
-
-_::-webkit-full-page-media, _:future, :root mjx-container {
-  will-change: opacity;
-}
-
-mjx-assistive-mml {
-  position: absolute !important;
-  top: 0px;
-  left: 0px;
-  clip: rect(1px, 1px, 1px, 1px);
-  padding: 1px 0px 0px 0px !important;
-  border: 0px !important;
-  display: block !important;
-  width: auto !important;
-  overflow: hidden !important;
-  -webkit-touch-callout: none;
-  -webkit-user-select: none;
-  -khtml-user-select: none;
-  -moz-user-select: none;
-  -ms-user-select: none;
-  user-select: none;
-}
-
-mjx-assistive-mml[display="block"] {
-  width: 100% !important;
-}
-
-mjx-c::before {
-  display: block;
-  width: 0;
-}
-
-.MJX-TEX {
-  font-family: MJXZERO, MJXTEX;
-}
-
-.TEX-B {
-  font-family: MJXZERO, MJXTEX-B;
-}
-
-.TEX-I {
-  font-family: MJXZERO, MJXTEX-I;
-}
-
-.TEX-MI {
-  font-family: MJXZERO, MJXTEX-MI;
-}
-
-.TEX-BI {
-  font-family: MJXZERO, MJXTEX-BI;
-}
-
-.TEX-S1 {
-  font-family: MJXZERO, MJXTEX-S1;
-}
-
-.TEX-S2 {
-  font-family: MJXZERO, MJXTEX-S2;
-}
-
-.TEX-S3 {
-  font-family: MJXZERO, MJXTEX-S3;
-}
-
-.TEX-S4 {
-  font-family: MJXZERO, MJXTEX-S4;
-}
-
-.TEX-A {
-  font-family: MJXZERO, MJXTEX-A;
-}
-
-.TEX-C {
-  font-family: MJXZERO, MJXTEX-C;
-}
-
-.TEX-CB {
-  font-family: MJXZERO, MJXTEX-CB;
-}
-
-.TEX-FR {
-  font-family: MJXZERO, MJXTEX-FR;
-}
-
-.TEX-FRB {
-  font-family: MJXZERO, MJXTEX-FRB;
-}
-
-.TEX-SS {
-  font-family: MJXZERO, MJXTEX-SS;
-}
-
-.TEX-SSB {
-  font-family: MJXZERO, MJXTEX-SSB;
-}
-
-.TEX-SSI {
-  font-family: MJXZERO, MJXTEX-SSI;
-}
-
-.TEX-SC {
-  font-family: MJXZERO, MJXTEX-SC;
-}
-
-.TEX-T {
-  font-family: MJXZERO, MJXTEX-T;
-}
-
-.TEX-V {
-  font-family: MJXZERO, MJXTEX-V;
-}
-
-.TEX-VB {
-  font-family: MJXZERO, MJXTEX-VB;
-}
-
-mjx-stretchy-v mjx-c, mjx-stretchy-h mjx-c {
-  font-family: MJXZERO, MJXTEX-S1, MJXTEX-S4, MJXTEX, MJXTEX-A ! important;
-}
-
-@font-face /* 0 */ {
-  font-family: MJXZERO;
-  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Zero.woff") format("woff");
-}
-
-@font-face /* 1 */ {
-  font-family: MJXTEX;
-  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Main-Regular.woff") format("woff");
-}
-
-@font-face /* 2 */ {
-  font-family: MJXTEX-B;
-  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Main-Bold.woff") format("woff");
-}
-
-@font-face /* 3 */ {
-  font-family: MJXTEX-I;
-  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Math-Italic.woff") format("woff");
-}
-
-@font-face /* 4 */ {
-  font-family: MJXTEX-MI;
-  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Main-Italic.woff") format("woff");
-}
-
-@font-face /* 5 */ {
-  font-family: MJXTEX-BI;
-  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Math-BoldItalic.woff") format("woff");
-}
-
-@font-face /* 6 */ {
-  font-family: MJXTEX-S1;
-  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Size1-Regular.woff") format("woff");
-}
-
-@font-face /* 7 */ {
-  font-family: MJXTEX-S2;
-  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Size2-Regular.woff") format("woff");
-}
-
-@font-face /* 8 */ {
-  font-family: MJXTEX-S3;
-  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Size3-Regular.woff") format("woff");
-}
-
-@font-face /* 9 */ {
-  font-family: MJXTEX-S4;
-  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Size4-Regular.woff") format("woff");
-}
-
-@font-face /* 10 */ {
-  font-family: MJXTEX-A;
-  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_AMS-Regular.woff") format("woff");
-}
-
-@font-face /* 11 */ {
-  font-family: MJXTEX-C;
-  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Calligraphic-Regular.woff") format("woff");
-}
-
-@font-face /* 12 */ {
-  font-family: MJXTEX-CB;
-  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Calligraphic-Bold.woff") format("woff");
-}
-
-@font-face /* 13 */ {
-  font-family: MJXTEX-FR;
-  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Fraktur-Regular.woff") format("woff");
-}
-
-@font-face /* 14 */ {
-  font-family: MJXTEX-FRB;
-  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Fraktur-Bold.woff") format("woff");
-}
-
-@font-face /* 15 */ {
-  font-family: MJXTEX-SS;
-  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_SansSerif-Regular.woff") format("woff");
-}
-
-@font-face /* 16 */ {
-  font-family: MJXTEX-SSB;
-  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_SansSerif-Bold.woff") format("woff");
-}
-
-@font-face /* 17 */ {
-  font-family: MJXTEX-SSI;
-  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_SansSerif-Italic.woff") format("woff");
-}
-
-@font-face /* 18 */ {
-  font-family: MJXTEX-SC;
-  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Script-Regular.woff") format("woff");
-}
-
-@font-face /* 19 */ {
-  font-family: MJXTEX-T;
-  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Typewriter-Regular.woff") format("woff");
-}
-
-@font-face /* 20 */ {
-  font-family: MJXTEX-V;
-  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Vector-Regular.woff") format("woff");
-}
-
-@font-face /* 21 */ {
-  font-family: MJXTEX-VB;
-  src: url("https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/output/chtml/fonts/woff-v2/MathJax_Vector-Bold.woff") format("woff");
-}
-</style><link rel="stylesheet" href="https://badge.dimensions.ai/badge.css"></head> <body class="fixed-top-nav " data-new-gr-c-s-check-loaded="14.1196.0" data-gr-ext-installed="" style="padding-top: 57px;"> <header> <nav id="navbar" class="navbar navbar-light navbar-expand-sm fixed-top"> <div class="container"> <a class="navbar-brand title font-weight-lighter" href="/"><span class="font-weight-bold">Marco&nbsp;</span>Perini</a> <button class="navbar-toggler collapsed ml-auto" type="button" data-toggle="collapse" data-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation"> <span class="sr-only">Toggle navigation</span> <span class="icon-bar top-bar"></span> <span class="icon-bar middle-bar"></span> <span class="icon-bar bottom-bar"></span> </button> <div class="collapse navbar-collapse text-right" id="navbarNav"> <ul class="navbar-nav ml-auto flex-nowrap"> <li class="nav-item "> <a class="nav-link" href="/">About</a> </li> <li class="nav-item dropdown active"> <a class="nav-link dropdown-toggle" href="#" id="navbarDropdown" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Projects<span class="sr-only">(current)</span></a> <div class="dropdown-menu dropdown-menu-right" aria-labelledby="navbarDropdown"> <a class="dropdown-item" href="/projects/">Projects</a> <div class="dropdown-divider"></div> <a class="dropdown-item" href="/competitions/">Competitions</a> </div> </li> <li class="nav-item "> <a class="nav-link" href="/cv/">CV</a> </li> <li class="toggle-container"> <button id="light-toggle" title="Change theme"> <i class="fa-solid fa-moon"></i> <i class="fa-solid fa-sun"></i> </button> </li> </ul> </div> </div> </nav> <progress id="progress" value="40" max="140" style="top: 57px;"> <div class="progress-container"> <span class="progress-bar"></span> </div> </progress> </header> <div class="container mt-5"> <div class="post"> <header class="post-header"> <h1 class="post-title">Projects</h1> <p 
class="post-description"></p> </header> <article> <div class="projects"> <div class="grid" style="position: relative; height: 803.703px;"> <div class="grid-sizer"></div> <div class="grid-item" style="position: absolute; left: 0px; top: 0px;"> <a href="/projects/rotary-pendulum-rl/"> <div class="card hoverable"> <figure> <picture> <source class="responsive-img-srcset" media="(max-width: 480px)" srcset="/assets/img/rotary_pybullet-480.webp"> <source class="responsive-img-srcset" media="(max-width: 800px)" srcset="/assets/img/rotary_pybullet-800.webp"> <source class="responsive-img-srcset" media="(max-width: 1400px)" srcset="/assets/img/rotary_pybullet-1400.webp"> <img src="/assets/img/rotary_pybullet.jpg" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture> </figure> <div class="card-body"> <h4 class="card-title">Rotary Pendulum RL</h4> <p class="card-text">Open Source project aimed at controlling a real life rotary pendulum using RL algorithms</p> <div class="row ml-1 mr-1 p-0"> </div> </div> </div> </a> </div> <div class="grid-sizer"></div> <div class="grid-item" style="position: absolute; left: 260px; top: 0px;"> <a href="https://github.com/PeriniM/DQN-SwingUp" rel="external nofollow noopener" target="_blank"> <div class="card hoverable"> <figure> <picture> <source class="responsive-img-srcset" media="(max-width: 480px)" srcset="/assets/img/value-policy-heatmaps-480.webp"> <source class="responsive-img-srcset" media="(max-width: 800px)" srcset="/assets/img/value-policy-heatmaps-800.webp"> <source class="responsive-img-srcset" media="(max-width: 1400px)" srcset="/assets/img/value-policy-heatmaps-1400.webp"> <img src="/assets/img/value-policy-heatmaps.jpg" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture> </figure> <div class="card-body"> <h4 class="card-title">DQN Implementation from scratch</h4> <p 
class="card-text">Developed a Deep Q-Network algorithm to train a simple and double pendulum</p> <div class="row ml-1 mr-1 p-0"> </div> </div> </div> </a> </div> <div class="grid-sizer"></div> <div class="grid-item" style="position: absolute; left: 520px; top: 0px;"> <a href="https://github.com/PeriniM/Multi-Agents-HAED" rel="external nofollow noopener" target="_blank"> <div class="card hoverable"> <figure> <picture> <source class="responsive-img-srcset" media="(max-width: 480px)" srcset="/assets/img/multi_agents_haed.gif-480.webp"> <source class="responsive-img-srcset" media="(max-width: 800px)" srcset="/assets/img/multi_agents_haed.gif-800.webp"> <source class="responsive-img-srcset" media="(max-width: 1400px)" srcset="/assets/img/multi_agents_haed.gif-1400.webp"> <img src="/assets/img/multi_agents_haed.gif" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture> </figure> <div class="card-body"> <h4 class="card-title">Multi Agents HAED</h4> <p class="card-text">University project which focuses on simulating a multi-agent system to perform environment mapping. 
Agents, equipped with sensors, explore and record their surroundings, considering uncertainties in their readings.</p> <div class="row ml-1 mr-1 p-0"> </div> </div> </div> </a> </div> <div class="grid-sizer"></div> <div class="grid-item" style="position: absolute; left: 0px; top: 447.453px;"> <a href="/projects/wireless-esc-drone/"> <div class="card hoverable"> <figure> <picture> <source class="responsive-img-srcset" media="(max-width: 480px)" srcset="/assets/img/wireless_esc.gif-480.webp"> <source class="responsive-img-srcset" media="(max-width: 800px)" srcset="/assets/img/wireless_esc.gif-800.webp"> <source class="responsive-img-srcset" media="(max-width: 1400px)" srcset="/assets/img/wireless_esc.gif-1400.webp"> <img src="/assets/img/wireless_esc.gif" width="auto" height="auto" alt="project thumbnail" onerror="this.onerror=null; $('.responsive-img-srcset').remove();"> </picture> </figure> <div class="card-body"> <h4 class="card-title">Wireless ESC for Modular Drones</h4> <p class="card-text">Modular drone architecture proposal and proof of concept. The project received maximum grade.</p> <div class="row ml-1 mr-1 p-0"> </div> </div> </div> </a> </div> </div> </div> </article> </div> </div> <footer class="fixed-bottom"> <div class="container mt-0"> © Copyright 2023 Marco Perini. Powered by <a href="https://jekyllrb.com/" target="_blank" rel="external nofollow noopener">Jekyll</a> with <a href="https://github.com/alshedivat/al-folio" rel="external nofollow noopener" target="_blank">al-folio</a> theme. Hosted by <a href="https://pages.github.com/" target="_blank" rel="external nofollow noopener">GitHub Pages</a>. 
</div> </footer> <script src="https://cdn.jsdelivr.net/npm/jquery@3.6.0/dist/jquery.min.js" integrity="sha256-/xUj+3OJU5yExlq6GSYGSHk7tPXikynS7ogEvDej/m4=" crossorigin="anonymous"></script> <script src="/assets/js/bootstrap.bundle.min.js"></script> <script src="https://cdn.jsdelivr.net/npm/mdbootstrap@4.20.0/js/mdb.min.js" integrity="sha256-NdbiivsvWt7VYCt6hYNT3h/th9vSTL4EDWeGs5SN3DA=" crossorigin="anonymous"></script> <script defer="" src="https://cdn.jsdelivr.net/npm/masonry-layout@4.2.2/dist/masonry.pkgd.min.js" integrity="sha256-Nn1q/fx0H7SNLZMQ5Hw5JLaTRZp0yILA/FRexe19VdI=" crossorigin="anonymous"></script> <script defer="" src="https://cdn.jsdelivr.net/npm/imagesloaded@4/imagesloaded.pkgd.min.js"></script> <script defer="" src="/assets/js/masonry.js" type="text/javascript"></script> <script defer="" src="https://cdn.jsdelivr.net/npm/medium-zoom@1.0.8/dist/medium-zoom.min.js" integrity="sha256-7PhEpEWEW0XXQ0k6kQrPKwuoIomz8R8IYyuU1Qew4P8=" crossorigin="anonymous"></script> <script defer="" src="/assets/js/zoom.js?7b30caa5023af4af8408a472dc4e1ebb"></script> <script defer="" src="https://unpkg.com/bootstrap-table@1.22.1/dist/bootstrap-table.min.js"></script> <script src="/assets/js/no_defer.js?d633890033921b33e0ceb13d22340a9c"></script> <script defer="" src="/assets/js/common.js?acdb9690d7641b2f8d40529018c71a01"></script> <script defer="" src="/assets/js/copy_code.js?07b8786bab9b4abe90d10e61f7d12ff7" type="text/javascript"></script> <script async="" src="https://d1bxh8uas1mnw7.cloudfront.net/assets/embed.js"></script> <script async="" src="https://badge.dimensions.ai/badge.js"></script> <script type="text/javascript">window.MathJax={tex:{tags:"ams"}};</script> <script defer="" type="text/javascript" id="MathJax-script" src="https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/tex-mml-chtml.js"></script> <script defer="" src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script> <script async="" 
src="https://www.googletagmanager.com/gtag/js?id=G-W66CHGTB05"></script> <script>function gtag(){window.dataLayer.push(arguments)}window.dataLayer=window.dataLayer||[],gtag("js",new Date),gtag("config","G-W66CHGTB05");</script> <script type="text/javascript">function progressBarSetup(){"max"in document.createElement("progress")?(initializeProgressElement(),$(document).on("scroll",function(){progressBar.attr({value:getCurrentScrollPosition()})}),$(window).on("resize",initializeProgressElement)):(resizeProgressBar(),$(document).on("scroll",resizeProgressBar),$(window).on("resize",resizeProgressBar))}function getCurrentScrollPosition(){return $(window).scrollTop()}function initializeProgressElement(){let e=$("#navbar").outerHeight(!0);$("body").css({"padding-top":e}),$("progress-container").css({"padding-top":e}),progressBar.css({top:e}),progressBar.attr({max:getDistanceToScroll(),value:getCurrentScrollPosition()})}function getDistanceToScroll(){return $(document).height()-$(window).height()}function resizeProgressBar(){progressBar.css({width:getWidthPercentage()+"%"})}function getWidthPercentage(){return getCurrentScrollPosition()/getDistanceToScroll()*100}const progressBar=$("#progress");window.onload=function(){setTimeout(progressBarSetup,50)};</script>  <div class="hiddendiv common"></div><iframe id="LgIoc6pK" frameborder="0" src="chrome-extension://ekhagklcjbdpajgpjgmbionohlpdbjgc/translateSandbox/translateSandbox.html" style="width: 0px; height: 0px; display: none;"></iframe></body><grammarly-desktop-integration data-grammarly-shadow-root="true"></grammarly-desktop-integration></html>
-"""
-
-def extract_data(html: str) -> dict:
-    from bs4 import BeautifulSoup
-    import re
-
-    # Parse the HTML content using BeautifulSoup
-    soup = BeautifulSoup(html, 'html.parser')
-
-    # Initialize the list to hold project data
-    projects = []
-
-    # Find all anchor tags that contain project information
-    for a_tag in soup.find_all('a', href=True):
-        # Extract the text content of the anchor tag
-        text_content = a_tag.get_text(strip=True)
-        
-        # Use regex to split the text content into title and description
-        match = re.match(r'(.+?)\s(.+)', text_content)
-        if match:
-            title = match.group(1)
-            description = match.group(2)
-            
-            # Append the project data to the list
-            projects.append({
-                'title': title,
-                'description': description
-            })
-
-    # Structure the data according to the specified JSON schema
-    result = {
-        'title': 'Projects',
-        'type': 'object',
-        'properties': {
-            'projects': {
-                'title': 'Projects',
-                'type': 'array',
-                'items': projects
-            }
-        },
-        'required': ['projects'],
-        'definitions': {
-            'Project': {
-                'title': 'Project',
-                'type': 'object',
-                'properties': {
-                    'title': {
-                        'title': 'Title',
-                        'description': 'The title of the project',
-                        'type': 'string'
-                    },
-                    'description': {
-                        'title': 'Description',
-                        'description': 'The description of the project',
-                        'type': 'string'
-                    }
-                },
-                'required': ['title', 'description']
-            }
-        }
-    }
-
-    return result
-
-def create_sandbox_and_execute(function_code, html_doc):
-        # Create a sandbox environment
-        sandbox_globals = {
-            'BeautifulSoup': BeautifulSoup,
-            're': re,
-            '__builtins__': __builtins__,
-        }
-        
-        # Capture stdout
-        old_stdout = sys.stdout
-        sys.stdout = StringIO()
-        
-        try:
-            # Execute the extract_data function with the provided HTML
-            result = extract_data(html_doc)
-            
-            return True, result
-        except Exception as e:
-            return False, f"Error during execution: {str(e)}"
-        finally:
-            # Restore stdout
-            sys.stdout = old_stdout
-            
-            
-#execution_success, execution_result = create_sandbox_and_execute(generated_code, html)
-from langchain_core.pydantic_v1 import BaseModel, Field
-from typing import List
-
-class Project(BaseModel):
-    title: str = Field(description="The title of the project")
-    description: str = Field(description="The description of the project")
-
-class Projects(BaseModel):
-    projects: List[Project]
-
-
-def transform_schema(pydantic_schema):
-    def process_properties(properties):
-        result = {}
-        for key, value in properties.items():
-            if 'type' in value:
-                if value['type'] == 'array':
-                    if '$ref' in value['items']:
-                        ref_key = value['items']['$ref'].split('/')[-1]
-                        result[key] = [process_properties(pydantic_schema['definitions'][ref_key]['properties'])]
-                    else:
-                        result[key] = [value['items']['type']]
-                else:
-                    result[key] = {
-                        "type": value['type'],
-                        "description": value.get('description', '')
-                    }
-            elif '$ref' in value:
-                ref_key = value['$ref'].split('/')[-1]
-                result[key] = process_properties(pydantic_schema['definitions'][ref_key]['properties'])
-        return result
-
-    return process_properties(pydantic_schema['properties'])
-
-
-data_schema = Projects.schema()
-transformed_schema = transform_schema(data_schema)
-print(transformed_schema)
\ No newline at end of file
diff --git a/scrapegraphai/code_gen/html_reduce.py b/scrapegraphai/code_gen/html_reduce.py
deleted file mode 100644
index a25cd5d8..00000000
--- a/scrapegraphai/code_gen/html_reduce.py
+++ /dev/null
@@ -1,85 +0,0 @@
-import re
-from bs4 import BeautifulSoup, Comment
-
-
-def minify_html(html):
-    # Remove comments
-    html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)
-    
-    # Remove whitespace between tags
-    html = re.sub(r'>\s+<', '><', html)
-    
-    # Remove whitespace at the beginning and end of tags
-    html = re.sub(r'\s+>', '>', html)
-    html = re.sub(r'<\s+', '<', html)
-    
-    # Collapse multiple whitespace characters into a single space
-    html = re.sub(r'\s+', ' ', html)
-    
-    # Remove spaces around equals signs in attributes
-    html = re.sub(r'\s*=\s*', '=', html)
-    
-    return html.strip()
-
-def reduce_html(html, reduction):
-    """
-    html: str, the HTML content to reduce
-    reduction: 0: minification only,
-               1: minification and removig unnecessary tags and attributes,
-               2: minification, removig unnecessary tags and attributes, simplifying text content, removing of the head tag
-    
-    
-    """
-    if reduction == 0:
-        return minify_html(html)
-    
-    soup = BeautifulSoup(html, 'html.parser')
-    
-    # Remove comments
-    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
-        comment.extract()
-    
-    # Remove script and style tag contents, but keep the tags
-    for tag in soup(['script', 'style']):
-        tag.string = ""
-    
-    # Remove unnecessary attributes, but keep class and id
-    attrs_to_keep = ['class', 'id', 'href', 'src']
-    for tag in soup.find_all(True):
-        for attr in list(tag.attrs):
-            if attr not in attrs_to_keep:
-                del tag[attr]
-                
-    if reduction == 1:
-        return minify_html(str(soup))
-    
-    # Remove script and style tags completely
-    for tag in soup(['script', 'style']):
-        tag.decompose()
-    
-    # Focus only on the body
-    body = soup.body
-    if not body:
-        return "No <body> tag found in the HTML"
-    
-    # Simplify text content
-    for tag in body.find_all(string=True):
-        if tag.parent.name not in ['script', 'style']:
-            tag.replace_with(re.sub(r'\s+', ' ', tag.strip())[:20])
-    
-    # Generate reduced HTML
-    reduced_html = str(body)
-    
-    # Apply minification
-    reduced_html = minify_html(reduced_html)
-    
-    return reduced_html
-
-# Get string with html from example.html
-html = open('example_1.html').read()
-
-reduced_html = reduce_html(html, 2)
-
-# Print the reduced html in result.html
-with open('result_1.html', 'w') as f:
-    f.write(reduced_html)
\ No newline at end of file
diff --git a/scrapegraphai/code_gen/script_genarated.py b/scrapegraphai/code_gen/script_genarated.py
deleted file mode 100644
index ee2d9fc3..00000000
--- a/scrapegraphai/code_gen/script_genarated.py
+++ /dev/null
@@ -1,64 +0,0 @@
-from bs4 import BeautifulSoup
-import re
-
-def extract_book_info(html_string):
-    """
-    Extracts book information (title, author, publication date, publisher) from an HTML string using BeautifulSoup.
-
-    Args:
-        html_string: The HTML content as a string.
-
-    Returns:
-        A dictionary containing the extracted book information in the desired JSON schema format.
-    """
-
-    soup = BeautifulSoup(html_string, 'html.parser')
-
-    # Find all book listings
-    book_listings = soup.find_all('div', class_='cc-product-list-item')
-
-    books_data = []
-    for listing in book_listings:
-        # Extract title
-        title_elem = listing.find('a', class_='cc-title')
-        title = title_elem.text.strip() if title_elem else None
-
-        # Extract author
-        author_elem = listing.find('div', class_='cc-author').find('a', class_='cc-author-name')
-        author = author_elem.text.strip() if author_elem else None
-
-        # Extract publisher and publication date
-        publisher_info_elem = listing.find('span', class_='cc-publisher')
-        publisher_info_text = publisher_info_elem.text.strip() if publisher_info_elem else None
-
-        if publisher_info_text:
-            # Assuming publisher name is linked and publication date is the remaining text
-            publisher_elem = publisher_info_elem.find('a', class_='cc-publisher-name')
-            publisher = publisher_elem.text.strip() if publisher_elem else None
-
-            # Use regex to extract year (assuming 4-digit year format)
-            publication_date_match = re.search(r'\b(\d{4})\b', publisher_info_text)
-            publication_date = publication_date_match.group(1) if publication_date_match else None
-        else:
-            publisher = None
-            publication_date = None
-
-        # Create a book dictionary and append to the list
-        book_data = {
-            "title": title,
-            "author": author,
-            "publication_date": publication_date,
-            "publisher": publisher
-        }
-        books_data.append(book_data)
-
-    # Structure the output according to the JSON schema
-    output = {
-        "books": books_data
-    }
-
-    return output
-
-html = open('example_1.html').read()
-result = extract_book_info(html)
-print(result)
\ No newline at end of file
diff --git a/scrapegraphai/code_gen/script_runner.py b/scrapegraphai/code_gen/script_runner.py
deleted file mode 100644
index e69de29b..00000000

From 2ff0f0113ff0b429a0bb56d8b057e7660e148a1e Mon Sep 17 00:00:00 2001
From: Matteo Vedovati <matteo.vedovati.77@gmail.com>
Date: Sat, 21 Sep 2024 18:46:16 +0200
Subject: [PATCH 16/27] Added logs

---
 scrapegraphai/graphs/code_generator_graph.py |  6 ++-
 scrapegraphai/nodes/generate_code_node.py    | 26 +++++++++--
 scrapegraphai/nodes/html_analyzer_node.py    |  6 +--
 scrapegraphai/nodes/prompt_refiner_node.py   | 47 +++++++++-----------
 4 files changed, 50 insertions(+), 35 deletions(-)

diff --git a/scrapegraphai/graphs/code_generator_graph.py b/scrapegraphai/graphs/code_generator_graph.py
index 6eaa05af..5da82794 100644
--- a/scrapegraphai/graphs/code_generator_graph.py
+++ b/scrapegraphai/graphs/code_generator_graph.py
@@ -17,8 +17,10 @@ from ..nodes import (
 
 class CodeGeneratorGraph(AbstractGraph):
     """
-    ...
-
+    CodeGeneratorGraph is a script generator pipeline that generates the function extract_data(html: str) -> dict() for 
+    extarcting the wanted informations from a HTML page. The code generated is in Python and uses the library BeautifulSoup.
+    It requires a user prompt, a source URL, and a output schema.
+    
     Attributes:
         prompt (str): The prompt for the graph.
         source (str): The source of the graph.
diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py
index 1fef3c5c..891f1a27 100644
--- a/scrapegraphai/nodes/generate_code_node.py
+++ b/scrapegraphai/nodes/generate_code_node.py
@@ -26,7 +26,7 @@ import string
 
 class GenerateCodeNode(BaseNode):
     """
-    ...
+    A node that generates Python code for a function that extracts data from HTML based on a output schema.
 
     Attributes:
         llm_model: An instance of a language model client, configured for generating answers.
@@ -80,7 +80,7 @@ class GenerateCodeNode(BaseNode):
 
     def execute(self, state: dict) -> dict:
         """
-        ...
+        Generates Python code for a function that extracts data from HTML based on a output schema.
 
         Args:
             state (dict): The current state of the graph. The input keys will be used
@@ -92,6 +92,7 @@ class GenerateCodeNode(BaseNode):
         Raises:
             KeyError: If the input keys are not found in the state, indicating
                       that the necessary information for generating an answer is missing.
+            RuntimeError: If the maximum number of iterations is reached without obtaining the desired code.
         """
         
         self.logger.info(f"--- Executing {self.node_name} Node ---")
@@ -135,25 +136,31 @@ class GenerateCodeNode(BaseNode):
         return state
     
     def overall_reasoning_loop(self, state: dict) -> dict:
-        
+        self.logger.info(f"--- (Generating Code) ---")
         state["generated_code"] = self.generate_initial_code(state)
         state["generated_code"] = self.extract_code(state["generated_code"])
         
         while state["iteration"] < self.max_iterations["overall"]:
             state["iteration"] += 1
+            if self.verbose:
+                self.logger.info(f"--- Iteration {state['iteration']} ---")
             
+            self.logger.info(f"--- (Checking Code Syntax) ---")
             state = self.syntax_reasoning_loop(state)
             if state["errors"]["syntax"]:
                 continue
             
+            self.logger.info(f"--- (Executing the Generated Code) ---")
             state = self.execution_reasoning_loop(state)
             if state["errors"]["execution"]:
                 continue
             
+            self.logger.info(f"--- (Validate the Code Output Schema) ---")
             state = self.validation_reasoning_loop(state)
             if state["errors"]["validation"]:
                 continue
             
+            self.logger.info(f"--- (Checking if the informations exctrcated are the ones Requested) ---")
             state = self.semantic_comparison_loop(state)
             if state["errors"]["semantic"]:
                 continue
@@ -161,6 +168,11 @@ class GenerateCodeNode(BaseNode):
             # If we've made it here, the code is valid and produces the correct output
             break
         
+        if state["iteration"] == self.max_iterations["overall"] and (state["errors"]["syntax"] or state["errors"]["execution"] or state["errors"]["validation"] or state["errors"]["semantic"]):
+            raise RuntimeError("Max iterations reached without obtaining the desired code.")
+        
+        self.logger.info(f"--- (Code Generated Correctly) ---")
+        
         return state
     
     def syntax_reasoning_loop(self, state: dict) -> dict:
@@ -171,7 +183,9 @@ class GenerateCodeNode(BaseNode):
                 return state
             
             state["errors"]["syntax"] = [syntax_message]
+            self.logger.info(f"--- (Synax Error Found: {syntax_message}) ---")
             analysis = self.syntax_focused_analysis(state)
+            self.logger.info(f"--- (Regenerating Code to fix the Error) ---")
             state["generated_code"] = self.syntax_focused_code_generation(state, analysis)
             state["generated_code"] = self.extract_code(state["generated_code"])
         return state
@@ -185,7 +199,9 @@ class GenerateCodeNode(BaseNode):
                 return state
             
             state["errors"]["execution"] = [execution_result]
+            self.logger.info(f"--- (Code Execution Error: {execution_result}) ---")
             analysis = self.execution_focused_analysis(state)
+            self.logger.info(f"--- (Regenerating Code to fix the Error) ---")
             state["generated_code"] = self.execution_focused_code_generation(state, analysis)
             state["generated_code"] = self.extract_code(state["generated_code"])
         return state
@@ -198,7 +214,9 @@ class GenerateCodeNode(BaseNode):
                 return state
             
             state["errors"]["validation"] = errors
+            self.logger.info(f"--- (Code Output not compliant to the deisred Output Schema) ---")
             analysis = self.validation_focused_analysis(state)
+            self.logger.info(f"--- (Regenerating Code to make the Output compliant to the deisred Output Schema) ---")
             state["generated_code"] = self.validation_focused_code_generation(state, analysis)
             state["generated_code"] = self.extract_code(state["generated_code"])
         return state
@@ -211,7 +229,9 @@ class GenerateCodeNode(BaseNode):
                 return state
             
             state["errors"]["semantic"] = comparison_result["differences"]
+            self.logger.info(f"--- (The informations exctrcated are not the all ones requested) ---")
             analysis = self.semantic_focused_analysis(state, comparison_result)
+            self.logger.info(f"--- (Regenerating Code to obtain all the infromation requested) ---")
             state["generated_code"] = self.semantic_focused_code_generation(state, analysis)
             state["generated_code"] = self.extract_code(state["generated_code"])
         return state
diff --git a/scrapegraphai/nodes/html_analyzer_node.py b/scrapegraphai/nodes/html_analyzer_node.py
index cc8b4106..46da8e95 100644
--- a/scrapegraphai/nodes/html_analyzer_node.py
+++ b/scrapegraphai/nodes/html_analyzer_node.py
@@ -16,8 +16,8 @@ from ..utils import reduce_html
 
 class HtmlAnalyzerNode(BaseNode):
     """
-    ...
-
+    A node that generates an analysis of the provided HTML code based on the wanted infromations to be extracted.
+    
     Attributes:
         llm_model: An instance of a language model client, configured for generating answers.
         verbose (bool): A flag indicating whether to show print statements during execution.
@@ -60,7 +60,7 @@ class HtmlAnalyzerNode(BaseNode):
 
     def execute(self, state: dict) -> dict:
         """
-        ...
+        Generates an analysis of the provided HTML code based on the wanted infromations to be extracted.
 
         Args:
             state (dict): The current state of the graph. The input keys will be used
diff --git a/scrapegraphai/nodes/prompt_refiner_node.py b/scrapegraphai/nodes/prompt_refiner_node.py
index 5aa93ba0..88fd9dad 100644
--- a/scrapegraphai/nodes/prompt_refiner_node.py
+++ b/scrapegraphai/nodes/prompt_refiner_node.py
@@ -59,6 +59,8 @@ class PromptRefinerNode(BaseNode):
         )
 
         self.additional_info = node_config.get("additional_info")
+        
+        self.output_schema = node_config.get("schema") #          get JSON output schema
 
     def execute(self, state: dict) -> dict:
         """
@@ -137,33 +139,24 @@ class PromptRefinerNode(BaseNode):
 
         user_prompt = state['user_prompt'] #                            get user prompt
 
-        if self.node_config.get("schema", None) is not None:
+        self.simplefied_schema = transform_schema(self.output_schema.schema()) #             get JSON schema
+        
+        if self.additional_info is not None: #                      use additional context if present
+            prompt = PromptTemplate(
+                template=template_prompt_builder_with_context,
+                partial_variables={"user_input": user_prompt,
+                                    "json_schema": str(self.simplefied_schema),
+                                    "additional_context": self.additional_info})
+        else:
+            prompt = PromptTemplate(
+                template=template_prompt_builder,
+                partial_variables={"user_input": user_prompt,
+                                    "json_schema": str(self.simplefied_schema)})
 
-            self.simplefied_schema = transform_schema(self.node_config["schema"].schema()) #             get JSON schema
-            
-            if self.additional_info is not None: #                      use additional context if present
-                prompt = PromptTemplate(
-                    template=template_prompt_builder_with_context,
-                    partial_variables={"user_input": user_prompt,
-                                        "json_schema": str(self.simplefied_schema),
-                                        "additional_context": self.additional_info})
-            else:
-                prompt = PromptTemplate(
-                    template=template_prompt_builder,
-                    partial_variables={"user_input": user_prompt,
-                                        "json_schema": str(self.simplefied_schema)})
+        output_parser = StrOutputParser()
 
-            output_parser = StrOutputParser()
+        chain =  prompt | self.llm_model | output_parser
+        refined_prompt = chain.invoke({})
 
-            chain =  prompt | self.llm_model | output_parser
-            refined_prompt = chain.invoke({})
-
-            state.update({self.output[0]: refined_prompt})
-            return state
-
-        else: #                                                no schema provided
-            self.logger.error("No schema provided for prompt refinement.")
-            
-            # TODO: Handle the case where no schema is provided => error handling
-            
-            return state
+        state.update({self.output[0]: refined_prompt})
+        return state

From 657ef711f75d38f33c829922d12d9c61404a066f Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra <mvincig11@gmail.com>
Date: Tue, 24 Sep 2024 12:06:45 +0200
Subject: [PATCH 17/27] raise keyerror exception for the schema

---
 scrapegraphai/graphs/code_generator_graph.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scrapegraphai/graphs/code_generator_graph.py b/scrapegraphai/graphs/code_generator_graph.py
index 5da82794..ca763896 100644
--- a/scrapegraphai/graphs/code_generator_graph.py
+++ b/scrapegraphai/graphs/code_generator_graph.py
@@ -63,6 +63,9 @@ class CodeGeneratorGraph(AbstractGraph):
             BaseGraph: A graph instance representing the web scraping workflow.
         """
         
+        if self.schema is None: 
+            raise KeyError("The schema is required for CodeGeneratorGraph")
+
         fetch_node = FetchNode(
             input="url| local_dir",
             output=["doc"],

From 36a8a1c87295e777ae701509265587cb215812c5 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra <mvincig11@gmail.com>
Date: Tue, 24 Sep 2024 12:36:10 +0200
Subject: [PATCH 18/27] refining and refactoring of the code

---
 scrapegraphai/nodes/generate_code_node.py  |  7 ++--
 scrapegraphai/nodes/html_analyzer_node.py  |  3 --
 scrapegraphai/nodes/prompt_refiner_node.py | 38 ++++++++++++----------
 3 files changed, 22 insertions(+), 26 deletions(-)

diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py
index 891f1a27..6032fa0f 100644
--- a/scrapegraphai/nodes/generate_code_node.py
+++ b/scrapegraphai/nodes/generate_code_node.py
@@ -1,14 +1,12 @@
 """
 GenerateCodeNode Module
 """
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional
 from langchain.prompts import PromptTemplate
 from langchain.output_parsers import ResponseSchema, StructuredOutputParser
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnableParallel
 from langchain_core.utils.pydantic import is_basemodel_subclass
-from langchain_openai import ChatOpenAI, AzureChatOpenAI
-from langchain_mistralai import ChatMistralAI
 from langchain_community.chat_models import ChatOllama
 import ast
 import sys
@@ -23,7 +21,6 @@ from jsonschema import validate, ValidationError
 import json
 import string
 
-
 class GenerateCodeNode(BaseNode):
     """
     A node that generates Python code for a function that extracts data from HTML based on a output schema.
@@ -650,4 +647,4 @@ def normalize_dict(d: dict) -> dict:
 
 def are_content_equal(generated_result: dict, reference_result: dict) -> bool:
     # Normalize both dictionaries and compare
-    return normalize_dict(generated_result) == normalize_dict(reference_result)
\ No newline at end of file
+    return normalize_dict(generated_result) == normalize_dict(reference_result)
diff --git a/scrapegraphai/nodes/html_analyzer_node.py b/scrapegraphai/nodes/html_analyzer_node.py
index 46da8e95..47e65437 100644
--- a/scrapegraphai/nodes/html_analyzer_node.py
+++ b/scrapegraphai/nodes/html_analyzer_node.py
@@ -6,14 +6,11 @@ from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnableParallel
 from langchain_core.utils.pydantic import is_basemodel_subclass
-from langchain_openai import ChatOpenAI, AzureChatOpenAI
-from langchain_mistralai import ChatMistralAI
 from langchain_community.chat_models import ChatOllama
 from tqdm import tqdm
 from .base_node import BaseNode
 from ..utils import reduce_html
 
-
 class HtmlAnalyzerNode(BaseNode):
     """
     A node that generates an analysis of the provided HTML code based on the wanted infromations to be extracted.
diff --git a/scrapegraphai/nodes/prompt_refiner_node.py b/scrapegraphai/nodes/prompt_refiner_node.py
index 88fd9dad..50e5f76e 100644
--- a/scrapegraphai/nodes/prompt_refiner_node.py
+++ b/scrapegraphai/nodes/prompt_refiner_node.py
@@ -13,7 +13,6 @@ from tqdm import tqdm
 from .base_node import BaseNode
 from ..utils import transform_schema
 
-
 class PromptRefinerNode(BaseNode):
     """
     A node that refine the user prompt with the use of the schema and additional context and
@@ -60,7 +59,7 @@ class PromptRefinerNode(BaseNode):
 
         self.additional_info = node_config.get("additional_info")
         
-        self.output_schema = node_config.get("schema") #          get JSON output schema
+        self.output_schema = node_config.get("schema")
 
     def execute(self, state: dict) -> dict:
         """
@@ -79,7 +78,9 @@ class PromptRefinerNode(BaseNode):
         """
 
         template_prompt_builder = """
-        **Task**: Analyze the user's request and the provided JSON schema to clearly map the desired data extraction. Break down the user's request into key components, and then explicitly connect these components to the corresponding elements within the JSON schema.
+        **Task**: Analyze the user's request and the provided JSON schema to clearly map the desired data extraction.\n
+        Break down the user's request into key components, and then explicitly connect these components to the 
+        corresponding elements within the JSON schema.
 
         **User's Request**:
         {user_input}
@@ -91,22 +92,23 @@ class PromptRefinerNode(BaseNode):
 
         **Analysis Instructions**:
         1. **Break Down User Request:** 
-        * Clearly identify the core entities or data types the user is asking for.
-        * Highlight any specific attributes or relationships mentioned in the request.
+        * Clearly identify the core entities or data types the user is asking for.\n
+        * Highlight any specific attributes or relationships mentioned in the request.\n
 
         2. **Map to JSON Schema**:
-        * For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.
+        * For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n
         * Explain how the schema structure accommodates the user's needs.
-        * If applicable, mention any schema elements that are not directly addressed in the user's request.
+        * If applicable, mention any schema elements that are not directly addressed in the user's request.\n
 
-        This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.
+        This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.\n
         Please generate only the analysis and no other text.
 
         **Response**:
         """
         
         template_prompt_builder_with_context = """
-        **Task**: Analyze the user's request, the provided JSON schema, and the additional context the user provided to clearly map the desired data extraction. Break down the user's request into key components, and then explicitly connect these components to the corresponding elements within the JSON schema.
+        **Task**: Analyze the user's request, the provided JSON schema, and the additional context the user provided to clearly map the desired data extraction.\n
+        Break down the user's request into key components, and then explicitly connect these components to the corresponding elements within the JSON schema.\n
         
         **User's Request**:
         {user_input}
@@ -121,15 +123,15 @@ class PromptRefinerNode(BaseNode):
 
         **Analysis Instructions**:
         1. **Break Down User Request:** 
-        * Clearly identify the core entities or data types the user is asking for.
-        * Highlight any specific attributes or relationships mentioned in the request.
+        * Clearly identify the core entities or data types the user is asking for.\n
+        * Highlight any specific attributes or relationships mentioned in the request.\n
 
         2. **Map to JSON Schema**:
-        * For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.
-        * Explain how the schema structure accommodates the user's needs.
-        * If applicable, mention any schema elements that are not directly addressed in the user's request.
+        * For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n
+        * Explain how the schema structure accommodates the user's needs.\n
+        * If applicable, mention any schema elements that are not directly addressed in the user's request.\n
 
-        This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.
+        This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.\n
         Please generate only the analysis and no other text.
 
         **Response**:
@@ -137,11 +139,11 @@ class PromptRefinerNode(BaseNode):
         
         self.logger.info(f"--- Executing {self.node_name} Node ---")
 
-        user_prompt = state['user_prompt'] #                            get user prompt
+        user_prompt = state['user_prompt']
 
-        self.simplefied_schema = transform_schema(self.output_schema.schema()) #             get JSON schema
+        self.simplefied_schema = transform_schema(self.output_schema.schema())
         
-        if self.additional_info is not None: #                      use additional context if present
+        if self.additional_info is not None:
             prompt = PromptTemplate(
                 template=template_prompt_builder_with_context,
                 partial_variables={"user_input": user_prompt,

From df907707792e2230e00af91a26d33d3266ede186 Mon Sep 17 00:00:00 2001
From: Matteo Vedovati <matteo.vedovati.77@gmail.com>
Date: Tue, 24 Sep 2024 12:44:55 +0200
Subject: [PATCH 19/27] Update generate_code_node.py

---
 scrapegraphai/nodes/generate_code_node.py | 28 ++++++++++++++++-------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py
index 891f1a27..fd9fd395 100644
--- a/scrapegraphai/nodes/generate_code_node.py
+++ b/scrapegraphai/nodes/generate_code_node.py
@@ -625,29 +625,41 @@ def normalize_string(s: str) -> str:
     # Convert to lowercase, remove extra spaces, and strip punctuation
     return ''.join(c for c in s.lower().strip() if c not in string.punctuation)
 
+def normalize_string(s: str) -> str:
+    """Normalize a string by converting to lowercase and stripping spaces."""
+    return s.lower().strip()
+
 def normalize_dict(d: dict) -> dict:
     """
     Normalize the dictionary by:
     - Converting all string values to lowercase and stripping spaces.
     - Recursively normalizing nested dictionaries.
-    - Sorting the dictionary to ensure key order doesn't matter.
+    - Sorting lists of primitives and creating sorted list of normalized dicts for lists of dicts.
     """
     normalized = {}
     for key, value in d.items():
         if isinstance(value, str):
-            # Normalize string values
             normalized[key] = normalize_string(value)
         elif isinstance(value, dict):
-            # Recursively normalize nested dictionaries
             normalized[key] = normalize_dict(value)
         elif isinstance(value, list):
-            # Sort lists and normalize elements
-            normalized[key] = sorted(normalize_dict(v) if isinstance(v, dict) else normalize_string(v) if isinstance(v, str) else v for v in value)
+            if all(isinstance(v, dict) for v in value):
+                # For lists of dicts, normalize each dict and sort based on their string representation
+                normalized[key] = sorted(
+                    normalize_dict(v) for v in value
+                )
+            else:
+                # For lists of primitives, sort normally
+                normalized[key] = sorted(
+                    normalize_dict(v) if isinstance(v, dict)
+                    else normalize_string(v) if isinstance(v, str)
+                    else v
+                    for v in value
+                )
         else:
-            # Keep other types (e.g., numbers) as is
             normalized[key] = value
-    return dict(sorted(normalized.items()))  # Ensure dictionary is sorted by keys
+    return dict(sorted(normalized.items()))
 
 def are_content_equal(generated_result: dict, reference_result: dict) -> bool:
-    # Normalize both dictionaries and compare
+    """Compare two dictionaries for semantic equality."""
     return normalize_dict(generated_result) == normalize_dict(reference_result)
\ No newline at end of file

From 04ac7362d4735b77d946721a420f4e2c0539de41 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra <88108002+VinciGit00@users.noreply.github.com>
Date: Tue, 24 Sep 2024 16:57:52 +0200
Subject: [PATCH 20/27] i don't like comments

---
 scrapegraphai/nodes/generate_code_node.py | 30 ++++++-----------------
 1 file changed, 8 insertions(+), 22 deletions(-)

diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py
index 4b28bb71..ec0c310a 100644
--- a/scrapegraphai/nodes/generate_code_node.py
+++ b/scrapegraphai/nodes/generate_code_node.py
@@ -73,7 +73,7 @@ class GenerateCodeNode(BaseNode):
             "semantic": 3
         })
         
-        self.output_schema = node_config.get("schema") #          get JSON output schema
+        self.output_schema = node_config.get("schema")
 
     def execute(self, state: dict) -> dict:
         """
@@ -98,15 +98,15 @@ class GenerateCodeNode(BaseNode):
         
         input_data = [state[key] for key in input_keys]
         
-        user_prompt = input_data[0] #       get user prompt
-        refined_prompt = input_data[1] #    get refined prompt
-        html_info = input_data[2] #         get html analysis
-        reduced_html = input_data[3] #      get html code
-        answer = input_data[4] #            get answer generated from the generate answer node for verification
+        user_prompt = input_data[0]
+        refined_prompt = input_data[1]
+        html_info = input_data[2]
+        reduced_html = input_data[3]
+        answer = input_data[4] 
         
         self.raw_html = state['original_html'][0].page_content
         
-        simplefied_schema = str(transform_schema(self.output_schema.schema())) #          get JSON output schema
+        simplefied_schema = str(transform_schema(self.output_schema.schema()))
         
         reasoning_state = {
             "user_input": user_prompt,
@@ -160,9 +160,7 @@ class GenerateCodeNode(BaseNode):
             self.logger.info(f"--- (Checking if the informations exctrcated are the ones Requested) ---")
             state = self.semantic_comparison_loop(state)
             if state["errors"]["semantic"]:
-                continue
-            
-            # If we've made it here, the code is valid and produces the correct output
+                continue            
             break
         
         if state["iteration"] == self.max_iterations["overall"] and (state["errors"]["syntax"] or state["errors"]["execution"] or state["errors"]["validation"] or state["errors"]["semantic"]):
@@ -488,7 +486,6 @@ class GenerateCodeNode(BaseNode):
         Human: Are the generated result and reference result semantically equivalent? If not, what are the key differences?
 
         Assistant: Let's analyze the two results carefully:
-
         """
 
         prompt = PromptTemplate(
@@ -576,28 +573,23 @@ class GenerateCodeNode(BaseNode):
             '__builtins__': __builtins__,
         }
         
-        # Capture stdout
         old_stdout = sys.stdout
         sys.stdout = StringIO()
         
         try:
-            # Execute the function code in the sandbox
             exec(function_code, sandbox_globals)
             
-            # Get the extract_data function from the sandbox
             extract_data = sandbox_globals.get('extract_data')
             
             if not extract_data:
                 raise NameError("Function 'extract_data' not found in the generated code.")
             
-            # Execute the extract_data function with the provided HTML
             result = extract_data(self.raw_html)
             
             return True, result
         except Exception as e:
             return False, f"Error during execution: {str(e)}"
         finally:
-            # Restore stdout
             sys.stdout = old_stdout
             
     def validate_dict(self, data: dict, schema):
@@ -609,17 +601,13 @@ class GenerateCodeNode(BaseNode):
             return False, errors
     
     def extract_code(self, code: str) -> str:
-        # Pattern to match the code inside a code block
         pattern = r'```(?:python)?\n(.*?)```'
         
-        # Search for the code block, if present
         match = re.search(pattern, code, re.DOTALL)
         
-        # If a code block is found, return the code, otherwise return the entire string
         return match.group(1) if match else code
 
 def normalize_string(s: str) -> str:
-    # Convert to lowercase, remove extra spaces, and strip punctuation
     return ''.join(c for c in s.lower().strip() if c not in string.punctuation)
 
 def normalize_string(s: str) -> str:
@@ -641,12 +629,10 @@ def normalize_dict(d: dict) -> dict:
             normalized[key] = normalize_dict(value)
         elif isinstance(value, list):
             if all(isinstance(v, dict) for v in value):
-                # For lists of dicts, normalize each dict and sort based on their string representation
                 normalized[key] = sorted(
                     normalize_dict(v) for v in value
                 )
             else:
-                # For lists of primitives, sort normally
                 normalized[key] = sorted(
                     normalize_dict(v) if isinstance(v, dict)
                     else normalize_string(v) if isinstance(v, str)

From d38a50115f47e45e14ea48dc48a25fda766a679b Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra <88108002+VinciGit00@users.noreply.github.com>
Date: Tue, 24 Sep 2024 16:59:33 +0200
Subject: [PATCH 21/27] Update html_analyzer_node.py: ask the model not to include backticks

---
 scrapegraphai/nodes/html_analyzer_node.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scrapegraphai/nodes/html_analyzer_node.py b/scrapegraphai/nodes/html_analyzer_node.py
index 47e65437..83bc9fbc 100644
--- a/scrapegraphai/nodes/html_analyzer_node.py
+++ b/scrapegraphai/nodes/html_analyzer_node.py
@@ -134,7 +134,7 @@ class HtmlAnalyzerNode(BaseNode):
         Please provide only the analysis with relevant, specific information based on this HTML code. Avoid vague statements and focus on exact details needed for accurate data extraction.
         
         Focus on providing a concise, step-by-step analysis of the HTML structure and the key elements needed for data extraction. Do not include any code examples or implementation logic. Keep the response focused and avoid general statements.**
-        
+        In your code do not include backticks.
         **HTML Analysis for Data Extraction**:
         """
         

From 54ebb397328b753e413ea7cba95286f193641766 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra <88108002+VinciGit00@users.noreply.github.com>
Date: Tue, 24 Sep 2024 17:00:18 +0200
Subject: [PATCH 22/27] Update generate_code_node.py: ask the model not to include backticks

---
 scrapegraphai/nodes/generate_code_node.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py
index 4b28bb71..1d9cdaa9 100644
--- a/scrapegraphai/nodes/generate_code_node.py
+++ b/scrapegraphai/nodes/generate_code_node.py
@@ -268,7 +268,8 @@ class GenerateCodeNode(BaseNode):
         - re
         
         **Output ONLY the Python code of the extract_data function, WITHOUT ANY IMPORTS OR ADDITIONAL TEXT.**
-        
+        In your code do not include backticks.
+
         **Response**:
         """
         

From d6a77029bbec7de0976dd2f41a8a11d2ee43de4f Mon Sep 17 00:00:00 2001
From: Matteo Vedovati <matteo.vedovati.77@gmail.com>
Date: Tue, 24 Sep 2024 18:12:50 +0200
Subject: [PATCH 23/27] Validator fixed: normalize nested lists recursively instead of sorting them

---
 requirements-dev.lock                     |  2 +-
 requirements.lock                         |  5 ++-
 scrapegraphai/nodes/generate_code_node.py | 42 ++++++++---------------
 3 files changed, 18 insertions(+), 31 deletions(-)

diff --git a/requirements-dev.lock b/requirements-dev.lock
index 2d0f10a0..0523351a 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -238,7 +238,7 @@ mdurl==0.1.2
 minify-html==0.15.0
     # via scrapegraphai
 mistral-common==1.4.1
-
+    # via scrapegraphai
 mpire==2.10.2
     # via semchunk
 multidict==6.0.5
diff --git a/requirements.lock b/requirements.lock
index 6b66d6f3..6ee34ba9 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -166,6 +166,7 @@ marshmallow==3.21.3
 minify-html==0.15.0
     # via scrapegraphai
 mistral-common==1.4.1
+    # via scrapegraphai
 mpire==2.10.2
     # via semchunk
 multidict==6.0.5
@@ -255,7 +256,6 @@ pyyaml==6.0.1
 referencing==0.35.1
     # via jsonschema
     # via jsonschema-specifications
-
 regex==2024.5.15
     # via tiktoken
     # via transformers
@@ -279,10 +279,9 @@ s3transfer==0.10.2
 safetensors==0.4.5
     # via transformers
 semchunk==2.2.0
-    # via scrapegraphai 
+    # via scrapegraphai
 sentencepiece==0.2.0
     # via mistral-common
-
 six==1.16.0
     # via python-dateutil
 sniffio==1.3.1
diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py
index 99b6852d..c4c04d52 100644
--- a/scrapegraphai/nodes/generate_code_node.py
+++ b/scrapegraphai/nodes/generate_code_node.py
@@ -608,42 +608,30 @@ class GenerateCodeNode(BaseNode):
         
         return match.group(1) if match else code
 
-def normalize_string(s: str) -> str:
-    return ''.join(c for c in s.lower().strip() if c not in string.punctuation)
 
-def normalize_string(s: str) -> str:
-    """Normalize a string by converting to lowercase and stripping spaces."""
-    return s.lower().strip()
 
-def normalize_dict(d: dict) -> dict:
-    """
-    Normalize the dictionary by:
-    - Converting all string values to lowercase and stripping spaces.
-    - Recursively normalizing nested dictionaries.
-    - Sorting lists of primitives and creating sorted list of normalized dicts for lists of dicts.
-    """
+def normalize_dict(d: Dict[str, Any]) -> Dict[str, Any]:
     normalized = {}
     for key, value in d.items():
         if isinstance(value, str):
-            normalized[key] = normalize_string(value)
+            normalized[key] = value.lower().strip()
         elif isinstance(value, dict):
             normalized[key] = normalize_dict(value)
         elif isinstance(value, list):
-            if all(isinstance(v, dict) for v in value):
-                normalized[key] = sorted(
-                    normalize_dict(v) for v in value
-                )
-            else:
-                normalized[key] = sorted(
-                    normalize_dict(v) if isinstance(v, dict)
-                    else normalize_string(v) if isinstance(v, str)
-                    else v
-                    for v in value
-                )
+            normalized[key] = normalize_list(value)
         else:
             normalized[key] = value
-    return dict(sorted(normalized.items()))
+    return normalized
 
-def are_content_equal(generated_result: dict, reference_result: dict) -> bool:
+def normalize_list(lst: List[Any]) -> List[Any]:
+    return [
+        normalize_dict(item) if isinstance(item, dict)
+        else normalize_list(item) if isinstance(item, list)
+        else item.lower().strip() if isinstance(item, str)
+        else item
+        for item in lst
+    ]
+
+def are_content_equal(generated_result: Dict[str, Any], reference_result: Dict[str, Any]) -> bool:
     """Compare two dictionaries for semantic equality."""
-    return normalize_dict(generated_result) == normalize_dict(reference_result)
+    return normalize_dict(generated_result) == normalize_dict(reference_result)
\ No newline at end of file

From 2d2c7194bef8785bb156483441871552a66a26c4 Mon Sep 17 00:00:00 2001
From: Matteo Vedovati <matteo.vedovati.77@gmail.com>
Date: Tue, 24 Sep 2024 18:33:12 +0200
Subject: [PATCH 24/27] Template refactoring: move node prompt templates into scrapegraphai/prompts

---
 scrapegraphai/nodes/generate_code_node.py     | 237 ++----------------
 scrapegraphai/nodes/html_analyzer_node.py     |  74 +-----
 scrapegraphai/nodes/prompt_refiner_node.py    |  67 +----
 scrapegraphai/prompts/__init__.py             |   3 +
 .../prompts/generate_code_node_templates.py   | 213 ++++++++++++++++
 .../prompts/html_analyzer_node_prompts.py     |  71 ++++++
 .../prompts/prompt_refiner_node_prompts.py    |  63 +++++
 7 files changed, 377 insertions(+), 351 deletions(-)
 create mode 100644 scrapegraphai/prompts/generate_code_node_templates.py
 create mode 100644 scrapegraphai/prompts/html_analyzer_node_prompts.py
 create mode 100644 scrapegraphai/prompts/prompt_refiner_node_prompts.py

diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py
index c4c04d52..c7cdd031 100644
--- a/scrapegraphai/nodes/generate_code_node.py
+++ b/scrapegraphai/nodes/generate_code_node.py
@@ -20,6 +20,12 @@ from ..utils import transform_schema
 from jsonschema import validate, ValidationError
 import json
 import string
+from ..prompts import (
+    TEMPLATE_INIT_CODE_GENERATION, TEMPLATE_SYNTAX_ANALYSIS, TEMPLATE_SYNTAX_CODE_GENERATION,
+    TEMPLATE_EXECUTION_ANALYSIS, TEMPLATE_EXECUTION_CODE_GENERATION, TEMPLATE_VALIDATION_ANALYSIS,
+    TEMPLATE_VALIDATION_CODE_GENERATION, TEMPLATE_SEMANTIC_COMPARISON, TEMPLATE_SEMANTIC_ANALYSIS,
+    TEMPLATE_SEMANTIC_CODE_GENERATION
+)
 
 class GenerateCodeNode(BaseNode):
     """
@@ -232,47 +238,8 @@ class GenerateCodeNode(BaseNode):
         return state
     
     def generate_initial_code(self, state: dict) -> str:
-        template_code_generator = """
-        **Task**: Create a Python function named `extract_data(html: str) -> dict()` using BeautifulSoup that extracts relevant information from the given HTML code string and returns it in a dictionary matching the Desired JSON Output Schema.
-
-        **User's Request**:
-        {user_input}
-
-        **Desired JSON Output Schema**:
-        ```json
-        {json_schema}
-        ```
-
-        **Initial Task Analysis**:
-        {initial_analysis}
-
-        **HTML Code**:
-        ```html
-        {html_code}
-        ```
-
-        **HTML Structure Analysis**:
-        {html_analysis}
-
-        Based on the above analyses, generate the `extract_data(html: str) -> dict()` function that:
-        1. Efficiently extracts the required data from the given HTML structure.
-        2. Processes and structures the data according to the specified JSON schema.
-        3. Returns the structured data as a dictionary.
-        
-        Your code should be well-commented, explaining the reasoning behind key decisions and any potential areas for improvement or customization.
-        
-        Use only the following pre-imported libraries:
-        - BeautifulSoup from bs4
-        - re
-        
-        **Output ONLY the Python code of the extract_data function, WITHOUT ANY IMPORTS OR ADDITIONAL TEXT.**
-        In your code do not include backticks.
-
-        **Response**:
-        """
-        
         prompt = PromptTemplate(
-            template=template_code_generator,
+            template=TEMPLATE_INIT_CODE_GENERATION,
             partial_variables={
                 "user_input": state["user_input"],
                 "json_schema": state["json_schema"],
@@ -288,23 +255,7 @@ class GenerateCodeNode(BaseNode):
         return generated_code
     
     def syntax_focused_analysis(self, state: dict) -> str:
-        template = """
-        The current code has encountered a syntax error. Here are the details:
-        
-        Current Code:
-        ```python
-        {generated_code}
-        ```
-        
-        Syntax Error:
-        {errors}
-        
-        Please analyze in detail the syntax error and suggest a fix. Focus only on correcting the syntax issue while ensuring the code still meets the original requirements.
-        
-        Provide your analysis and suggestions for fixing the error. DO NOT generate any code in your response.
-        """
-        
-        prompt = PromptTemplate(template=template, input_variables=["generated_code", "errors"])
+        prompt = PromptTemplate(template=TEMPLATE_SYNTAX_ANALYSIS, input_variables=["generated_code", "errors"])
         chain = prompt | self.llm_model | StrOutputParser()
         return chain.invoke({
             "generated_code": state["generated_code"],
@@ -312,21 +263,7 @@ class GenerateCodeNode(BaseNode):
         })
     
     def syntax_focused_code_generation(self, state: dict, analysis: str) -> str:
-        template = """
-        Based on the following analysis of a syntax error, please generate the corrected code, following the suggested fix.:
-
-        Error Analysis:
-        {analysis}
-
-        Original Code:
-        ```python
-        {generated_code}
-        ```
-
-        Generate the corrected code, applying the suggestions from the analysis. Output ONLY the corrected Python code, WITHOUT ANY ADDITIONAL TEXT.
-        """
-
-        prompt = PromptTemplate(template=template, input_variables=["analysis", "generated_code"])
+        prompt = PromptTemplate(template=TEMPLATE_SYNTAX_CODE_GENERATION, input_variables=["analysis", "generated_code"])
         chain = prompt | self.llm_model | StrOutputParser()
         return chain.invoke({
             "analysis": analysis,
@@ -334,32 +271,7 @@ class GenerateCodeNode(BaseNode):
         })
     
     def execution_focused_analysis(self, state: dict) -> str:
-        template = """
-        The current code has encountered an execution error. Here are the details:
-        
-        **Current Code**:
-        ```python
-        {generated_code}
-        ```
-        
-        **Execution Error**:
-        {errors}
-        
-        **HTML Code**:
-        ```html
-        {html_code}
-        ```
-
-        **HTML Structure Analysis**:
-        {html_analysis}
-        
-        Please analyze the execution error and suggest a fix. Focus only on correcting the execution issue while ensuring the code still meets the original requirements and maintains correct syntax.
-        The suggested fix should address the execution error and ensure the function can successfully extract the required data from the provided HTML structure. Be sure to be precise and specific in your analysis.
-        
-        Provide your analysis and suggestions for fixing the error. DO NOT generate any code in your response.
-        """
-        
-        prompt = PromptTemplate(template=template, input_variables=["generated_code", "errors", "html_code", "html_analysis"])
+        prompt = PromptTemplate(template=TEMPLATE_EXECUTION_ANALYSIS, input_variables=["generated_code", "errors", "html_code", "html_analysis"])
         chain = prompt | self.llm_model | StrOutputParser()
         return chain.invoke({
             "generated_code": state["generated_code"],
@@ -369,21 +281,7 @@ class GenerateCodeNode(BaseNode):
         })
     
     def execution_focused_code_generation(self, state: dict, analysis: str) -> str:
-        template = """
-        Based on the following analysis of an execution error, please generate the corrected code:
-
-        Error Analysis:
-        {analysis}
-
-        Original Code:
-        ```python
-        {generated_code}
-        ```
-
-        Generate the corrected code, applying the suggestions from the analysis. Output ONLY the corrected Python code, WITHOUT ANY ADDITIONAL TEXT.
-        """
-
-        prompt = PromptTemplate(template=template, input_variables=["analysis", "generated_code"])
+        prompt = PromptTemplate(template=TEMPLATE_EXECUTION_CODE_GENERATION, input_variables=["analysis", "generated_code"])
         chain = prompt | self.llm_model | StrOutputParser()
         return chain.invoke({
             "analysis": analysis,
@@ -391,31 +289,7 @@ class GenerateCodeNode(BaseNode):
         })
     
     def validation_focused_analysis(self, state: dict) -> str:
-        template = """
-        The current code's output does not match the required schema. Here are the details:
-        
-        Current Code:
-        ```python
-        {generated_code}
-        ```
-        
-        Validation Errors:
-        {errors}
-        
-        Required Schema:
-        ```json
-        {json_schema}
-        ```
-        
-        Current Output:
-        {execution_result}
-        
-        Please analyze the validation errors and suggest fixes. Focus only on correcting the output to match the required schema while ensuring the code maintains correct syntax and execution.
-        
-        Provide your analysis and suggestions for fixing the error. DO NOT generate any code in your response.
-        """
-        
-        prompt = PromptTemplate(template=template, input_variables=["generated_code", "errors", "json_schema", "execution_result"])
+        prompt = PromptTemplate(template=TEMPLATE_VALIDATION_ANALYSIS, input_variables=["generated_code", "errors", "json_schema", "execution_result"])
         chain = prompt | self.llm_model | StrOutputParser()
         return chain.invoke({
             "generated_code": state["generated_code"],
@@ -425,26 +299,7 @@ class GenerateCodeNode(BaseNode):
         })
     
     def validation_focused_code_generation(self, state: dict, analysis: str) -> str:
-        template = """
-        Based on the following analysis of a validation error, please generate the corrected code:
-
-        Error Analysis:
-        {analysis}
-
-        Original Code:
-        ```python
-        {generated_code}
-        ```
-
-        Required Schema:
-        ```json
-        {json_schema}
-        ```
-
-        Generate the corrected code, applying the suggestions from the analysis and ensuring the output matches the required schema. Output ONLY the corrected Python code, WITHOUT ANY ADDITIONAL TEXT.
-        """
-
-        prompt = PromptTemplate(template=template, input_variables=["analysis", "generated_code", "json_schema"])
+        prompt = PromptTemplate(template=TEMPLATE_VALIDATION_CODE_GENERATION, input_variables=["analysis", "generated_code", "json_schema"])
         chain = prompt | self.llm_model | StrOutputParser()
         return chain.invoke({
             "analysis": analysis,
@@ -470,27 +325,8 @@ class GenerateCodeNode(BaseNode):
         ]
         output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
 
-        template = """
-        Compare the Generated Result with the Reference Result and determine if they are semantically equivalent:
-
-        Generated Result:
-        {generated_result}
-
-        Reference Result (Correct Output):
-        {reference_result}
-
-        Analyze the content, structure, and meaning of both results. They should be considered semantically equivalent if they convey the same information, even if the exact wording or structure differs.
-        If they are not semantically equivalent, identify what are the key differences in the Generated Result. The Reference Result should be considered the correct output, you need to pinpoint the problems in the Generated Result.
-
-        {format_instructions}
-
-        Human: Are the generated result and reference result semantically equivalent? If not, what are the key differences?
-
-        Assistant: Let's analyze the two results carefully:
-        """
-
         prompt = PromptTemplate(
-            template=template,
+            template=TEMPLATE_SEMANTIC_COMPARISON,
             input_variables=["generated_result", "reference_result"],
             partial_variables={"format_instructions": output_parser.get_format_instructions()}
         )
@@ -501,27 +337,8 @@ class GenerateCodeNode(BaseNode):
             "reference_result": json.dumps(reference_result_dict, indent=2)
         })
     
-    def semantic_focused_analysis(self, state: dict, comparison_result: Dict[str, Any]) -> str:
-        template = """
-        The current code's output is semantically different from the reference answer. Here are the details:
-        
-        Current Code:
-        ```python
-        {generated_code}
-        ```
-        
-        Semantic Differences:
-        {differences}
-        
-        Comparison Explanation:
-        {explanation}
-        
-        Please analyze these semantic differences and suggest how to modify the code to produce a result that is semantically equivalent to the reference answer. Focus on addressing the key differences while maintaining the overall structure and functionality of the code.
-        
-        Provide your analysis and suggestions for fixing the semantic differences. DO NOT generate any code in your response.
-        """
-        
-        prompt = PromptTemplate(template=template, input_variables=["generated_code", "differences", "explanation"])
+    def semantic_focused_analysis(self, state: dict, comparison_result: Dict[str, Any]) -> str:        
+        prompt = PromptTemplate(template=TEMPLATE_SEMANTIC_ANALYSIS, input_variables=["generated_code", "differences", "explanation"])
         chain = prompt | self.llm_model | StrOutputParser()
         return chain.invoke({
             "generated_code": state["generated_code"],
@@ -530,27 +347,7 @@ class GenerateCodeNode(BaseNode):
         })
     
     def semantic_focused_code_generation(self, state: dict, analysis: str) -> str:
-        template = """
-        Based on the following analysis of semantic differences, please generate the corrected code:
-
-        Semantic Analysis:
-        {analysis}
-
-        Original Code:
-        ```python
-        {generated_code}
-        ```
-
-        Generated Result:
-        {generated_result}
-
-        Reference Result:
-        {reference_result}
-
-        Generate the corrected code, applying the suggestions from the analysis to make the output semantically equivalent to the reference result. Output ONLY the corrected Python code, WITHOUT ANY ADDITIONAL TEXT.
-        """
-
-        prompt = PromptTemplate(template=template, input_variables=["analysis", "generated_code", "generated_result", "reference_result"])
+        prompt = PromptTemplate(template=TEMPLATE_SEMANTIC_CODE_GENERATION, input_variables=["analysis", "generated_code", "generated_result", "reference_result"])
         chain = prompt | self.llm_model | StrOutputParser()
         return chain.invoke({
             "analysis": analysis,
diff --git a/scrapegraphai/nodes/html_analyzer_node.py b/scrapegraphai/nodes/html_analyzer_node.py
index 83bc9fbc..d526315c 100644
--- a/scrapegraphai/nodes/html_analyzer_node.py
+++ b/scrapegraphai/nodes/html_analyzer_node.py
@@ -10,6 +10,9 @@ from langchain_community.chat_models import ChatOllama
 from tqdm import tqdm
 from .base_node import BaseNode
 from ..utils import reduce_html
+from ..prompts import (
+    TEMPLATE_HTML_ANALYSIS, TEMPLATE_HTML_ANALYSIS_WITH_CONTEXT
+)
 
 class HtmlAnalyzerNode(BaseNode):
     """
@@ -70,73 +73,6 @@ class HtmlAnalyzerNode(BaseNode):
             KeyError: If the input keys are not found in the state, indicating
                       that the necessary information for generating an answer is missing.
         """
-
-        template_html_analysis = """
-        Task: Your job is to analyze the provided HTML code in relation to the initial scraping task analysis and provide all the necessary HTML information useful for implementing a function that extracts data from the given HTML string.
-        
-        **Initial Analysis**:
-        {initial_analysis}
-
-        **HTML Code**:
-        ```html
-        {html_code}
-        ```
-
-        **HTML Analysis Instructions**:
-        1. Examine the HTML code and identify elements, classes, or IDs that correspond to each required data field mentioned in the Initial Analysis.
-        2. Look for patterns or repeated structures that could indicate multiple items (e.g., product listings).
-        3. Note any nested structures or relationships between elements that are relevant to the data extraction task.
-        4. Discuss any additional considerations based on the specific HTML layout that are crucial for accurate data extraction.
-        5. Recommend the specific strategy to use for scraping the content, remeber.
-
-        **Important Notes**:
-        - The function that the code generator is gonig to implement will receive the HTML as a string parameter, not as a live webpage.
-        - No web scraping, automation, or handling of dynamic content is required.
-        - The analysis should focus solely on extracting data from the static HTML provided.
-        - Be precise and specific in your analysis, as the code generator will, possibly, not have access to the full HTML context.
-        
-        This HTML analysis will be used to guide the final code generation process for a function that extracts data from the given HTML string.
-        Please provide only the analysis with relevant, specific information based on this HTML code. Avoid vague statements and focus on exact details needed for accurate data extraction.
-        
-        Focus on providing a concise, step-by-step analysis of the HTML structure and the key elements needed for data extraction. Do not include any code examples or implementation logic. Keep the response focused and avoid general statements.**
-        
-        **HTML Analysis for Data Extraction**:
-        """
-        
-        template_html_analysis_with_context = """
-        Task: Your job is to analyze the provided HTML code in relation to the initial scraping task analysis and the additional context the user provided and provide all the necessary HTML information useful for implementing a function that extracts data from the given HTML string.
-        
-        **Initial Analysis**:
-        {initial_analysis}
-
-        **HTML Code**:
-        ```html
-        {html_code}
-        ```
-        
-        **Additional Context**:
-        {additional_context}
-
-        **HTML Analysis Instructions**:
-        1. Examine the HTML code and identify elements, classes, or IDs that correspond to each required data field mentioned in the Initial Analysis.
-        2. Look for patterns or repeated structures that could indicate multiple items (e.g., product listings).
-        3. Note any nested structures or relationships between elements that are relevant to the data extraction task.
-        4. Discuss any additional considerations based on the specific HTML layout that are crucial for accurate data extraction.
-        5. Recommend the specific strategy to use for scraping the content, remeber.
-
-        **Important Notes**:
-        - The function that the code generator is gonig to implement will receive the HTML as a string parameter, not as a live webpage.
-        - No web scraping, automation, or handling of dynamic content is required.
-        - The analysis should focus solely on extracting data from the static HTML provided.
-        - Be precise and specific in your analysis, as the code generator will, possibly, not have access to the full HTML context.
-        
-        This HTML analysis will be used to guide the final code generation process for a function that extracts data from the given HTML string.
-        Please provide only the analysis with relevant, specific information based on this HTML code. Avoid vague statements and focus on exact details needed for accurate data extraction.
-        
-        Focus on providing a concise, step-by-step analysis of the HTML structure and the key elements needed for data extraction. Do not include any code examples or implementation logic. Keep the response focused and avoid general statements.**
-        In your code do not include backticks.
-        **HTML Analysis for Data Extraction**:
-        """
         
         self.logger.info(f"--- Executing {self.node_name} Node ---")
 
@@ -150,13 +86,13 @@ class HtmlAnalyzerNode(BaseNode):
         
         if self.additional_info is not None: #              use additional context if present
             prompt = PromptTemplate(
-                template=template_html_analysis_with_context,
+                template=TEMPLATE_HTML_ANALYSIS_WITH_CONTEXT,
                 partial_variables={"initial_analysis": refined_prompt,
                                     "html_code": reduced_html,
                                     "additional_context": self.additional_info})
         else:
             prompt = PromptTemplate(
-                template=template_html_analysis,
+                template=TEMPLATE_HTML_ANALYSIS,
                 partial_variables={"initial_analysis": refined_prompt,
                                     "html_code": reduced_html})
 
diff --git a/scrapegraphai/nodes/prompt_refiner_node.py b/scrapegraphai/nodes/prompt_refiner_node.py
index 50e5f76e..e6f4579c 100644
--- a/scrapegraphai/nodes/prompt_refiner_node.py
+++ b/scrapegraphai/nodes/prompt_refiner_node.py
@@ -12,6 +12,9 @@ from langchain_community.chat_models import ChatOllama
 from tqdm import tqdm
 from .base_node import BaseNode
 from ..utils import transform_schema
+from ..prompts import (
+    TEMPLATE_REFINER, TEMPLATE_REFINER_WITH_CONTEXT
+)
 
 class PromptRefinerNode(BaseNode):
     """
@@ -76,66 +79,6 @@ class PromptRefinerNode(BaseNode):
             KeyError: If the input keys are not found in the state, indicating
                       that the necessary information for generating an answer is missing.
         """
-
-        template_prompt_builder = """
-        **Task**: Analyze the user's request and the provided JSON schema to clearly map the desired data extraction.\n
-        Break down the user's request into key components, and then explicitly connect these components to the 
-        corresponding elements within the JSON schema.
-
-        **User's Request**:
-        {user_input}
-
-        **Desired JSON Output Schema**:
-        ```json
-        {json_schema}
-        ```
-
-        **Analysis Instructions**:
-        1. **Break Down User Request:** 
-        * Clearly identify the core entities or data types the user is asking for.\n
-        * Highlight any specific attributes or relationships mentioned in the request.\n
-
-        2. **Map to JSON Schema**:
-        * For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n
-        * Explain how the schema structure accommodates the user's needs.
-        * If applicable, mention any schema elements that are not directly addressed in the user's request.\n
-
-        This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.\n
-        Please generate only the analysis and no other text.
-
-        **Response**:
-        """
-        
-        template_prompt_builder_with_context = """
-        **Task**: Analyze the user's request, the provided JSON schema, and the additional context the user provided to clearly map the desired data extraction.\n
-        Break down the user's request into key components, and then explicitly connect these components to the corresponding elements within the JSON schema.\n
-        
-        **User's Request**:
-        {user_input}
-
-        **Desired JSON Output Schema**:
-        ```json
-        {json_schema}
-        ```
-        
-        **Additional Context**:
-        {additional_context}
-
-        **Analysis Instructions**:
-        1. **Break Down User Request:** 
-        * Clearly identify the core entities or data types the user is asking for.\n
-        * Highlight any specific attributes or relationships mentioned in the request.\n
-
-        2. **Map to JSON Schema**:
-        * For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n
-        * Explain how the schema structure accommodates the user's needs.\n
-        * If applicable, mention any schema elements that are not directly addressed in the user's request.\n
-
-        This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.\n
-        Please generate only the analysis and no other text.
-
-        **Response**:
-        """
         
         self.logger.info(f"--- Executing {self.node_name} Node ---")
 
@@ -145,13 +88,13 @@ class PromptRefinerNode(BaseNode):
         
         if self.additional_info is not None:
             prompt = PromptTemplate(
-                template=template_prompt_builder_with_context,
+                template=TEMPLATE_REFINER_WITH_CONTEXT,
                 partial_variables={"user_input": user_prompt,
                                     "json_schema": str(self.simplefied_schema),
                                     "additional_context": self.additional_info})
         else:
             prompt = PromptTemplate(
-                template=template_prompt_builder,
+                template=TEMPLATE_REFINER,
                 partial_variables={"user_input": user_prompt,
                                     "json_schema": str(self.simplefied_schema)})
 
diff --git a/scrapegraphai/prompts/__init__.py b/scrapegraphai/prompts/__init__.py
index f5b72f3e..479d6ece 100644
--- a/scrapegraphai/prompts/__init__.py
+++ b/scrapegraphai/prompts/__init__.py
@@ -11,3 +11,6 @@ from .robots_node_prompts import TEMPLATE_ROBOT
 from .search_internet_node_prompts import TEMPLATE_SEARCH_INTERNET
 from .search_link_node_prompts import TEMPLATE_RELEVANT_LINKS
 from .search_node_with_context_prompts import TEMPLATE_SEARCH_WITH_CONTEXT_CHUNKS, TEMPLATE_SEARCH_WITH_CONTEXT_NO_CHUNKS
+from .prompt_refiner_node_prompts import TEMPLATE_REFINER, TEMPLATE_REFINER_WITH_CONTEXT
+from .html_analyzer_node_prompts import TEMPLATE_HTML_ANALYSIS, TEMPLATE_HTML_ANALYSIS_WITH_CONTEXT
+from .generate_code_node_prompts import TEMPLATE_INIT_CODE_GENERATION, TEMPLATE_SYNTAX_ANALYSIS, TEMPLATE_SYNTAX_CODE_GENERATION, TEMPLATE_EXECUTION_ANALYSIS, TEMPLATE_EXECUTION_CODE_GENERATION, TEMPLATE_VALIDATION_ANALYSIS, TEMPLATE_VALIDATION_CODE_GENERATION, TEMPLATE_SEMANTIC_COMPARISON, TEMPLATE_SEMANTIC_ANALYSIS, TEMPLATE_SEMANTIC_CODE_GENERATION
\ No newline at end of file
diff --git a/scrapegraphai/prompts/generate_code_node_templates.py b/scrapegraphai/prompts/generate_code_node_templates.py
new file mode 100644
index 00000000..eab92ee4
--- /dev/null
+++ b/scrapegraphai/prompts/generate_code_node_templates.py
@@ -0,0 +1,213 @@
+"""
+Generate code prompts helper
+"""
+
+
+TEMPLATE_INIT_CODE_GENERATION = """
+**Task**: Create a Python function named `extract_data(html: str) -> dict()` using BeautifulSoup that extracts relevant information from the given HTML code string and returns it in a dictionary matching the Desired JSON Output Schema.
+
+**User's Request**:
+{user_input}
+
+**Desired JSON Output Schema**:
+```json
+{json_schema}
+```
+
+**Initial Task Analysis**:
+{initial_analysis}
+
+**HTML Code**:
+```html
+{html_code}
+```
+
+**HTML Structure Analysis**:
+{html_analysis}
+
+Based on the above analyses, generate the `extract_data(html: str) -> dict()` function that:
+1. Efficiently extracts the required data from the given HTML structure.
+2. Processes and structures the data according to the specified JSON schema.
+3. Returns the structured data as a dictionary.
+
+Your code should be well-commented, explaining the reasoning behind key decisions and any potential areas for improvement or customization.
+
+Use only the following pre-imported libraries:
+- BeautifulSoup from bs4
+- re
+
+**Output ONLY the Python code of the extract_data function, WITHOUT ANY IMPORTS OR ADDITIONAL TEXT.**
+In your code do not include backticks.
+
+**Response**:
+"""
+
+TEMPLATE_SYNTAX_ANALYSIS = """
+The current code has encountered a syntax error. Here are the details:
+
+Current Code:
+```python
+{generated_code}
+```
+
+Syntax Error:
+{errors}
+
+Please analyze in detail the syntax error and suggest a fix. Focus only on correcting the syntax issue while ensuring the code still meets the original requirements.
+
+Provide your analysis and suggestions for fixing the error. DO NOT generate any code in your response.
+"""
+
+TEMPLATE_SYNTAX_CODE_GENERATION = """
+Based on the following analysis of a syntax error, please generate the corrected code, following the suggested fix:
+
+Error Analysis:
+{analysis}
+
+Original Code:
+```python
+{generated_code}
+```
+
+Generate the corrected code, applying the suggestions from the analysis. Output ONLY the corrected Python code, WITHOUT ANY ADDITIONAL TEXT.
+"""
+
+TEMPLATE_EXECUTION_ANALYSIS = """
+The current code has encountered an execution error. Here are the details:
+
+**Current Code**:
+```python
+{generated_code}
+```
+
+**Execution Error**:
+{errors}
+
+**HTML Code**:
+```html
+{html_code}
+```
+
+**HTML Structure Analysis**:
+{html_analysis}
+
+Please analyze the execution error and suggest a fix. Focus only on correcting the execution issue while ensuring the code still meets the original requirements and maintains correct syntax.
+The suggested fix should address the execution error and ensure the function can successfully extract the required data from the provided HTML structure. Be sure to be precise and specific in your analysis.
+
+Provide your analysis and suggestions for fixing the error. DO NOT generate any code in your response.
+"""
+
+TEMPLATE_EXECUTION_CODE_GENERATION = """
+Based on the following analysis of an execution error, please generate the corrected code:
+
+Error Analysis:
+{analysis}
+
+Original Code:
+```python
+{generated_code}
+```
+
+Generate the corrected code, applying the suggestions from the analysis. Output ONLY the corrected Python code, WITHOUT ANY ADDITIONAL TEXT.
+"""
+
+TEMPLATE_VALIDATION_ANALYSIS = """
+The current code's output does not match the required schema. Here are the details:
+
+Current Code:
+```python
+{generated_code}
+```
+
+Validation Errors:
+{errors}
+
+Required Schema:
+```json
+{json_schema}
+```
+
+Current Output:
+{execution_result}
+
+Please analyze the validation errors and suggest fixes. Focus only on correcting the output to match the required schema while ensuring the code maintains correct syntax and execution.
+
+Provide your analysis and suggestions for fixing the error. DO NOT generate any code in your response.
+"""
+
+TEMPLATE_VALIDATION_CODE_GENERATION = """
+Based on the following analysis of a validation error, please generate the corrected code:
+
+Error Analysis:
+{analysis}
+
+Original Code:
+```python
+{generated_code}
+```
+
+Required Schema:
+```json
+{json_schema}
+```
+
+Generate the corrected code, applying the suggestions from the analysis and ensuring the output matches the required schema. Output ONLY the corrected Python code, WITHOUT ANY ADDITIONAL TEXT.
+"""
+
+TEMPLATE_SEMANTIC_COMPARISON = """
+Compare the Generated Result with the Reference Result and determine if they are semantically equivalent:
+
+Generated Result:
+{generated_result}
+
+Reference Result (Correct Output):
+{reference_result}
+
+Analyze the content, structure, and meaning of both results. They should be considered semantically equivalent if they convey the same information, even if the exact wording or structure differs.
+If they are not semantically equivalent, identify what are the key differences in the Generated Result. The Reference Result should be considered the correct output, you need to pinpoint the problems in the Generated Result.
+
+{format_instructions}
+
+Human: Are the generated result and reference result semantically equivalent? If not, what are the key differences?
+
+Assistant: Let's analyze the two results carefully:
+"""
+
+TEMPLATE_SEMANTIC_ANALYSIS = """
+The current code's output is semantically different from the reference answer. Here are the details:
+
+Current Code:
+```python
+{generated_code}
+```
+
+Semantic Differences:
+{differences}
+
+Comparison Explanation:
+{explanation}
+
+Please analyze these semantic differences and suggest how to modify the code to produce a result that is semantically equivalent to the reference answer. Focus on addressing the key differences while maintaining the overall structure and functionality of the code.
+
+Provide your analysis and suggestions for fixing the semantic differences. DO NOT generate any code in your response.
+"""
+
+TEMPLATE_SEMANTIC_CODE_GENERATION = """
+Based on the following analysis of semantic differences, please generate the corrected code:
+
+Semantic Analysis:
+{analysis}
+
+Original Code:
+```python
+{generated_code}
+```
+
+Generated Result:
+{generated_result}
+
+Reference Result:
+{reference_result}
+
+Generate the corrected code, applying the suggestions from the analysis to make the output semantically equivalent to the reference result. Output ONLY the corrected Python code, WITHOUT ANY ADDITIONAL TEXT.
+"""
\ No newline at end of file
diff --git a/scrapegraphai/prompts/html_analyzer_node_prompts.py b/scrapegraphai/prompts/html_analyzer_node_prompts.py
new file mode 100644
index 00000000..d7e6e342
--- /dev/null
+++ b/scrapegraphai/prompts/html_analyzer_node_prompts.py
@@ -0,0 +1,71 @@
+"""
+HTML analysis prompts helper
+"""
+
+
+TEMPLATE_HTML_ANALYSIS = """
+Task: Your job is to analyze the provided HTML code in relation to the initial scraping task analysis and provide all the necessary HTML information useful for implementing a function that extracts data from the given HTML string.
+
+**Initial Analysis**:
+{initial_analysis}
+
+**HTML Code**:
+```html
+{html_code}
+```
+
+**HTML Analysis Instructions**:
+1. Examine the HTML code and identify elements, classes, or IDs that correspond to each required data field mentioned in the Initial Analysis.
+2. Look for patterns or repeated structures that could indicate multiple items (e.g., product listings).
+3. Note any nested structures or relationships between elements that are relevant to the data extraction task.
+4. Discuss any additional considerations based on the specific HTML layout that are crucial for accurate data extraction.
+5. Recommend the specific strategy to use for scraping the content, remembering the important notes below.
+
+**Important Notes**:
+- The function that the code generator is going to implement will receive the HTML as a string parameter, not as a live webpage.
+- No web scraping, automation, or handling of dynamic content is required.
+- The analysis should focus solely on extracting data from the static HTML provided.
+- Be precise and specific in your analysis, as the code generator will, possibly, not have access to the full HTML context.
+
+This HTML analysis will be used to guide the final code generation process for a function that extracts data from the given HTML string.
+Please provide only the analysis with relevant, specific information based on this HTML code. Avoid vague statements and focus on exact details needed for accurate data extraction.
+
+Focus on providing a concise, step-by-step analysis of the HTML structure and the key elements needed for data extraction. Do not include any code examples or implementation logic. Keep the response focused and avoid general statements.
+
+**HTML Analysis for Data Extraction**:
+"""
+
+TEMPLATE_HTML_ANALYSIS_WITH_CONTEXT = """
+Task: Your job is to analyze the provided HTML code in relation to the initial scraping task analysis and the additional context the user provided and provide all the necessary HTML information useful for implementing a function that extracts data from the given HTML string.
+
+**Initial Analysis**:
+{initial_analysis}
+
+**HTML Code**:
+```html
+{html_code}
+```
+
+**Additional Context**:
+{additional_context}
+
+**HTML Analysis Instructions**:
+1. Examine the HTML code and identify elements, classes, or IDs that correspond to each required data field mentioned in the Initial Analysis.
+2. Look for patterns or repeated structures that could indicate multiple items (e.g., product listings).
+3. Note any nested structures or relationships between elements that are relevant to the data extraction task.
+4. Discuss any additional considerations based on the specific HTML layout that are crucial for accurate data extraction.
+5. Recommend the specific strategy to use for scraping the content, remembering the important notes below.
+
+**Important Notes**:
+- The function that the code generator is going to implement will receive the HTML as a string parameter, not as a live webpage.
+- No web scraping, automation, or handling of dynamic content is required.
+- The analysis should focus solely on extracting data from the static HTML provided.
+- Be precise and specific in your analysis, as the code generator will, possibly, not have access to the full HTML context.
+
+This HTML analysis will be used to guide the final code generation process for a function that extracts data from the given HTML string.
+Please provide only the analysis with relevant, specific information based on this HTML code. Avoid vague statements and focus on exact details needed for accurate data extraction.
+
+Focus on providing a concise, step-by-step analysis of the HTML structure and the key elements needed for data extraction. Do not include any code examples or implementation logic. Keep the response focused and avoid general statements.
+In your response do not include backticks.
+**HTML Analysis for Data Extraction**:
+"""
\ No newline at end of file
diff --git a/scrapegraphai/prompts/prompt_refiner_node_prompts.py b/scrapegraphai/prompts/prompt_refiner_node_prompts.py
new file mode 100644
index 00000000..edbb1498
--- /dev/null
+++ b/scrapegraphai/prompts/prompt_refiner_node_prompts.py
@@ -0,0 +1,63 @@
+"""
+Prompt refiner node prompts helper
+"""
+
+TEMPLATE_REFINER = """
+**Task**: Analyze the user's request and the provided JSON schema to clearly map the desired data extraction.\n
+Break down the user's request into key components, and then explicitly connect these components to the 
+corresponding elements within the JSON schema.
+
+**User's Request**:
+{user_input}
+
+**Desired JSON Output Schema**:
+```json
+{json_schema}
+```
+
+**Analysis Instructions**:
+1. **Break Down User Request:** 
+* Clearly identify the core entities or data types the user is asking for.\n
+* Highlight any specific attributes or relationships mentioned in the request.\n
+
+2. **Map to JSON Schema**:
+* For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n
+* Explain how the schema structure accommodates the user's needs.
+* If applicable, mention any schema elements that are not directly addressed in the user's request.\n
+
+This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.\n
+Please generate only the analysis and no other text.
+
+**Response**:
+"""
+        
+TEMPLATE_REFINER_WITH_CONTEXT = """
+**Task**: Analyze the user's request, the provided JSON schema, and the additional context the user provided to clearly map the desired data extraction.\n
+Break down the user's request into key components, and then explicitly connect these components to the corresponding elements within the JSON schema.\n
+
+**User's Request**:
+{user_input}
+
+**Desired JSON Output Schema**:
+```json
+{json_schema}
+```
+
+**Additional Context**:
+{additional_context}
+
+**Analysis Instructions**:
+1. **Break Down User Request:** 
+* Clearly identify the core entities or data types the user is asking for.\n
+* Highlight any specific attributes or relationships mentioned in the request.\n
+
+2. **Map to JSON Schema**:
+* For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n
+* Explain how the schema structure accommodates the user's needs.\n
+* If applicable, mention any schema elements that are not directly addressed in the user's request.\n
+
+This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.\n
+Please generate only the analysis and no other text.
+
+**Response**:
+"""
\ No newline at end of file

From ce841e21fb3d8f6404e70e11447573f7eb0f18d9 Mon Sep 17 00:00:00 2001
From: Matteo Vedovati <matteo.vedovati.77@gmail.com>
Date: Tue, 24 Sep 2024 19:04:43 +0200
Subject: [PATCH 25/27] Code generation refactoring

---
 scrapegraphai/nodes/generate_code_node.py     | 149 +++---------------
 scrapegraphai/prompts/__init__.py             |   7 +-
 ...lates.py => generate_code_node_prompts.py} |   0
 scrapegraphai/utils/__init__.py               |   6 +
 scrapegraphai/utils/cleanup_code.py           |  11 ++
 scrapegraphai/utils/code_error_analysis.py    |  48 ++++++
 scrapegraphai/utils/code_error_correction.py  |  45 ++++++
 scrapegraphai/utils/dict_content_compare.py   |  30 ++++
 8 files changed, 168 insertions(+), 128 deletions(-)
 rename scrapegraphai/prompts/{generate_code_node_templates.py => generate_code_node_prompts.py} (100%)
 create mode 100644 scrapegraphai/utils/cleanup_code.py
 create mode 100644 scrapegraphai/utils/code_error_analysis.py
 create mode 100644 scrapegraphai/utils/code_error_correction.py
 create mode 100644 scrapegraphai/utils/dict_content_compare.py

diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py
index c7cdd031..1174a4aa 100644
--- a/scrapegraphai/nodes/generate_code_node.py
+++ b/scrapegraphai/nodes/generate_code_node.py
@@ -16,15 +16,17 @@ import re
 from tqdm import tqdm
 from .base_node import BaseNode
 from pydantic import ValidationError
-from ..utils import transform_schema
+from ..utils import (transform_schema,
+                    extract_code,
+                    syntax_focused_analysis, syntax_focused_code_generation,
+                    execution_focused_analysis, execution_focused_code_generation,
+                    validation_focused_analysis, validation_focused_code_generation,
+                    semantic_focused_analysis, semantic_focused_code_generation,
+                    are_content_equal)
 from jsonschema import validate, ValidationError
 import json
-import string
 from ..prompts import (
-    TEMPLATE_INIT_CODE_GENERATION, TEMPLATE_SYNTAX_ANALYSIS, TEMPLATE_SYNTAX_CODE_GENERATION,
-    TEMPLATE_EXECUTION_ANALYSIS, TEMPLATE_EXECUTION_CODE_GENERATION, TEMPLATE_VALIDATION_ANALYSIS,
-    TEMPLATE_VALIDATION_CODE_GENERATION, TEMPLATE_SEMANTIC_COMPARISON, TEMPLATE_SEMANTIC_ANALYSIS,
-    TEMPLATE_SEMANTIC_CODE_GENERATION
+    TEMPLATE_INIT_CODE_GENERATION, TEMPLATE_SEMANTIC_COMPARISON
 )
 
 class GenerateCodeNode(BaseNode):
@@ -141,7 +143,7 @@ class GenerateCodeNode(BaseNode):
     def overall_reasoning_loop(self, state: dict) -> dict:
         self.logger.info(f"--- (Generating Code) ---")
         state["generated_code"] = self.generate_initial_code(state)
-        state["generated_code"] = self.extract_code(state["generated_code"])
+        state["generated_code"] = extract_code(state["generated_code"])
         
         while state["iteration"] < self.max_iterations["overall"]:
             state["iteration"] += 1
@@ -185,10 +187,10 @@ class GenerateCodeNode(BaseNode):
             
             state["errors"]["syntax"] = [syntax_message]
             self.logger.info(f"--- (Synax Error Found: {syntax_message}) ---")
-            analysis = self.syntax_focused_analysis(state)
+            analysis = syntax_focused_analysis(state, self.llm_model)
             self.logger.info(f"--- (Regenerating Code to fix the Error) ---")
-            state["generated_code"] = self.syntax_focused_code_generation(state, analysis)
-            state["generated_code"] = self.extract_code(state["generated_code"])
+            state["generated_code"] = syntax_focused_code_generation(state, analysis, self.llm_model)
+            state["generated_code"] = extract_code(state["generated_code"])
         return state
     
     def execution_reasoning_loop(self, state: dict) -> dict:
@@ -201,10 +203,10 @@ class GenerateCodeNode(BaseNode):
             
             state["errors"]["execution"] = [execution_result]
             self.logger.info(f"--- (Code Execution Error: {execution_result}) ---")
-            analysis = self.execution_focused_analysis(state)
+            analysis = execution_focused_analysis(state, self.llm_model)
             self.logger.info(f"--- (Regenerating Code to fix the Error) ---")
-            state["generated_code"] = self.execution_focused_code_generation(state, analysis)
-            state["generated_code"] = self.extract_code(state["generated_code"])
+            state["generated_code"] = execution_focused_code_generation(state, analysis, self.llm_model)
+            state["generated_code"] = extract_code(state["generated_code"])
         return state
     
     def validation_reasoning_loop(self, state: dict) -> dict:
@@ -216,10 +218,10 @@ class GenerateCodeNode(BaseNode):
             
             state["errors"]["validation"] = errors
             self.logger.info(f"--- (Code Output not compliant to the deisred Output Schema) ---")
-            analysis = self.validation_focused_analysis(state)
+            analysis = validation_focused_analysis(state, self.llm_model)
             self.logger.info(f"--- (Regenerating Code to make the Output compliant to the deisred Output Schema) ---")
-            state["generated_code"] = self.validation_focused_code_generation(state, analysis)
-            state["generated_code"] = self.extract_code(state["generated_code"])
+            state["generated_code"] = validation_focused_code_generation(state, analysis, self.llm_model)
+            state["generated_code"] = extract_code(state["generated_code"])
         return state
     
     def semantic_comparison_loop(self, state: dict) -> dict:
@@ -231,10 +233,10 @@ class GenerateCodeNode(BaseNode):
             
             state["errors"]["semantic"] = comparison_result["differences"]
             self.logger.info(f"--- (The informations exctrcated are not the all ones requested) ---")
-            analysis = self.semantic_focused_analysis(state, comparison_result)
+            analysis = semantic_focused_analysis(state, comparison_result, self.llm_model)
             self.logger.info(f"--- (Regenerating Code to obtain all the infromation requested) ---")
-            state["generated_code"] = self.semantic_focused_code_generation(state, analysis)
-            state["generated_code"] = self.extract_code(state["generated_code"])
+            state["generated_code"] = semantic_focused_code_generation(state, analysis, self.llm_model)
+            state["generated_code"] = extract_code(state["generated_code"])
         return state
     
     def generate_initial_code(self, state: dict) -> str:
@@ -254,59 +256,6 @@ class GenerateCodeNode(BaseNode):
         generated_code = chain.invoke({})
         return generated_code
     
-    def syntax_focused_analysis(self, state: dict) -> str:
-        prompt = PromptTemplate(template=TEMPLATE_SYNTAX_ANALYSIS, input_variables=["generated_code", "errors"])
-        chain = prompt | self.llm_model | StrOutputParser()
-        return chain.invoke({
-            "generated_code": state["generated_code"],
-            "errors": state["errors"]["syntax"]
-        })
-    
-    def syntax_focused_code_generation(self, state: dict, analysis: str) -> str:
-        prompt = PromptTemplate(template=TEMPLATE_SYNTAX_CODE_GENERATION, input_variables=["analysis", "generated_code"])
-        chain = prompt | self.llm_model | StrOutputParser()
-        return chain.invoke({
-            "analysis": analysis,
-            "generated_code": state["generated_code"]
-        })
-    
-    def execution_focused_analysis(self, state: dict) -> str:
-        prompt = PromptTemplate(template=TEMPLATE_EXECUTION_ANALYSIS, input_variables=["generated_code", "errors", "html_code", "html_analysis"])
-        chain = prompt | self.llm_model | StrOutputParser()
-        return chain.invoke({
-            "generated_code": state["generated_code"],
-            "errors": state["errors"]["execution"],
-            "html_code": state["html_code"],
-            "html_analysis": state["html_analysis"]
-        })
-    
-    def execution_focused_code_generation(self, state: dict, analysis: str) -> str:
-        prompt = PromptTemplate(template=TEMPLATE_EXECUTION_CODE_GENERATION, input_variables=["analysis", "generated_code"])
-        chain = prompt | self.llm_model | StrOutputParser()
-        return chain.invoke({
-            "analysis": analysis,
-            "generated_code": state["generated_code"]
-        })
-    
-    def validation_focused_analysis(self, state: dict) -> str:
-        prompt = PromptTemplate(template=TEMPLATE_VALIDATION_ANALYSIS, input_variables=["generated_code", "errors", "json_schema", "execution_result"])
-        chain = prompt | self.llm_model | StrOutputParser()
-        return chain.invoke({
-            "generated_code": state["generated_code"],
-            "errors": state["errors"]["validation"],
-            "json_schema": state["json_schema"],
-            "execution_result": state["execution_result"]
-        })
-    
-    def validation_focused_code_generation(self, state: dict, analysis: str) -> str:
-        prompt = PromptTemplate(template=TEMPLATE_VALIDATION_CODE_GENERATION, input_variables=["analysis", "generated_code", "json_schema"])
-        chain = prompt | self.llm_model | StrOutputParser()
-        return chain.invoke({
-            "analysis": analysis,
-            "generated_code": state["generated_code"],
-            "json_schema": state["json_schema"]
-        })
-    
     def semantic_comparison(self, generated_result: Any, reference_result: Any) -> Dict[str, Any]:
         reference_result_dict = self.output_schema(**reference_result).dict()
         
@@ -337,25 +286,6 @@ class GenerateCodeNode(BaseNode):
             "reference_result": json.dumps(reference_result_dict, indent=2)
         })
     
-    def semantic_focused_analysis(self, state: dict, comparison_result: Dict[str, Any]) -> str:        
-        prompt = PromptTemplate(template=TEMPLATE_SEMANTIC_ANALYSIS, input_variables=["generated_code", "differences", "explanation"])
-        chain = prompt | self.llm_model | StrOutputParser()
-        return chain.invoke({
-            "generated_code": state["generated_code"],
-            "differences": json.dumps(comparison_result["differences"], indent=2),
-            "explanation": comparison_result["explanation"]
-        })
-    
-    def semantic_focused_code_generation(self, state: dict, analysis: str) -> str:
-        prompt = PromptTemplate(template=TEMPLATE_SEMANTIC_CODE_GENERATION, input_variables=["analysis", "generated_code", "generated_result", "reference_result"])
-        chain = prompt | self.llm_model | StrOutputParser()
-        return chain.invoke({
-            "analysis": analysis,
-            "generated_code": state["generated_code"],
-            "generated_result": json.dumps(state["execution_result"], indent=2),
-            "reference_result": json.dumps(state["reference_answer"], indent=2)
-        })
-    
     def syntax_check(self, code):
         try:
             ast.parse(code)
@@ -396,39 +326,4 @@ class GenerateCodeNode(BaseNode):
             return True, None
         except ValidationError as e:
             errors = e.errors()
-            return False, errors
-    
-    def extract_code(self, code: str) -> str:
-        pattern = r'```(?:python)?\n(.*?)```'
-        
-        match = re.search(pattern, code, re.DOTALL)
-        
-        return match.group(1) if match else code
-
-
-
-def normalize_dict(d: Dict[str, Any]) -> Dict[str, Any]:
-    normalized = {}
-    for key, value in d.items():
-        if isinstance(value, str):
-            normalized[key] = value.lower().strip()
-        elif isinstance(value, dict):
-            normalized[key] = normalize_dict(value)
-        elif isinstance(value, list):
-            normalized[key] = normalize_list(value)
-        else:
-            normalized[key] = value
-    return normalized
-
-def normalize_list(lst: List[Any]) -> List[Any]:
-    return [
-        normalize_dict(item) if isinstance(item, dict)
-        else normalize_list(item) if isinstance(item, list)
-        else item.lower().strip() if isinstance(item, str)
-        else item
-        for item in lst
-    ]
-
-def are_content_equal(generated_result: Dict[str, Any], reference_result: Dict[str, Any]) -> bool:
-    """Compare two dictionaries for semantic equality."""
-    return normalize_dict(generated_result) == normalize_dict(reference_result)
\ No newline at end of file
+            return False, errors
\ No newline at end of file
diff --git a/scrapegraphai/prompts/__init__.py b/scrapegraphai/prompts/__init__.py
index 479d6ece..f7be89c1 100644
--- a/scrapegraphai/prompts/__init__.py
+++ b/scrapegraphai/prompts/__init__.py
@@ -13,4 +13,9 @@ from .search_link_node_prompts import TEMPLATE_RELEVANT_LINKS
 from .search_node_with_context_prompts import TEMPLATE_SEARCH_WITH_CONTEXT_CHUNKS, TEMPLATE_SEARCH_WITH_CONTEXT_NO_CHUNKS
 from .prompt_refiner_node_prompts import TEMPLATE_REFINER, TEMPLATE_REFINER_WITH_CONTEXT
 from .html_analyzer_node_prompts import TEMPLATE_HTML_ANALYSIS, TEMPLATE_HTML_ANALYSIS_WITH_CONTEXT
-from .generate_code_node_prompts import TEMPLATE_INIT_CODE_GENERATION, TEMPLATE_SYNTAX_ANALYSIS, TEMPLATE_SYNTAX_CODE_GENERATION, TEMPLATE_EXECUTION_ANALYSIS, TEMPLATE_EXECUTION_CODE_GENERATION, TEMPLATE_VALIDATION_ANALYSIS, TEMPLATE_VALIDATION_CODE_GENERATION, TEMPLATE_SEMANTIC_COMPARISON, TEMPLATE_SEMANTIC_ANALYSIS, TEMPLATE_SEMANTIC_CODE_GENERATION
\ No newline at end of file
+from .generate_code_node_prompts import (TEMPLATE_INIT_CODE_GENERATION,
+                                         TEMPLATE_SYNTAX_ANALYSIS, TEMPLATE_SYNTAX_CODE_GENERATION,
+                                         TEMPLATE_EXECUTION_ANALYSIS, TEMPLATE_EXECUTION_CODE_GENERATION,
+                                         TEMPLATE_VALIDATION_ANALYSIS, TEMPLATE_VALIDATION_CODE_GENERATION,
+                                         TEMPLATE_SEMANTIC_COMPARISON, TEMPLATE_SEMANTIC_ANALYSIS,
+                                         TEMPLATE_SEMANTIC_CODE_GENERATION)
\ No newline at end of file
diff --git a/scrapegraphai/prompts/generate_code_node_templates.py b/scrapegraphai/prompts/generate_code_node_prompts.py
similarity index 100%
rename from scrapegraphai/prompts/generate_code_node_templates.py
rename to scrapegraphai/prompts/generate_code_node_prompts.py
diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py
index 84ab44b4..303150f0 100644
--- a/scrapegraphai/utils/__init__.py
+++ b/scrapegraphai/utils/__init__.py
@@ -19,3 +19,9 @@ from .tokenizer import num_tokens_calculus
 from .split_text_into_chunks import split_text_into_chunks
 from .llm_callback_manager import CustomLLMCallbackManager
 from .schema_trasform import transform_schema
+from .cleanup_code import extract_code
+from .dict_content_compare import are_content_equal
+from .code_error_analysis import (syntax_focused_analysis, execution_focused_analysis,
+                                  validation_focused_analysis, semantic_focused_analysis)
+from .code_error_correction import (syntax_focused_code_generation, execution_focused_code_generation,
+                                    validation_focused_code_generation, semantic_focused_code_generation)
\ No newline at end of file
diff --git a/scrapegraphai/utils/cleanup_code.py b/scrapegraphai/utils/cleanup_code.py
new file mode 100644
index 00000000..9bf91e62
--- /dev/null
+++ b/scrapegraphai/utils/cleanup_code.py
@@ -0,0 +1,11 @@
+"""
+This utility function extracts the code from a given string.
+"""
+import re
+
+def extract_code(code: str) -> str:
+    """Return the content of the first ``` or ```python fenced block in `code`, or `code` unchanged if none."""
+    fenced = re.search(r'```(?:python)?\n(.*?)```', code, re.DOTALL)
+    if fenced is None:
+        return code
+    return fenced.group(1)
\ No newline at end of file
diff --git a/scrapegraphai/utils/code_error_analysis.py b/scrapegraphai/utils/code_error_analysis.py
new file mode 100644
index 00000000..fba7e005
--- /dev/null
+++ b/scrapegraphai/utils/code_error_analysis.py
@@ -0,0 +1,48 @@
+"""
+This module contains the functions that are used to generate the prompts for the code error analysis.
+"""
+from typing import Any, Dict
+from langchain.prompts import PromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+import json
+from ..prompts import (
+    TEMPLATE_SYNTAX_ANALYSIS, TEMPLATE_EXECUTION_ANALYSIS,
+    TEMPLATE_VALIDATION_ANALYSIS, TEMPLATE_SEMANTIC_ANALYSIS
+)
+
+def syntax_focused_analysis(state: dict, llm_model) -> str:
+    """Run the syntax-analysis prompt on the generated code and its syntax errors; return the chain's string output."""
+    template = PromptTemplate(template=TEMPLATE_SYNTAX_ANALYSIS, input_variables=["generated_code", "errors"])
+    pipeline = template | llm_model | StrOutputParser()
+    # Only the "syntax" entry of the collected errors is forwarded to the model.
+    payload = {"generated_code": state["generated_code"], "errors": state["errors"]["syntax"]}
+    return pipeline.invoke(payload)
+
+def execution_focused_analysis(state: dict, llm_model) -> str:
+    """Run the execution-analysis prompt on the code, its execution errors and the HTML context; return the chain's string output."""
+    template = PromptTemplate(template=TEMPLATE_EXECUTION_ANALYSIS, input_variables=["generated_code", "errors", "html_code", "html_analysis"])
+    pipeline = template | llm_model | StrOutputParser()
+    payload = {
+        "generated_code": state["generated_code"], "errors": state["errors"]["execution"],
+        "html_code": state["html_code"], "html_analysis": state["html_analysis"]
+    }
+    return pipeline.invoke(payload)
+
+def validation_focused_analysis(state: dict, llm_model) -> str:
+    """Run the validation-analysis prompt on the code, its validation errors, the schema and the execution result."""
+    template = PromptTemplate(template=TEMPLATE_VALIDATION_ANALYSIS, input_variables=["generated_code", "errors", "json_schema", "execution_result"])
+    pipeline = template | llm_model | StrOutputParser()
+    payload = {
+        "generated_code": state["generated_code"], "errors": state["errors"]["validation"],
+        "json_schema": state["json_schema"], "execution_result": state["execution_result"]
+    }
+    return pipeline.invoke(payload)
+
+def semantic_focused_analysis(state: dict, comparison_result: Dict[str, Any], llm_model) -> str:
+    """Run the semantic-analysis prompt on the code and a comparison result; return the chain's string output."""
+    template = PromptTemplate(template=TEMPLATE_SEMANTIC_ANALYSIS, input_variables=["generated_code", "differences", "explanation"])
+    pipeline = template | llm_model | StrOutputParser()
+    return pipeline.invoke({
+        "generated_code": state["generated_code"],
+        "differences": json.dumps(comparison_result["differences"], indent=2),
+        "explanation": comparison_result["explanation"]})
\ No newline at end of file
diff --git a/scrapegraphai/utils/code_error_correction.py b/scrapegraphai/utils/code_error_correction.py
new file mode 100644
index 00000000..276c7a62
--- /dev/null
+++ b/scrapegraphai/utils/code_error_correction.py
@@ -0,0 +1,45 @@
+"""
+This module contains the code generation functions for code correction for different types of errors.
+"""
+from langchain.prompts import PromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+import json
+from ..prompts import (
+    TEMPLATE_SYNTAX_CODE_GENERATION, TEMPLATE_EXECUTION_CODE_GENERATION,
+    TEMPLATE_VALIDATION_CODE_GENERATION, TEMPLATE_SEMANTIC_CODE_GENERATION
+)
+
+def syntax_focused_code_generation(state: dict, analysis: str, llm_model) -> str:
+    """Run the syntax code-generation prompt on the analysis and the current code; return the chain's string output."""
+    template = PromptTemplate(template=TEMPLATE_SYNTAX_CODE_GENERATION, input_variables=["analysis", "generated_code"])
+    pipeline = template | llm_model | StrOutputParser()
+    # The analysis text is supplied by the caller; the code is read from the state.
+    payload = {"analysis": analysis, "generated_code": state["generated_code"]}
+    return pipeline.invoke(payload)
+
+def execution_focused_code_generation(state: dict, analysis: str, llm_model) -> str:
+    """Run the execution code-generation prompt on the analysis and the current code; return the chain's string output."""
+    template = PromptTemplate(template=TEMPLATE_EXECUTION_CODE_GENERATION, input_variables=["analysis", "generated_code"])
+    pipeline = template | llm_model | StrOutputParser()
+    # The analysis text is supplied by the caller; the code is read from the state.
+    payload = {"analysis": analysis, "generated_code": state["generated_code"]}
+    return pipeline.invoke(payload)
+
+def validation_focused_code_generation(state: dict, analysis: str, llm_model) -> str:
+    """Run the validation code-generation prompt on the analysis, the current code and the schema; return the chain's string output."""
+    template = PromptTemplate(template=TEMPLATE_VALIDATION_CODE_GENERATION, input_variables=["analysis", "generated_code", "json_schema"])
+    pipeline = template | llm_model | StrOutputParser()
+    payload = {
+        "analysis": analysis, "generated_code": state["generated_code"], "json_schema": state["json_schema"]
+    }
+    return pipeline.invoke(payload)
+    
+def semantic_focused_code_generation(state: dict, analysis: str, llm_model) -> str:
+    """Run the semantic code-generation prompt on the analysis, the code and both JSON-dumped results; return the chain's string output."""
+    template = PromptTemplate(template=TEMPLATE_SEMANTIC_CODE_GENERATION, input_variables=["analysis", "generated_code", "generated_result", "reference_result"])
+    pipeline = template | llm_model | StrOutputParser()
+    return pipeline.invoke({
+        "analysis": analysis, "generated_code": state["generated_code"],
+        "generated_result": json.dumps(state["execution_result"], indent=2),
+        "reference_result": json.dumps(state["reference_answer"], indent=2)
+    })
\ No newline at end of file
diff --git a/scrapegraphai/utils/dict_content_compare.py b/scrapegraphai/utils/dict_content_compare.py
new file mode 100644
index 00000000..ddebbbc3
--- /dev/null
+++ b/scrapegraphai/utils/dict_content_compare.py
@@ -0,0 +1,30 @@
+"""
+Utility functions for comparing the content of two dictionaries.
+"""
+from typing import Any, Dict, List
+
+def _normalize_value(value: Any) -> Any:
+    """Lower-case and strip strings, recurse into dicts and lists, pass anything else through."""
+    if isinstance(value, str):
+        return value.lower().strip()
+    if isinstance(value, dict):
+        return normalize_dict(value)
+    if isinstance(value, list):
+        return normalize_list(value)
+    return value
+
+def normalize_dict(d: Dict[str, Any]) -> Dict[str, Any]:
+    """Return a new dict with the same keys as `d` and every value normalized recursively."""
+    return {key: _normalize_value(value) for key, value in d.items()}
+
+def normalize_list(lst: List[Any]) -> List[Any]:
+    """Return a new list holding the normalized form of each item of `lst`, in order."""
+    return [_normalize_value(item) for item in lst]
+
+def are_content_equal(generated_result: Dict[str, Any], reference_result: Dict[str, Any]) -> bool:
+    """
+    Report whether two result dictionaries match once their string values
+    have been lower-cased and stripped of surrounding whitespace.
+    """
+    left, right = normalize_dict(generated_result), normalize_dict(reference_result)
+    return left == right
\ No newline at end of file

From bcf02e5f19ff5eb7e02fa940fcef8ac474555fa5 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra <mvincig11@gmail.com>
Date: Tue, 24 Sep 2024 21:42:45 +0200
Subject: [PATCH 26/27] add possibility to save the code

---
 .../code_generation/simple_with_schema.py     |  1 +
 extract_data.py                               | 27 ++++++++++++++
 scrapegraphai/graphs/code_generator_graph.py  | 37 ++++++++++++++-----
 3 files changed, 56 insertions(+), 9 deletions(-)
 create mode 100644 extract_data.py

diff --git a/examples/code_generation/simple_with_schema.py b/examples/code_generation/simple_with_schema.py
index 58e12e0e..544c1724 100644
--- a/examples/code_generation/simple_with_schema.py
+++ b/examples/code_generation/simple_with_schema.py
@@ -42,6 +42,7 @@ graph_config = {
         "validation": 3,
         "semantic": 3
     },
+    "output_file_name": "extracted_data.py"
 }
 
 # ************************************************
diff --git a/extract_data.py b/extract_data.py
new file mode 100644
index 00000000..df3babc2
--- /dev/null
+++ b/extract_data.py
@@ -0,0 +1,27 @@
+def extract_data(html: str) -> dict:
+    """
+    Collect the title and description of every project card in the markup.
+
+    Args:
+        html (str): HTML markup to parse.
+
+    Returns:
+        dict: {'projects': [...]} with one {'title', 'description'} entry
+        per <div class="grid-item"> element found.
+    """
+    from bs4 import BeautifulSoup
+
+    document = BeautifulSoup(html, 'html.parser')
+
+    def describe(card) -> dict:
+        # Both elements are looked up inside the card itself, not the whole
+        # document; a card missing either one raises AttributeError.
+        heading = card.find('h4', class_='card-title')
+        summary = card.find('p', class_='card-text')
+        return {
+            'title': heading.get_text(strip=True),
+            'description': summary.get_text(strip=True)
+        }
+
+    cards = document.find_all('div', class_='grid-item')
+    return {'projects': [describe(card) for card in cards]}
\ No newline at end of file
diff --git a/scrapegraphai/graphs/code_generator_graph.py b/scrapegraphai/graphs/code_generator_graph.py
index ca763896..6dcdf79e 100644
--- a/scrapegraphai/graphs/code_generator_graph.py
+++ b/scrapegraphai/graphs/code_generator_graph.py
@@ -17,17 +17,17 @@ from ..nodes import (
 
 class CodeGeneratorGraph(AbstractGraph):
     """
-    CodeGeneratorGraph is a script generator pipeline that generates the function extract_data(html: str) -> dict() for 
-    extarcting the wanted informations from a HTML page. The code generated is in Python and uses the library BeautifulSoup.
-    It requires a user prompt, a source URL, and a output schema.
-    
+    CodeGeneratorGraph is a script generator pipeline that generates the function extract_data(html: str) -> dict() for
+    extracting the wanted information from a HTML page. The code generated is in Python and uses the library BeautifulSoup.
+    It requires a user prompt, a source URL, and an output schema.
+
     Attributes:
         prompt (str): The prompt for the graph.
         source (str): The source of the graph.
         config (dict): Configuration parameters for the graph.
         schema (BaseModel): The schema for the graph output.
         llm_model: An instance of a language model client, configured for generating answers.
-        embedder_model: An instance of an embedding model client, 
+        embedder_model: An instance of an embedding model client,
         configured for generating embeddings.
         verbose (bool): A flag indicating whether to show print statements during execution.
         headless (bool): A flag indicating whether to run the graph in headless mode.
@@ -96,7 +96,6 @@ class CodeGeneratorGraph(AbstractGraph):
                 "schema": self.schema,
             }
         )
-        
         prompt_refier_node = PromptRefinerNode(
             input="user_prompt",
             output=["refined_prompt"],
@@ -106,7 +105,6 @@ class CodeGeneratorGraph(AbstractGraph):
                 "schema": self.schema
             }
         )
-        
         html_analyzer_node = HtmlAnalyzerNode(
             input="refined_prompt & original_html",
             output=["html_info", "reduced_html"],
@@ -117,7 +115,6 @@ class CodeGeneratorGraph(AbstractGraph):
                 "reduction": self.config.get("reduction", 0)
             }
         )
-        
         generate_code_node = GenerateCodeNode(
             input="user_prompt & refined_prompt & html_info & reduced_html & answer",
             output=["generated_code"],
@@ -166,4 +163,26 @@ class CodeGeneratorGraph(AbstractGraph):
         inputs = {"user_prompt": self.prompt, self.input_key: self.source}
         self.final_state, self.execution_info = self.graph.execute(inputs)
 
-        return self.final_state.get("generated_code", "No code created.")
+        generated_code = self.final_state.get("generated_code", "No code created.")
+
+        # Read the configured name once; appending to an unassigned local raised UnboundLocalError.
+        filename = self.config.get("filename")
+        if filename is None:
+            filename = "extracted_data.py"
+        elif ".py" not in filename:
+            filename += ".py"
+
+        self.save_code_to_file(generated_code, filename)
+
+        return generated_code
+
+    def save_code_to_file(self, code: str, filename: str) -> None:
+        """
+        Saves the generated code to a Python file, encoded as UTF-8.
+
+        Args:
+            code (str): The generated code to be saved.
+            filename (str): Path of the output file; overwritten if it exists.
+        """
+        with open(filename, "w", encoding="utf-8") as file:
+            file.write(code)

From fb879012d32ee1d3c2217aa2dafbca78f1b74738 Mon Sep 17 00:00:00 2001
From: Matteo Vedovati <matteo.vedovati.77@gmail.com>
Date: Wed, 25 Sep 2024 10:28:11 +0200
Subject: [PATCH 27/27] Add code generator examples

---
 .../code_generator_graph_anthropic.py         | 60 ++++++++++++++++
 examples/azure/code_generator_graph_azure.py  | 58 +++++++++++++++
 .../bedrock/code_generator_graph_bedrock.py   | 60 ++++++++++++++++
 .../deepseek/code_generator_graph_deepseek.py | 60 ++++++++++++++++
 examples/ernie/code_generator_graph_ernie.py  | 62 ++++++++++++++++
 .../code_generator_graph_fireworks.py         | 60 ++++++++++++++++
 .../code_generator_graph_gemini.py            | 60 ++++++++++++++++
 .../code_generator_graph_vertex.py            | 60 ++++++++++++++++
 examples/groq/code_generator_graph_groq.py    | 61 ++++++++++++++++
 .../code_generator_graph_huggingfacehub.py    | 71 +++++++++++++++++++
 .../code_generator_graph_ollama.py            | 61 ++++++++++++++++
 .../mistral/code_generator_graph_mistral.py   | 60 ++++++++++++++++
 .../moonshot/code_generator_graph_moonshot.py | 67 +++++++++++++++++
 .../nemotron/code_generator_graph_nemotron.py | 58 +++++++++++++++
 .../oneapi/code_generator_graph_oneapi.py     | 61 ++++++++++++++++
 .../code_generator_graph_openai.py}           |  2 +-
 16 files changed, 920 insertions(+), 1 deletion(-)
 create mode 100644 examples/anthropic/code_generator_graph_anthropic.py
 create mode 100644 examples/azure/code_generator_graph_azure.py
 create mode 100644 examples/bedrock/code_generator_graph_bedrock.py
 create mode 100644 examples/deepseek/code_generator_graph_deepseek.py
 create mode 100644 examples/ernie/code_generator_graph_ernie.py
 create mode 100644 examples/fireworks/code_generator_graph_fireworks.py
 create mode 100644 examples/google_genai/code_generator_graph_gemini.py
 create mode 100644 examples/google_vertexai/code_generator_graph_vertex.py
 create mode 100644 examples/groq/code_generator_graph_groq.py
 create mode 100644 examples/huggingfacehub/code_generator_graph_huggingfacehub.py
 create mode 100644 examples/local_models/code_generator_graph_ollama.py
 create mode 100644 examples/mistral/code_generator_graph_mistral.py
 create mode 100644 examples/moonshot/code_generator_graph_moonshot.py
 create mode 100644 examples/nemotron/code_generator_graph_nemotron.py
 create mode 100644 examples/oneapi/code_generator_graph_oneapi.py
 rename examples/{code_generation/simple_with_schema.py => openai/code_generator_graph_openai.py} (97%)

diff --git a/examples/anthropic/code_generator_graph_anthropic.py b/examples/anthropic/code_generator_graph_anthropic.py
new file mode 100644
index 00000000..49bd413d
--- /dev/null
+++ b/examples/anthropic/code_generator_graph_anthropic.py
@@ -0,0 +1,60 @@
+""" 
+Basic example of scraping pipeline using Code Generator with schema
+"""
+
+import os, json
+from typing import List
+from dotenv import load_dotenv
+from langchain_core.pydantic_v1 import BaseModel, Field
+from scrapegraphai.graphs import CodeGeneratorGraph
+
+load_dotenv()
+
+# ************************************************
+# Define the output schema for the graph
+# ************************************************
+
+class Project(BaseModel):
+    title: str = Field(description="The title of the project")
+    description: str = Field(description="The description of the project")
+
+class Projects(BaseModel):
+    projects: List[Project]
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+anthropic_key = os.getenv("ANTHROPIC_API_KEY")
+
+graph_config = {
+    "llm": {
+        "api_key":anthropic_key,
+        "model": "anthropic/claude-3-haiku-20240307",
+    },
+    "verbose": True,
+    "headless": False,
+    "reduction": 2,
+    "max_iterations": {
+        "overall": 10,
+        "syntax": 3,
+        "execution": 3,
+        "validation": 3,
+        "semantic": 3
+    },
+    "output_file_name": "extracted_data.py"
+}
+
+# ************************************************
+# Create the CodeGeneratorGraph instance and run it
+# ************************************************
+
+code_generator_graph = CodeGeneratorGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io/projects/",
+    schema=Projects,
+    config=graph_config
+)
+
+result = code_generator_graph.run()
+print(result)
diff --git a/examples/azure/code_generator_graph_azure.py b/examples/azure/code_generator_graph_azure.py
new file mode 100644
index 00000000..79be4534
--- /dev/null
+++ b/examples/azure/code_generator_graph_azure.py
@@ -0,0 +1,58 @@
+""" 
+Basic example of scraping pipeline using Code Generator with schema
+"""
+
+import os, json
+from typing import List
+from dotenv import load_dotenv
+from langchain_core.pydantic_v1 import BaseModel, Field
+from scrapegraphai.graphs import CodeGeneratorGraph
+
+load_dotenv()
+
+# ************************************************
+# Define the output schema for the graph
+# ************************************************
+
+class Project(BaseModel):
+    title: str = Field(description="The title of the project")
+    description: str = Field(description="The description of the project")
+
+class Projects(BaseModel):
+    projects: List[Project]
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "api_key": os.environ["AZURE_OPENAI_KEY"],
+        "model": "azure_openai/gpt-3.5-turbo",
+    },
+    "verbose": True,
+    "headless": False,
+    "reduction": 2,
+    "max_iterations": {
+        "overall": 10,
+        "syntax": 3,
+        "execution": 3,
+        "validation": 3,
+        "semantic": 3
+    },
+    "output_file_name": "extracted_data.py"
+}
+
+# ************************************************
+# Create the CodeGeneratorGraph instance and run it
+# ************************************************
+
+code_generator_graph = CodeGeneratorGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io/projects/",
+    schema=Projects,
+    config=graph_config
+)
+
+result = code_generator_graph.run()
+print(result)
\ No newline at end of file
diff --git a/examples/bedrock/code_generator_graph_bedrock.py b/examples/bedrock/code_generator_graph_bedrock.py
new file mode 100644
index 00000000..2998873b
--- /dev/null
+++ b/examples/bedrock/code_generator_graph_bedrock.py
@@ -0,0 +1,60 @@
+""" 
+Basic example of scraping pipeline using Code Generator with schema
+"""
+
+import os, json
+from typing import List
+from dotenv import load_dotenv
+from langchain_core.pydantic_v1 import BaseModel, Field
+from scrapegraphai.graphs import CodeGeneratorGraph
+
+load_dotenv()
+
+# ************************************************
+# Define the output schema for the graph
+# ************************************************
+
+class Project(BaseModel):
+    title: str = Field(description="The title of the project")
+    description: str = Field(description="The description of the project")
+
+class Projects(BaseModel):
+    projects: List[Project]
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+
+graph_config = {
+    "llm": {
+        "client": "client_name",
+        "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
+        "temperature": 0.0
+    },
+    "verbose": True,
+    "headless": False,
+    "reduction": 2,
+    "max_iterations": {
+        "overall": 10,
+        "syntax": 3,
+        "execution": 3,
+        "validation": 3,
+        "semantic": 3
+    },
+    "output_file_name": "extracted_data.py"
+}
+
+# ************************************************
+# Create the CodeGeneratorGraph instance and run it
+# ************************************************
+
+code_generator_graph = CodeGeneratorGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io/projects/",
+    schema=Projects,
+    config=graph_config
+)
+
+result = code_generator_graph.run()
+print(result)
\ No newline at end of file
diff --git a/examples/deepseek/code_generator_graph_deepseek.py b/examples/deepseek/code_generator_graph_deepseek.py
new file mode 100644
index 00000000..17b1a970
--- /dev/null
+++ b/examples/deepseek/code_generator_graph_deepseek.py
@@ -0,0 +1,60 @@
+""" 
+Basic example of scraping pipeline using Code Generator with schema
+"""
+
+import os, json
+from typing import List
+from dotenv import load_dotenv
+from langchain_core.pydantic_v1 import BaseModel, Field
+from scrapegraphai.graphs import CodeGeneratorGraph
+
+load_dotenv()
+
+# ************************************************
+# Define the output schema for the graph
+# ************************************************
+
+class Project(BaseModel):
+    title: str = Field(description="The title of the project")
+    description: str = Field(description="The description of the project")
+
+class Projects(BaseModel):
+    projects: List[Project]
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+deepseek_key = os.getenv("DEEPSEEK_APIKEY")
+
+graph_config = {
+    "llm": {
+        "model": "deepseek/deepseek-chat",
+        "api_key": deepseek_key,
+    },
+    "verbose": True,
+    "headless": False,
+    "reduction": 2,
+    "max_iterations": {
+        "overall": 10,
+        "syntax": 3,
+        "execution": 3,
+        "validation": 3,
+        "semantic": 3
+    },
+    "output_file_name": "extracted_data.py"
+}
+
+# ************************************************
+# Create the CodeGeneratorGraph instance and run it
+# ************************************************
+
+code_generator_graph = CodeGeneratorGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io/projects/",
+    schema=Projects,
+    config=graph_config
+)
+
+result = code_generator_graph.run()
+print(result)
\ No newline at end of file
diff --git a/examples/ernie/code_generator_graph_ernie.py b/examples/ernie/code_generator_graph_ernie.py
new file mode 100644
index 00000000..1545238b
--- /dev/null
+++ b/examples/ernie/code_generator_graph_ernie.py
@@ -0,0 +1,62 @@
+""" 
+Basic example of scraping pipeline using Code Generator with schema
+"""
+
+import os, json
+from typing import List
+from dotenv import load_dotenv
+from langchain_core.pydantic_v1 import BaseModel, Field
+from scrapegraphai.graphs import CodeGeneratorGraph
+
+load_dotenv()
+
+# ************************************************
+# Define the output schema for the graph
+# ************************************************
+
+class Project(BaseModel):
+    title: str = Field(description="The title of the project")
+    description: str = Field(description="The description of the project")
+
+class Projects(BaseModel):
+    projects: List[Project]
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+openai_key = os.getenv("OPENAI_APIKEY")
+
+graph_config = {
+    "llm": {
+        "model": "ernie/ernie-bot-turbo",
+        "ernie_client_id": "<ernie_client_id>",
+        "ernie_client_secret": "<ernie_client_secret>",
+        "temperature": 0.1
+    },
+    "verbose": True,
+    "headless": False,
+    "reduction": 2,
+    "max_iterations": {
+        "overall": 10,
+        "syntax": 3,
+        "execution": 3,
+        "validation": 3,
+        "semantic": 3
+    },
+    "output_file_name": "extracted_data.py"
+}
+
+# ************************************************
+# Create the CodeGeneratorGraph instance and run it
+# ************************************************
+
+code_generator_graph = CodeGeneratorGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io/projects/",
+    schema=Projects,
+    config=graph_config
+)
+
+result = code_generator_graph.run()
+print(result)
\ No newline at end of file
diff --git a/examples/fireworks/code_generator_graph_fireworks.py b/examples/fireworks/code_generator_graph_fireworks.py
new file mode 100644
index 00000000..9bbec7f2
--- /dev/null
+++ b/examples/fireworks/code_generator_graph_fireworks.py
@@ -0,0 +1,60 @@
+""" 
+Basic example of scraping pipeline using Code Generator with schema
+"""
+
+import os, json
+from typing import List
+from dotenv import load_dotenv
+from langchain_core.pydantic_v1 import BaseModel, Field
+from scrapegraphai.graphs import CodeGeneratorGraph
+
+load_dotenv()
+
+# ************************************************
+# Define the output schema for the graph
+# ************************************************
+
+class Project(BaseModel):
+    title: str = Field(description="The title of the project")
+    description: str = Field(description="The description of the project")
+
+class Projects(BaseModel):
+    projects: List[Project]
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+fireworks_api_key = os.getenv("FIREWORKS_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": fireworks_api_key,
+        "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
+    },
+    "verbose": True,
+    "headless": False,
+    "reduction": 2,
+    "max_iterations": {
+        "overall": 10,
+        "syntax": 3,
+        "execution": 3,
+        "validation": 3,
+        "semantic": 3
+    },
+    "output_file_name": "extracted_data.py"
+}
+
+# ************************************************
+# Create the CodeGeneratorGraph instance and run it
+# ************************************************
+
+code_generator_graph = CodeGeneratorGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io/projects/",
+    schema=Projects,
+    config=graph_config
+)
+
+result = code_generator_graph.run()
+print(result)
\ No newline at end of file
diff --git a/examples/google_genai/code_generator_graph_gemini.py b/examples/google_genai/code_generator_graph_gemini.py
new file mode 100644
index 00000000..4d16fdff
--- /dev/null
+++ b/examples/google_genai/code_generator_graph_gemini.py
@@ -0,0 +1,60 @@
+""" 
+Basic example of scraping pipeline using Code Generator with schema
+"""
+
+import os, json
+from typing import List
+from dotenv import load_dotenv
+from langchain_core.pydantic_v1 import BaseModel, Field
+from scrapegraphai.graphs import CodeGeneratorGraph
+
+load_dotenv()
+
+# ************************************************
+# Define the output schema for the graph
+# ************************************************
+
+class Project(BaseModel):
+    title: str = Field(description="The title of the project")
+    description: str = Field(description="The description of the project")
+
+class Projects(BaseModel):
+    projects: List[Project]
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+gemini_key = os.getenv("GOOGLE_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": gemini_key,
+        "model": "google_genai/gemini-pro",
+    },
+    "verbose": True,
+    "headless": False,
+    "reduction": 2,
+    "max_iterations": {
+        "overall": 10,
+        "syntax": 3,
+        "execution": 3,
+        "validation": 3,
+        "semantic": 3
+    },
+    "output_file_name": "extracted_data.py"
+}
+
+# ************************************************
+# Create the CodeGeneratorGraph instance and run it
+# ************************************************
+
+code_generator_graph = CodeGeneratorGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io/projects/",
+    schema=Projects,
+    config=graph_config
+)
+
+result = code_generator_graph.run()
+print(result)
\ No newline at end of file
diff --git a/examples/google_vertexai/code_generator_graph_vertex.py b/examples/google_vertexai/code_generator_graph_vertex.py
new file mode 100644
index 00000000..0d1399ea
--- /dev/null
+++ b/examples/google_vertexai/code_generator_graph_vertex.py
@@ -0,0 +1,60 @@
+""" 
+Basic example of a scraping pipeline using CodeGeneratorGraph with a schema
+"""
+
+import os, json
+from typing import List
+from dotenv import load_dotenv
+from langchain_core.pydantic_v1 import BaseModel, Field
+from scrapegraphai.graphs import CodeGeneratorGraph
+
+load_dotenv()
+
+# ************************************************
+# Define the output schema for the graph
+# ************************************************
+
+class Project(BaseModel):
+    title: str = Field(description="The title of the project")
+    description: str = Field(description="The description of the project")
+
+class Projects(BaseModel):
+    projects: List[Project]
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+gemini_key = os.getenv("GOOGLE_APIKEY")
+
+graph_config = {
+     "llm": {
+        "api_key": gemini_key,
+        "model": "google_vertexai/gemini-1.5-pro",
+    },
+    "verbose": True,
+    "headless": False,
+    "reduction": 2,
+    "max_iterations": {
+        "overall": 10,
+        "syntax": 3,
+        "execution": 3,
+        "validation": 3,
+        "semantic": 3
+    },
+    "output_file_name": "extracted_data.py"
+}
+
+# ************************************************
+# Create the CodeGeneratorGraph instance and run it
+# ************************************************
+
+code_generator_graph = CodeGeneratorGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io/projects/",
+    schema=Projects,
+    config=graph_config
+)
+
+result = code_generator_graph.run()
+print(result)
\ No newline at end of file
diff --git a/examples/groq/code_generator_graph_groq.py b/examples/groq/code_generator_graph_groq.py
new file mode 100644
index 00000000..1f7d6b37
--- /dev/null
+++ b/examples/groq/code_generator_graph_groq.py
@@ -0,0 +1,61 @@
+""" 
+Basic example of a scraping pipeline using CodeGeneratorGraph with a schema
+"""
+
+import os, json
+from typing import List
+from dotenv import load_dotenv
+from langchain_core.pydantic_v1 import BaseModel, Field
+from scrapegraphai.graphs import CodeGeneratorGraph
+
+load_dotenv()
+
+# ************************************************
+# Define the output schema for the graph
+# ************************************************
+
+class Project(BaseModel):
+    title: str = Field(description="The title of the project")
+    description: str = Field(description="The description of the project")
+
+class Projects(BaseModel):
+    projects: List[Project]
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+groq_key = os.getenv("GROQ_APIKEY")
+
+graph_config = {
+    "llm": {
+        "model": "groq/gemma-7b-it",
+        "api_key": groq_key,
+        "temperature": 0
+    },
+    "verbose": True,
+    "headless": False,
+    "reduction": 2,
+    "max_iterations": {
+        "overall": 10,
+        "syntax": 3,
+        "execution": 3,
+        "validation": 3,
+        "semantic": 3
+    },
+    "output_file_name": "extracted_data.py"
+}
+
+# ************************************************
+# Create the CodeGeneratorGraph instance and run it
+# ************************************************
+
+code_generator_graph = CodeGeneratorGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io/projects/",
+    schema=Projects,
+    config=graph_config
+)
+
+result = code_generator_graph.run()
+print(result)
\ No newline at end of file
diff --git a/examples/huggingfacehub/code_generator_graph_huggingfacehub.py b/examples/huggingfacehub/code_generator_graph_huggingfacehub.py
new file mode 100644
index 00000000..085df3eb
--- /dev/null
+++ b/examples/huggingfacehub/code_generator_graph_huggingfacehub.py
@@ -0,0 +1,71 @@
+""" 
+Basic example of a scraping pipeline using CodeGeneratorGraph with a schema
+"""
+
+import os, json
+from typing import List
+from dotenv import load_dotenv
+from langchain_core.pydantic_v1 import BaseModel, Field
+from scrapegraphai.graphs import CodeGeneratorGraph
+from langchain_community.llms import HuggingFaceEndpoint
+from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
+
+load_dotenv()
+
+# ************************************************
+# Define the output schema for the graph
+# ************************************************
+
+class Project(BaseModel):
+    title: str = Field(description="The title of the project")
+    description: str = Field(description="The description of the project")
+
+class Projects(BaseModel):
+    projects: List[Project]
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+
+repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
+
+llm_model_instance = HuggingFaceEndpoint(
+    repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN
+)
+
+embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
+    api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2"
+)
+
+graph_config = {
+    "llm": {
+        "model_instance": llm_model_instance
+    },
+    "verbose": True,
+    "headless": False,
+    "reduction": 2,
+    "max_iterations": {
+        "overall": 10,
+        "syntax": 3,
+        "execution": 3,
+        "validation": 3,
+        "semantic": 3
+    },
+    "output_file_name": "extracted_data.py"
+}
+
+# ************************************************
+# Create the CodeGeneratorGraph instance and run it
+# ************************************************
+
+code_generator_graph = CodeGeneratorGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io/projects/",
+    schema=Projects,
+    config=graph_config
+)
+
+result = code_generator_graph.run()
+print(result)
\ No newline at end of file
diff --git a/examples/local_models/code_generator_graph_ollama.py b/examples/local_models/code_generator_graph_ollama.py
new file mode 100644
index 00000000..9246e952
--- /dev/null
+++ b/examples/local_models/code_generator_graph_ollama.py
@@ -0,0 +1,61 @@
+""" 
+Basic example of a scraping pipeline using CodeGeneratorGraph with a schema
+"""
+
+import json
+from typing import List
+from dotenv import load_dotenv
+from langchain_core.pydantic_v1 import BaseModel, Field
+from scrapegraphai.graphs import CodeGeneratorGraph
+
+load_dotenv()
+
+# ************************************************
+# Define the output schema for the graph
+# ************************************************
+
+class Project(BaseModel):
+    title: str = Field(description="The title of the project")
+    description: str = Field(description="The description of the project")
+
+class Projects(BaseModel):
+    projects: List[Project]
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+
+graph_config = {
+    "llm": {
+        "model": "ollama/llama3",
+        "temperature": 0,
+        "format": "json",
+        "base_url": "http://localhost:11434",
+    },
+    "verbose": True,
+    "headless": False,
+    "reduction": 2,
+    "max_iterations": {
+        "overall": 10,
+        "syntax": 3,
+        "execution": 3,
+        "validation": 3,
+        "semantic": 3
+    },
+    "output_file_name": "extracted_data.py"
+}
+
+# ************************************************
+# Create the CodeGeneratorGraph instance and run it
+# ************************************************
+
+code_generator_graph = CodeGeneratorGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io/projects/",
+    schema=Projects,
+    config=graph_config
+)
+
+result = code_generator_graph.run()
+print(result)
\ No newline at end of file
diff --git a/examples/mistral/code_generator_graph_mistral.py b/examples/mistral/code_generator_graph_mistral.py
new file mode 100644
index 00000000..4abdf1f5
--- /dev/null
+++ b/examples/mistral/code_generator_graph_mistral.py
@@ -0,0 +1,60 @@
+""" 
+Basic example of a scraping pipeline using CodeGeneratorGraph with a schema
+"""
+
+import os, json
+from typing import List
+from dotenv import load_dotenv
+from langchain_core.pydantic_v1 import BaseModel, Field
+from scrapegraphai.graphs import CodeGeneratorGraph
+
+load_dotenv()
+
+# ************************************************
+# Define the output schema for the graph
+# ************************************************
+
+class Project(BaseModel):
+    title: str = Field(description="The title of the project")
+    description: str = Field(description="The description of the project")
+
+class Projects(BaseModel):
+    projects: List[Project]
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+mistral_key = os.getenv("MISTRAL_API_KEY")
+
+graph_config = {
+    "llm": {
+        "api_key": mistral_key,
+        "model": "mistralai/open-mistral-nemo",
+    },
+    "verbose": True,
+    "headless": False,
+    "reduction": 2,
+    "max_iterations": {
+        "overall": 10,
+        "syntax": 3,
+        "execution": 3,
+        "validation": 3,
+        "semantic": 3
+    },
+    "output_file_name": "extracted_data.py"
+}
+
+# ************************************************
+# Create the CodeGeneratorGraph instance and run it
+# ************************************************
+
+code_generator_graph = CodeGeneratorGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io/projects/",
+    schema=Projects,
+    config=graph_config
+)
+
+result = code_generator_graph.run()
+print(result)
\ No newline at end of file
diff --git a/examples/moonshot/code_generator_graph_moonshot.py b/examples/moonshot/code_generator_graph_moonshot.py
new file mode 100644
index 00000000..11f9fb47
--- /dev/null
+++ b/examples/moonshot/code_generator_graph_moonshot.py
@@ -0,0 +1,67 @@
+""" 
+Basic example of a scraping pipeline using CodeGeneratorGraph with a schema
+"""
+
+import os, json
+from typing import List
+from dotenv import load_dotenv
+from langchain_core.pydantic_v1 import BaseModel, Field
+from langchain_community.chat_models.moonshot import MoonshotChat
+from scrapegraphai.graphs import CodeGeneratorGraph
+
+load_dotenv()
+
+# ************************************************
+# Define the output schema for the graph
+# ************************************************
+
+class Project(BaseModel):
+    title: str = Field(description="The title of the project")
+    description: str = Field(description="The description of the project")
+
+class Projects(BaseModel):
+    projects: List[Project]
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+llm_instance_config = {
+    "model": "moonshot-v1-8k",
+    "base_url": "https://api.moonshot.cn/v1",
+    "moonshot_api_key": os.getenv("MOONLIGHT_API_KEY"),
+}
+
+llm_model_instance = MoonshotChat(**llm_instance_config)
+
+graph_config = {
+    "llm": {
+        "model_instance": llm_model_instance, 
+        "model_tokens": 10000
+    },
+    "verbose": True,
+    "headless": False,
+    "reduction": 2,
+    "max_iterations": {
+        "overall": 10,
+        "syntax": 3,
+        "execution": 3,
+        "validation": 3,
+        "semantic": 3
+    },
+    "output_file_name": "extracted_data.py"
+}
+
+# ************************************************
+# Create the CodeGeneratorGraph instance and run it
+# ************************************************
+
+code_generator_graph = CodeGeneratorGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io/projects/",
+    schema=Projects,
+    config=graph_config
+)
+
+result = code_generator_graph.run()
+print(result)
\ No newline at end of file
diff --git a/examples/nemotron/code_generator_graph_nemotron.py b/examples/nemotron/code_generator_graph_nemotron.py
new file mode 100644
index 00000000..1f0ea276
--- /dev/null
+++ b/examples/nemotron/code_generator_graph_nemotron.py
@@ -0,0 +1,58 @@
+""" 
+Basic example of a scraping pipeline using CodeGeneratorGraph with a schema
+"""
+
+import os, json
+from typing import List
+from dotenv import load_dotenv
+from langchain_core.pydantic_v1 import BaseModel, Field
+from scrapegraphai.graphs import CodeGeneratorGraph
+
+load_dotenv()
+
+# ************************************************
+# Define the output schema for the graph
+# ************************************************
+
+class Project(BaseModel):
+    title: str = Field(description="The title of the project")
+    description: str = Field(description="The description of the project")
+
+class Projects(BaseModel):
+    projects: List[Project]
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "api_key": os.getenv("NEMOTRON_APIKEY"),
+        "model": "nvidia/meta/llama3-70b-instruct",
+    },
+    "verbose": True,
+    "headless": False,
+    "reduction": 2,
+    "max_iterations": {
+        "overall": 10,
+        "syntax": 3,
+        "execution": 3,
+        "validation": 3,
+        "semantic": 3
+    },
+    "output_file_name": "extracted_data.py"
+}
+
+# ************************************************
+# Create the CodeGeneratorGraph instance and run it
+# ************************************************
+
+code_generator_graph = CodeGeneratorGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io/projects/",
+    schema=Projects,
+    config=graph_config
+)
+
+result = code_generator_graph.run()
+print(result)
\ No newline at end of file
diff --git a/examples/oneapi/code_generator_graph_oneapi.py b/examples/oneapi/code_generator_graph_oneapi.py
new file mode 100644
index 00000000..0bbb3ba2
--- /dev/null
+++ b/examples/oneapi/code_generator_graph_oneapi.py
@@ -0,0 +1,61 @@
+""" 
+Basic example of a scraping pipeline using CodeGeneratorGraph with a schema
+"""
+
+import os, json
+from typing import List
+from dotenv import load_dotenv
+from langchain_core.pydantic_v1 import BaseModel, Field
+from scrapegraphai.graphs import CodeGeneratorGraph
+
+load_dotenv()
+
+# ************************************************
+# Define the output schema for the graph
+# ************************************************
+
+class Project(BaseModel):
+    title: str = Field(description="The title of the project")
+    description: str = Field(description="The description of the project")
+
+class Projects(BaseModel):
+    projects: List[Project]
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+openai_key = os.getenv("OPENAI_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": "***************************",
+        "model": "oneapi/qwen-turbo",
+        "base_url": "http://127.0.0.1:3000/v1",  # Set the OneAPI URL
+    },
+    "verbose": True,
+    "headless": False,
+    "reduction": 2,
+    "max_iterations": {
+        "overall": 10,
+        "syntax": 3,
+        "execution": 3,
+        "validation": 3,
+        "semantic": 3
+    },
+    "output_file_name": "extracted_data.py"
+}
+
+# ************************************************
+# Create the CodeGeneratorGraph instance and run it
+# ************************************************
+
+code_generator_graph = CodeGeneratorGraph(
+    prompt="List me all the projects with their description",
+    source="https://perinim.github.io/projects/",
+    schema=Projects,
+    config=graph_config
+)
+
+result = code_generator_graph.run()
+print(result)
\ No newline at end of file
diff --git a/examples/code_generation/simple_with_schema.py b/examples/openai/code_generator_graph_openai.py
similarity index 97%
rename from examples/code_generation/simple_with_schema.py
rename to examples/openai/code_generator_graph_openai.py
index 544c1724..21a4a02f 100644
--- a/examples/code_generation/simple_with_schema.py
+++ b/examples/openai/code_generator_graph_openai.py
@@ -30,7 +30,7 @@ openai_key = os.getenv("OPENAI_APIKEY")
 graph_config = {
     "llm": {
         "api_key":openai_key,
-        "model": "openai/gpt-4o-mini",\
+        "model": "openai/gpt-4o-mini",
     },
     "verbose": True,
     "headless": False,