From 2ff0f0113ff0b429a0bb56d8b057e7660e148a1e Mon Sep 17 00:00:00 2001 From: Matteo Vedovati Date: Sat, 21 Sep 2024 18:46:16 +0200 Subject: [PATCH] Added logs --- scrapegraphai/graphs/code_generator_graph.py | 6 ++- scrapegraphai/nodes/generate_code_node.py | 26 +++++++++-- scrapegraphai/nodes/html_analyzer_node.py | 6 +-- scrapegraphai/nodes/prompt_refiner_node.py | 47 +++++++++----------- 4 files changed, 50 insertions(+), 35 deletions(-) diff --git a/scrapegraphai/graphs/code_generator_graph.py b/scrapegraphai/graphs/code_generator_graph.py index 6eaa05af..5da82794 100644 --- a/scrapegraphai/graphs/code_generator_graph.py +++ b/scrapegraphai/graphs/code_generator_graph.py @@ -17,8 +17,10 @@ from ..nodes import ( class CodeGeneratorGraph(AbstractGraph): """ - ... - + CodeGeneratorGraph is a script generator pipeline that generates the function extract_data(html: str) -> dict() for + extarcting the wanted informations from a HTML page. The code generated is in Python and uses the library BeautifulSoup. + It requires a user prompt, a source URL, and a output schema. + Attributes: prompt (str): The prompt for the graph. source (str): The source of the graph. diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py index 1fef3c5c..891f1a27 100644 --- a/scrapegraphai/nodes/generate_code_node.py +++ b/scrapegraphai/nodes/generate_code_node.py @@ -26,7 +26,7 @@ import string class GenerateCodeNode(BaseNode): """ - ... + A node that generates Python code for a function that extracts data from HTML based on a output schema. Attributes: llm_model: An instance of a language model client, configured for generating answers. @@ -80,7 +80,7 @@ class GenerateCodeNode(BaseNode): def execute(self, state: dict) -> dict: """ - ... + Generates Python code for a function that extracts data from HTML based on a output schema. Args: state (dict): The current state of the graph. The input keys will be used @@ -92,6 +92,7 @@ class GenerateCodeNode(BaseNode): Raises: KeyError: If the input keys are not found in the state, indicating that the necessary information for generating an answer is missing. + RuntimeError: If the maximum number of iterations is reached without obtaining the desired code. """ self.logger.info(f"--- Executing {self.node_name} Node ---") @@ -135,25 +136,31 @@ class GenerateCodeNode(BaseNode): return state def overall_reasoning_loop(self, state: dict) -> dict: - + self.logger.info(f"--- (Generating Code) ---") state["generated_code"] = self.generate_initial_code(state) state["generated_code"] = self.extract_code(state["generated_code"]) while state["iteration"] < self.max_iterations["overall"]: state["iteration"] += 1 + if self.verbose: + self.logger.info(f"--- Iteration {state['iteration']} ---") + self.logger.info(f"--- (Checking Code Syntax) ---") state = self.syntax_reasoning_loop(state) if state["errors"]["syntax"]: continue + self.logger.info(f"--- (Executing the Generated Code) ---") state = self.execution_reasoning_loop(state) if state["errors"]["execution"]: continue + self.logger.info(f"--- (Validate the Code Output Schema) ---") state = self.validation_reasoning_loop(state) if state["errors"]["validation"]: continue + self.logger.info(f"--- (Checking if the informations exctrcated are the ones Requested) ---") state = self.semantic_comparison_loop(state) if state["errors"]["semantic"]: continue @@ -161,6 +168,11 @@ class GenerateCodeNode(BaseNode): # If we've made it here, the code is valid and produces the correct output break + if state["iteration"] == self.max_iterations["overall"] and (state["errors"]["syntax"] or state["errors"]["execution"] or state["errors"]["validation"] or state["errors"]["semantic"]): + raise RuntimeError("Max iterations reached without obtaining the desired code.") + + self.logger.info(f"--- (Code Generated Correctly) ---") + return state def syntax_reasoning_loop(self, state: dict) -> dict: @@ -171,7 +183,9 @@ class GenerateCodeNode(BaseNode): return state state["errors"]["syntax"] = [syntax_message] + self.logger.info(f"--- (Synax Error Found: {syntax_message}) ---") analysis = self.syntax_focused_analysis(state) + self.logger.info(f"--- (Regenerating Code to fix the Error) ---") state["generated_code"] = self.syntax_focused_code_generation(state, analysis) state["generated_code"] = self.extract_code(state["generated_code"]) return state @@ -185,7 +199,9 @@ class GenerateCodeNode(BaseNode): return state state["errors"]["execution"] = [execution_result] + self.logger.info(f"--- (Code Execution Error: {execution_result}) ---") analysis = self.execution_focused_analysis(state) + self.logger.info(f"--- (Regenerating Code to fix the Error) ---") state["generated_code"] = self.execution_focused_code_generation(state, analysis) state["generated_code"] = self.extract_code(state["generated_code"]) return state @@ -198,7 +214,9 @@ class GenerateCodeNode(BaseNode): return state state["errors"]["validation"] = errors + self.logger.info(f"--- (Code Output not compliant to the deisred Output Schema) ---") analysis = self.validation_focused_analysis(state) + self.logger.info(f"--- (Regenerating Code to make the Output compliant to the deisred Output Schema) ---") state["generated_code"] = self.validation_focused_code_generation(state, analysis) state["generated_code"] = self.extract_code(state["generated_code"]) return state @@ -211,7 +229,9 @@ class GenerateCodeNode(BaseNode): return state state["errors"]["semantic"] = comparison_result["differences"] + self.logger.info(f"--- (The informations exctrcated are not the all ones requested) ---") analysis = self.semantic_focused_analysis(state, comparison_result) + self.logger.info(f"--- (Regenerating Code to obtain all the infromation requested) ---") state["generated_code"] = self.semantic_focused_code_generation(state, analysis) state["generated_code"] = self.extract_code(state["generated_code"]) return state diff --git a/scrapegraphai/nodes/html_analyzer_node.py b/scrapegraphai/nodes/html_analyzer_node.py index cc8b4106..46da8e95 100644 --- a/scrapegraphai/nodes/html_analyzer_node.py +++ b/scrapegraphai/nodes/html_analyzer_node.py @@ -16,8 +16,8 @@ from ..utils import reduce_html class HtmlAnalyzerNode(BaseNode): """ - ... - + A node that generates an analysis of the provided HTML code based on the wanted infromations to be extracted. + Attributes: llm_model: An instance of a language model client, configured for generating answers. verbose (bool): A flag indicating whether to show print statements during execution. @@ -60,7 +60,7 @@ class HtmlAnalyzerNode(BaseNode): def execute(self, state: dict) -> dict: """ - ... + Generates an analysis of the provided HTML code based on the wanted infromations to be extracted. Args: state (dict): The current state of the graph. The input keys will be used diff --git a/scrapegraphai/nodes/prompt_refiner_node.py b/scrapegraphai/nodes/prompt_refiner_node.py index 5aa93ba0..88fd9dad 100644 --- a/scrapegraphai/nodes/prompt_refiner_node.py +++ b/scrapegraphai/nodes/prompt_refiner_node.py @@ -59,6 +59,8 @@ class PromptRefinerNode(BaseNode): ) self.additional_info = node_config.get("additional_info") + + self.output_schema = node_config.get("schema") # get JSON output schema def execute(self, state: dict) -> dict: """ @@ -137,33 +139,24 @@ class PromptRefinerNode(BaseNode): user_prompt = state['user_prompt'] # get user prompt - if self.node_config.get("schema", None) is not None: + self.simplefied_schema = transform_schema(self.output_schema.schema()) # get JSON schema + + if self.additional_info is not None: # use additional context if present + prompt = PromptTemplate( + template=template_prompt_builder_with_context, + partial_variables={"user_input": user_prompt, + "json_schema": str(self.simplefied_schema), + "additional_context": self.additional_info}) + else: + prompt = PromptTemplate( + template=template_prompt_builder, + partial_variables={"user_input": user_prompt, + "json_schema": str(self.simplefied_schema)}) - self.simplefied_schema = transform_schema(self.node_config["schema"].schema()) # get JSON schema - - if self.additional_info is not None: # use additional context if present - prompt = PromptTemplate( - template=template_prompt_builder_with_context, - partial_variables={"user_input": user_prompt, - "json_schema": str(self.simplefied_schema), - "additional_context": self.additional_info}) - else: - prompt = PromptTemplate( - template=template_prompt_builder, - partial_variables={"user_input": user_prompt, - "json_schema": str(self.simplefied_schema)}) + output_parser = StrOutputParser() - output_parser = StrOutputParser() + chain = prompt | self.llm_model | output_parser + refined_prompt = chain.invoke({}) - chain = prompt | self.llm_model | output_parser - refined_prompt = chain.invoke({}) - - state.update({self.output[0]: refined_prompt}) - return state - - else: # no schema provided - self.logger.error("No schema provided for prompt refinement.") - - # TODO: Handle the case where no schema is provided => error handling - - return state + state.update({self.output[0]: refined_prompt}) + return state