From 99dc8497d85289759286a973e4aecc3f924d3ada Mon Sep 17 00:00:00 2001 From: Marco Perini Date: Sat, 20 Jul 2024 20:02:26 +0200 Subject: [PATCH] docs(gpt-4o-mini): added new gpt, fixed chromium lazy loading, add documentation and metrics --- docs/source/scrapers/graph_config.rst | 2 +- docs/source/scrapers/telemetry.rst | 5 ++++ scrapegraphai/docloaders/chromium.py | 2 +- scrapegraphai/graphs/base_graph.py | 35 +++++++++++++++++++++- scrapegraphai/helpers/models_tokens.py | 4 ++- scrapegraphai/nodes/graph_iterator_node.py | 3 +- scrapegraphai/telemetry/telemetry.py | 7 ++++- 7 files changed, 52 insertions(+), 6 deletions(-) diff --git a/docs/source/scrapers/graph_config.rst b/docs/source/scrapers/graph_config.rst index 9e1d49e0..e16ccae7 100644 --- a/docs/source/scrapers/graph_config.rst +++ b/docs/source/scrapers/graph_config.rst @@ -14,7 +14,7 @@ Some interesting ones are: - `burr_kwargs`: A dictionary with additional parameters to enable `Burr` graphical user interface. - `max_images`: The maximum number of images to be analyzed. Useful in `OmniScraperGraph` and `OmniSearchGraph`. - `cache_path`: The path where the cache files will be saved. If already exists, the cache will be loaded from this path. - +- `additional_info`: Add additional text to default prompts defined in the graphs. .. _Burr: Burr Integration diff --git a/docs/source/scrapers/telemetry.rst b/docs/source/scrapers/telemetry.rst index f5e9f27c..a80eb3b6 100644 --- a/docs/source/scrapers/telemetry.rst +++ b/docs/source/scrapers/telemetry.rst @@ -27,8 +27,13 @@ Additionally, the following properties are collected: "llm_model": llm_model_name, "embedder_model": embedder_model_name, "source_type": source_type, + "source": source, "execution_time": execution_time, + "prompt": prompt, + "schema": schema, "error_node": error_node_name, + "exception": exception, + "response": response, "total_tokens": total_tokens, } diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py index 579933e6..474c22de 100644 --- a/scrapegraphai/docloaders/chromium.py +++ b/scrapegraphai/docloaders/chromium.py @@ -82,7 +82,7 @@ class ChromiumLoader(BaseLoader): context = await browser.new_context() await Malenia.apply_stealth(context) page = await context.new_page() - await page.goto(url) + await page.goto(url, wait_until="domcontentloaded") await page.wait_for_load_state(self.load_state) results = await page.content() # Simply get the HTML content logger.info("Content scraped") diff --git a/scrapegraphai/graphs/base_graph.py b/scrapegraphai/graphs/base_graph.py index df2845fa..e6df2dec 100644 --- a/scrapegraphai/graphs/base_graph.py +++ b/scrapegraphai/graphs/base_graph.py @@ -106,18 +106,32 @@ class BaseGraph: source_type = None llm_model = None embedder_model = None + source = [] + prompt = None + schema = None while current_node_name: curr_time = time.time() current_node = next(node for node in self.nodes if node.node_name == current_node_name) + # check if there is a "source" key in the node config if current_node.__class__.__name__ == "FetchNode": # get the second key name of the state dictionary source_type = list(state.keys())[1] + if state.get("user_prompt", None): + prompt = state["user_prompt"] if type(state["user_prompt"]) == str else None # quick fix for local_dir source type if source_type == "local_dir": source_type = "html_dir" + elif source_type == "url": + if type(state[source_type]) == list: + # iterate through the list of urls and see if they are strings + for url in state[source_type]: + if type(url) == str: + source.append(url) + elif type(state[source_type]) == str: + source.append(state[source_type]) # check if there is an "llm_model" variable in the class if hasattr(current_node, "llm_model") and llm_model is None: @@ -135,6 +149,16 @@ class BaseGraph: elif hasattr(embedder_model, "model"): embedder_model = embedder_model.model + if hasattr(current_node, "node_config"): + if type(current_node.node_config) is dict: + if current_node.node_config.get("schema", None) and schema is None: + if type(current_node.node_config["schema"]) is not dict: + # convert to dict + try: + schema = current_node.node_config["schema"].schema() + except Exception as e: + schema = None + with get_openai_callback() as cb: try: result = current_node.execute(state) @@ -144,11 +168,15 @@ class BaseGraph: graph_execution_time = time.time() - start_time log_graph_execution( graph_name=self.graph_name, + source=source, + prompt=prompt, + schema=schema, llm_model=llm_model, embedder_model=embedder_model, source_type=source_type, execution_time=graph_execution_time, - error_node=error_node + error_node=error_node, + exception=str(e) ) raise e node_exec_time = time.time() - curr_time @@ -191,11 +219,16 @@ class BaseGraph: # Log the graph execution telemetry graph_execution_time = time.time() - start_time + response = state.get("answer", None) if source_type == "url" else None log_graph_execution( graph_name=self.graph_name, + source=source, + prompt=prompt, + schema=schema, llm_model=llm_model, embedder_model=embedder_model, source_type=source_type, + response=response, execution_time=graph_execution_time, total_tokens=cb_total["total_tokens"] if cb_total["total_tokens"] > 0 else None, ) diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index 7becbf8b..ab96c46a 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -16,9 +16,10 @@ models_tokens = { "gpt-4-32k": 32768, "gpt-4-32k-0613": 32768, "gpt-4o": 128000, + "gpt-4o-mini":128000, }, "azure": { - "gpt-3.5-turbo-0125": 16385, + "gpt-3.5-turbo-0125": 16385, "gpt-3.5": 4096, "gpt-3.5-turbo": 16385, "gpt-3.5-turbo-1106": 16385, @@ -34,6 +35,7 @@ models_tokens = { "gpt-4-32k": 32768, "gpt-4-32k-0613": 32768, "gpt-4o": 128000, + "gpt-4o-mini":128000, }, "gemini": { "gemini-pro": 128000, diff --git a/scrapegraphai/nodes/graph_iterator_node.py b/scrapegraphai/nodes/graph_iterator_node.py index 7e0872e3..061be77a 100644 --- a/scrapegraphai/nodes/graph_iterator_node.py +++ b/scrapegraphai/nodes/graph_iterator_node.py @@ -126,7 +126,8 @@ class GraphIteratorNode(BaseNode): for url in urls: instance = copy.copy(graph_instance) instance.source = url - + if url.startswith("http"): + instance.input_key = "url" participants.append(instance) futures = [_async_run(graph) for graph in participants] diff --git a/scrapegraphai/telemetry/telemetry.py b/scrapegraphai/telemetry/telemetry.py index 20d38186..d1c8a367 100644 --- a/scrapegraphai/telemetry/telemetry.py +++ b/scrapegraphai/telemetry/telemetry.py @@ -156,14 +156,19 @@ def log_event(event: str, properties: Dict[str, any]): send_event_json(event_json) -def log_graph_execution(graph_name: str, llm_model: str, embedder_model: str, source_type: str, execution_time: float, error_node: str = None, total_tokens: int = None): +def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, llm_model: str, embedder_model: str, source_type: str, execution_time: float, response: dict = None, error_node: str = None, exception: str = None, total_tokens: int = None): properties = { "graph_name": graph_name, + "source": source, + "prompt": prompt, + "schema": schema, "llm_model": llm_model, "embedder_model": embedder_model, "source_type": source_type, + "response": response, "execution_time": execution_time, "error_node": error_node, + "exception": exception, "total_tokens": total_tokens, } log_event("graph_execution", properties)