diff --git a/examples/openai/smart_scraper_openai.py b/examples/openai/smart_scraper_openai.py index b0515770..6771b817 100644 --- a/examples/openai/smart_scraper_openai.py +++ b/examples/openai/smart_scraper_openai.py @@ -27,8 +27,8 @@ graph_config = { # ************************************************ smart_scraper_graph = SmartScraperGraph( - prompt="Extract me the python code inside the page", - source="https://www.exploit-db.com/exploits/51447", + prompt="List me what does the company do, the name and a contact email.", + source="https://scrapegraphai.com/", config=graph_config ) diff --git a/pyproject.toml b/pyproject.toml index 10e8b61f..41c417ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,8 @@ authors = [ ] dependencies = [ "langchain>=0.2.10", + "langchain-fireworks>=0.1.3", + "langchain_community>=0.2.9", "langchain-google-genai>=1.0.7", "langchain-google-vertexai", "langchain-openai>=0.1.17", @@ -36,7 +38,6 @@ dependencies = [ "undetected-playwright>=0.3.0", "semchunk>=1.0.1", "html2text>=2024.2.26", - "langchain-fireworks>=0.1.3", ] license = "MIT" diff --git a/scrapegraphai/graphs/base_graph.py b/scrapegraphai/graphs/base_graph.py index e6df2dec..21f564d7 100644 --- a/scrapegraphai/graphs/base_graph.py +++ b/scrapegraphai/graphs/base_graph.py @@ -220,6 +220,8 @@ class BaseGraph: # Log the graph execution telemetry graph_execution_time = time.time() - start_time response = state.get("answer", None) if source_type == "url" else None + content = state.get("parsed_doc", None) if response is not None else None + log_graph_execution( graph_name=self.graph_name, source=source, @@ -228,6 +230,7 @@ class BaseGraph: llm_model=llm_model, embedder_model=embedder_model, source_type=source_type, + content=content, response=response, execution_time=graph_execution_time, total_tokens=cb_total["total_tokens"] if cb_total["total_tokens"] > 0 else None, diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 
42e7489f..56366677 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -185,7 +185,7 @@ class FetchNode(BaseNode): parsed_content = cleanup_html(response, source) if (isinstance(self.llm_model, OpenAI) and not self.script_creator) or (self.force and not self.script_creator): - parsed_content = convert_to_md(source) + parsed_content = convert_to_md(source, input_data[0]) compressed_document = [Document(page_content=parsed_content)] else: self.logger.warning( @@ -207,7 +207,8 @@ class FetchNode(BaseNode): parsed_content = document[0].page_content if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled: - parsed_content = convert_to_md(document[0].page_content) + + parsed_content = convert_to_md(document[0].page_content, input_data[0]) compressed_document = [ diff --git a/scrapegraphai/telemetry/telemetry.py b/scrapegraphai/telemetry/telemetry.py index d1c8a367..db8f701f 100644 --- a/scrapegraphai/telemetry/telemetry.py +++ b/scrapegraphai/telemetry/telemetry.py @@ -156,7 +156,7 @@ def log_event(event: str, properties: Dict[str, any]): send_event_json(event_json) -def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, llm_model: str, embedder_model: str, source_type: str, execution_time: float, response: dict = None, error_node: str = None, exception: str = None, total_tokens: int = None): +def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, llm_model: str, embedder_model: str, source_type: str, execution_time: float, content: str = None, response: dict = None, error_node: str = None, exception: str = None, total_tokens: int = None): properties = { "graph_name": graph_name, "source": source, @@ -165,11 +165,13 @@ def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, l "llm_model": llm_model, "embedder_model": embedder_model, "source_type": source_type, + "content": content, 
"response": response, "execution_time": execution_time, "error_node": error_node, "exception": exception, "total_tokens": total_tokens, + "type": "community-library" } log_event("graph_execution", properties) diff --git a/scrapegraphai/utils/convert_to_md.py b/scrapegraphai/utils/convert_to_md.py index 35123042..6f1a2334 100644 --- a/scrapegraphai/utils/convert_to_md.py +++ b/scrapegraphai/utils/convert_to_md.py @@ -2,8 +2,9 @@ convert_to_md modul """ import html2text +from urllib.parse import urlparse -def convert_to_md(html): +def convert_to_md(html: str, url: str = None) -> str: """ Convert HTML to Markdown. This function uses the html2text library to convert the provided HTML content to Markdown format. @@ -18,6 +19,11 @@ def convert_to_md(html): 'This is a paragraph.\n\n# This is a heading.' Note: All the styles and links are ignored during the conversion. """ h = html2text.HTML2Text() h.ignore_links = False + h.body_width = 0 + if url: + # Only derive a base URL when a source URL was supplied; otherwise baseurl is left untouched. + parsed_url = urlparse(url) + h.baseurl = f"{parsed_url.scheme}://{parsed_url.netloc}" return h.handle(html)