fix(md_conversion): add absolute links to markdown output, add missing dependency

2026-07-04 21:00:36 +08:00 · 2024-07-23 15:34:12 +02:00 · 2024-07-23 15:34:12 +02:00 · 12b5eada6e
commit 12b5eada6e
parent 1756e8522f
6 changed files with 21 additions and 7 deletions
--- a/examples/openai/smart_scraper_openai.py
+++ b/examples/openai/smart_scraper_openai.py
@ -27,8 +27,8 @@ graph_config = {
 # ************************************************

 smart_scraper_graph = SmartScraperGraph(
-    prompt="Extract me the python code inside the page",
-    source="https://www.exploit-db.com/exploits/51447",
+    prompt="List me what does the company do, the name and a contact email.",
+    source="https://scrapegraphai.com/",
    config=graph_config
 )

--- a/pyproject.toml
+++ b/pyproject.toml
@ -14,6 +14,8 @@ authors = [
 ]
 dependencies = [
    "langchain>=0.2.10",
+    "langchain-fireworks>=0.1.3",
+    "langchain_community>=0.2.9",
    "langchain-google-genai>=1.0.7",
    "langchain-google-vertexai",
    "langchain-openai>=0.1.17",
@ -36,7 +38,6 @@ dependencies = [
    "undetected-playwright>=0.3.0",
    "semchunk>=1.0.1",
    "html2text>=2024.2.26",
-    "langchain-fireworks>=0.1.3",
 ]

 license = "MIT"
--- a/scrapegraphai/graphs/base_graph.py
+++ b/scrapegraphai/graphs/base_graph.py
@ -220,6 +220,8 @@ class BaseGraph:
        # Log the graph execution telemetry
        graph_execution_time = time.time() - start_time
        response = state.get("answer", None) if source_type == "url" else None
+        content = state.get("parsed_doc", None) if response is not None else None
+        
        log_graph_execution(
            graph_name=self.graph_name,
            source=source,
@ -228,6 +230,7 @@ class BaseGraph:
            llm_model=llm_model,
            embedder_model=embedder_model,
            source_type=source_type,
+            content=content,
            response=response,
            execution_time=graph_execution_time,
            total_tokens=cb_total["total_tokens"] if cb_total["total_tokens"] > 0 else None,
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@ -185,7 +185,7 @@ class FetchNode(BaseNode):
                    parsed_content = cleanup_html(response, source)

                if  (isinstance(self.llm_model, OpenAI) and not self.script_creator) or (self.force and not self.script_creator):
-                    parsed_content = convert_to_md(source)
+                    parsed_content = convert_to_md(source, input_data[0])
                compressed_document = [Document(page_content=parsed_content)]
            else:
                self.logger.warning(
@ -207,7 +207,8 @@ class FetchNode(BaseNode):
            parsed_content = document[0].page_content

            if  isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
-                parsed_content = convert_to_md(document[0].page_content)
+
+                parsed_content = convert_to_md(document[0].page_content, input_data[0])


            compressed_document = [
--- a/scrapegraphai/telemetry/telemetry.py
+++ b/scrapegraphai/telemetry/telemetry.py
@ -156,7 +156,7 @@ def log_event(event: str, properties: Dict[str, any]):
        send_event_json(event_json)


-def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, llm_model: str, embedder_model: str, source_type: str, execution_time: float, response: dict = None, error_node: str = None, exception: str = None, total_tokens: int = None):
+def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, llm_model: str, embedder_model: str, source_type: str, execution_time: float, content: str = None, response: dict = None, error_node: str = None, exception: str = None, total_tokens: int = None):
    properties = {
        "graph_name": graph_name,
        "source": source,
@ -165,11 +165,13 @@ def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, l
        "llm_model": llm_model,
        "embedder_model": embedder_model,
        "source_type": source_type,
+        "content": content,
        "response": response,
        "execution_time": execution_time,
        "error_node": error_node,
        "exception": exception,
        "total_tokens": total_tokens,
+        "type": "community-library"
    }
    log_event("graph_execution", properties)

--- a/scrapegraphai/utils/convert_to_md.py
+++ b/scrapegraphai/utils/convert_to_md.py
@ -2,8 +2,9 @@
 convert_to_md module
 """
 import html2text
+from urllib.parse import urlparse

-def convert_to_md(html):
+def convert_to_md(html: str, url: str = None) -> str:
    """ Convert HTML to Markdown.
    This function uses the html2text library to convert the provided HTML content to Markdown 
    format.
@ -18,6 +19,12 @@ def convert_to_md(html):
    'This is a paragraph.\n\n# This is a heading.'

    Note: Styles are ignored during the conversion; links are kept and, when `url` is given, relative links are resolved against its scheme and domain. """
+
+    if url:
+        parsed_url = urlparse(url)
+        domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
    h = html2text.HTML2Text()
    h.ignore_links = False
+    h.baseurl = domain
+    h.body_width = 0
    return h.handle(html)