fix(md_conversion): add absolute links md, added missing dependency

This commit is contained in:
Marco Perini 2024-07-23 15:34:12 +02:00 committed by GitHub
parent 1756e8522f
commit 12b5eada6e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 21 additions and 7 deletions

View File

@ -27,8 +27,8 @@ graph_config = {
# ************************************************
smart_scraper_graph = SmartScraperGraph(
prompt="Extract me the python code inside the page",
source="https://www.exploit-db.com/exploits/51447",
prompt="List me what does the company do, the name and a contact email.",
source="https://scrapegraphai.com/",
config=graph_config
)

View File

@ -14,6 +14,8 @@ authors = [
]
dependencies = [
"langchain>=0.2.10",
"langchain-fireworks>=0.1.3",
"langchain_community>=0.2.9",
"langchain-google-genai>=1.0.7",
"langchain-google-vertexai",
"langchain-openai>=0.1.17",
@ -36,7 +38,6 @@ dependencies = [
"undetected-playwright>=0.3.0",
"semchunk>=1.0.1",
"html2text>=2024.2.26",
"langchain-fireworks>=0.1.3",
]
license = "MIT"

View File

@ -220,6 +220,8 @@ class BaseGraph:
# Log the graph execution telemetry
graph_execution_time = time.time() - start_time
response = state.get("answer", None) if source_type == "url" else None
content = state.get("parsed_doc", None) if response is not None else None
log_graph_execution(
graph_name=self.graph_name,
source=source,
@ -228,6 +230,7 @@ class BaseGraph:
llm_model=llm_model,
embedder_model=embedder_model,
source_type=source_type,
content=content,
response=response,
execution_time=graph_execution_time,
total_tokens=cb_total["total_tokens"] if cb_total["total_tokens"] > 0 else None,

View File

@ -185,7 +185,7 @@ class FetchNode(BaseNode):
parsed_content = cleanup_html(response, source)
if (isinstance(self.llm_model, OpenAI) and not self.script_creator) or (self.force and not self.script_creator):
parsed_content = convert_to_md(source)
parsed_content = convert_to_md(source, input_data[0])
compressed_document = [Document(page_content=parsed_content)]
else:
self.logger.warning(
@ -207,7 +207,8 @@ class FetchNode(BaseNode):
parsed_content = document[0].page_content
if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
parsed_content = convert_to_md(document[0].page_content)
parsed_content = convert_to_md(document[0].page_content, input_data[0])
compressed_document = [

View File

@ -156,7 +156,7 @@ def log_event(event: str, properties: Dict[str, any]):
send_event_json(event_json)
def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, llm_model: str, embedder_model: str, source_type: str, execution_time: float, response: dict = None, error_node: str = None, exception: str = None, total_tokens: int = None):
def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, llm_model: str, embedder_model: str, source_type: str, execution_time: float, content: str = None, response: dict = None, error_node: str = None, exception: str = None, total_tokens: int = None):
properties = {
"graph_name": graph_name,
"source": source,
@ -165,11 +165,13 @@ def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, l
"llm_model": llm_model,
"embedder_model": embedder_model,
"source_type": source_type,
"content": content,
"response": response,
"execution_time": execution_time,
"error_node": error_node,
"exception": exception,
"total_tokens": total_tokens,
"type": "community-library"
}
log_event("graph_execution", properties)

View File

@ -2,8 +2,9 @@
convert_to_md module
"""
import html2text
from urllib.parse import urlparse
def convert_to_md(html):
def convert_to_md(html: str, url: str = None) -> str:
""" Convert HTML to Markdown.
This function uses the html2text library to convert the provided HTML content to Markdown
format.
@ -18,6 +19,12 @@ def convert_to_md(html):
'This is a paragraph.\n\n# This is a heading.'
Note: Styles are ignored during the conversion; links are kept, and when a URL is provided its scheme and domain are used as the base URL for links. """
if url:
parsed_url = urlparse(url)
domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
h = html2text.HTML2Text()
h.ignore_links = False
h.baseurl = domain
h.body_width = 0
return h.handle(html)