fix(md_conversion): add absolute links md, added missing dependency

This commit is contained in:
Marco Perini 2024-07-23 15:34:12 +02:00 committed by GitHub
parent 1756e8522f
commit 12b5eada6e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 21 additions and 7 deletions

View File

@@ -27,8 +27,8 @@ graph_config = {
# ************************************************ # ************************************************
smart_scraper_graph = SmartScraperGraph( smart_scraper_graph = SmartScraperGraph(
prompt="Extract me the python code inside the page", prompt="List me what does the company do, the name and a contact email.",
source="https://www.exploit-db.com/exploits/51447", source="https://scrapegraphai.com/",
config=graph_config config=graph_config
) )

View File

@@ -14,6 +14,8 @@ authors = [
] ]
dependencies = [ dependencies = [
"langchain>=0.2.10", "langchain>=0.2.10",
"langchain-fireworks>=0.1.3",
"langchain_community>=0.2.9",
"langchain-google-genai>=1.0.7", "langchain-google-genai>=1.0.7",
"langchain-google-vertexai", "langchain-google-vertexai",
"langchain-openai>=0.1.17", "langchain-openai>=0.1.17",
@@ -36,7 +38,6 @@ dependencies = [
"undetected-playwright>=0.3.0", "undetected-playwright>=0.3.0",
"semchunk>=1.0.1", "semchunk>=1.0.1",
"html2text>=2024.2.26", "html2text>=2024.2.26",
"langchain-fireworks>=0.1.3",
] ]
license = "MIT" license = "MIT"

View File

@@ -220,6 +220,8 @@ class BaseGraph:
# Log the graph execution telemetry # Log the graph execution telemetry
graph_execution_time = time.time() - start_time graph_execution_time = time.time() - start_time
response = state.get("answer", None) if source_type == "url" else None response = state.get("answer", None) if source_type == "url" else None
content = state.get("parsed_doc", None) if response is not None else None
log_graph_execution( log_graph_execution(
graph_name=self.graph_name, graph_name=self.graph_name,
source=source, source=source,
@@ -228,6 +230,7 @@ class BaseGraph:
llm_model=llm_model, llm_model=llm_model,
embedder_model=embedder_model, embedder_model=embedder_model,
source_type=source_type, source_type=source_type,
content=content,
response=response, response=response,
execution_time=graph_execution_time, execution_time=graph_execution_time,
total_tokens=cb_total["total_tokens"] if cb_total["total_tokens"] > 0 else None, total_tokens=cb_total["total_tokens"] if cb_total["total_tokens"] > 0 else None,

View File

@@ -185,7 +185,7 @@ class FetchNode(BaseNode):
parsed_content = cleanup_html(response, source) parsed_content = cleanup_html(response, source)
if (isinstance(self.llm_model, OpenAI) and not self.script_creator) or (self.force and not self.script_creator): if (isinstance(self.llm_model, OpenAI) and not self.script_creator) or (self.force and not self.script_creator):
parsed_content = convert_to_md(source) parsed_content = convert_to_md(source, input_data[0])
compressed_document = [Document(page_content=parsed_content)] compressed_document = [Document(page_content=parsed_content)]
else: else:
self.logger.warning( self.logger.warning(
@@ -207,7 +207,8 @@ class FetchNode(BaseNode):
parsed_content = document[0].page_content parsed_content = document[0].page_content
if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled: if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
parsed_content = convert_to_md(document[0].page_content)
parsed_content = convert_to_md(document[0].page_content, input_data[0])
compressed_document = [ compressed_document = [

View File

@@ -156,7 +156,7 @@ def log_event(event: str, properties: Dict[str, any]):
send_event_json(event_json) send_event_json(event_json)
def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, llm_model: str, embedder_model: str, source_type: str, execution_time: float, response: dict = None, error_node: str = None, exception: str = None, total_tokens: int = None): def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, llm_model: str, embedder_model: str, source_type: str, execution_time: float, content: str = None, response: dict = None, error_node: str = None, exception: str = None, total_tokens: int = None):
properties = { properties = {
"graph_name": graph_name, "graph_name": graph_name,
"source": source, "source": source,
@@ -165,11 +165,13 @@ def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, l
"llm_model": llm_model, "llm_model": llm_model,
"embedder_model": embedder_model, "embedder_model": embedder_model,
"source_type": source_type, "source_type": source_type,
"content": content,
"response": response, "response": response,
"execution_time": execution_time, "execution_time": execution_time,
"error_node": error_node, "error_node": error_node,
"exception": exception, "exception": exception,
"total_tokens": total_tokens, "total_tokens": total_tokens,
"type": "community-library"
} }
log_event("graph_execution", properties) log_event("graph_execution", properties)

View File

@@ -2,8 +2,9 @@
convert_to_md modul convert_to_md modul
""" """
import html2text import html2text
from urllib.parse import urlparse
def convert_to_md(html): def convert_to_md(html: str, url: str = None) -> str:
""" Convert HTML to Markdown. """ Convert HTML to Markdown.
This function uses the html2text library to convert the provided HTML content to Markdown This function uses the html2text library to convert the provided HTML content to Markdown
format. format.
@@ -18,6 +19,12 @@ def convert_to_md(html):
'This is a paragraph.\n\n# This is a heading.' 'This is a paragraph.\n\n# This is a heading.'
Note: All the styles and links are ignored during the conversion. """ Note: All the styles and links are ignored during the conversion. """
if url:
parsed_url = urlparse(url)
domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
h = html2text.HTML2Text() h = html2text.HTML2Text()
h.ignore_links = False h.ignore_links = False
h.baseurl = domain
h.body_width = 0
return h.handle(html) return h.handle(html)