mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-04 21:00:36 +08:00
fix(md_conversion): add absolute links md, added missing dependency
This commit is contained in:
parent
1756e8522f
commit
12b5eada6e
@ -27,8 +27,8 @@ graph_config = {
|
||||
# ************************************************
|
||||
|
||||
smart_scraper_graph = SmartScraperGraph(
|
||||
prompt="Extract me the python code inside the page",
|
||||
source="https://www.exploit-db.com/exploits/51447",
|
||||
prompt="List me what does the company do, the name and a contact email.",
|
||||
source="https://scrapegraphai.com/",
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
|
||||
@ -14,6 +14,8 @@ authors = [
|
||||
]
|
||||
dependencies = [
|
||||
"langchain>=0.2.10",
|
||||
"langchain-fireworks>=0.1.3",
|
||||
"langchain_community>=0.2.9",
|
||||
"langchain-google-genai>=1.0.7",
|
||||
"langchain-google-vertexai",
|
||||
"langchain-openai>=0.1.17",
|
||||
@ -36,7 +38,6 @@ dependencies = [
|
||||
"undetected-playwright>=0.3.0",
|
||||
"semchunk>=1.0.1",
|
||||
"html2text>=2024.2.26",
|
||||
"langchain-fireworks>=0.1.3",
|
||||
]
|
||||
|
||||
license = "MIT"
|
||||
|
||||
@ -220,6 +220,8 @@ class BaseGraph:
|
||||
# Log the graph execution telemetry
|
||||
graph_execution_time = time.time() - start_time
|
||||
response = state.get("answer", None) if source_type == "url" else None
|
||||
content = state.get("parsed_doc", None) if response is not None else None
|
||||
|
||||
log_graph_execution(
|
||||
graph_name=self.graph_name,
|
||||
source=source,
|
||||
@ -228,6 +230,7 @@ class BaseGraph:
|
||||
llm_model=llm_model,
|
||||
embedder_model=embedder_model,
|
||||
source_type=source_type,
|
||||
content=content,
|
||||
response=response,
|
||||
execution_time=graph_execution_time,
|
||||
total_tokens=cb_total["total_tokens"] if cb_total["total_tokens"] > 0 else None,
|
||||
|
||||
@ -185,7 +185,7 @@ class FetchNode(BaseNode):
|
||||
parsed_content = cleanup_html(response, source)
|
||||
|
||||
if (isinstance(self.llm_model, OpenAI) and not self.script_creator) or (self.force and not self.script_creator):
|
||||
parsed_content = convert_to_md(source)
|
||||
parsed_content = convert_to_md(source, input_data[0])
|
||||
compressed_document = [Document(page_content=parsed_content)]
|
||||
else:
|
||||
self.logger.warning(
|
||||
@ -207,7 +207,8 @@ class FetchNode(BaseNode):
|
||||
parsed_content = document[0].page_content
|
||||
|
||||
if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
|
||||
parsed_content = convert_to_md(document[0].page_content)
|
||||
|
||||
parsed_content = convert_to_md(document[0].page_content, input_data[0])
|
||||
|
||||
|
||||
compressed_document = [
|
||||
|
||||
@ -156,7 +156,7 @@ def log_event(event: str, properties: Dict[str, any]):
|
||||
send_event_json(event_json)
|
||||
|
||||
|
||||
def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, llm_model: str, embedder_model: str, source_type: str, execution_time: float, response: dict = None, error_node: str = None, exception: str = None, total_tokens: int = None):
|
||||
def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, llm_model: str, embedder_model: str, source_type: str, execution_time: float, content: str = None, response: dict = None, error_node: str = None, exception: str = None, total_tokens: int = None):
|
||||
properties = {
|
||||
"graph_name": graph_name,
|
||||
"source": source,
|
||||
@ -165,11 +165,13 @@ def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, l
|
||||
"llm_model": llm_model,
|
||||
"embedder_model": embedder_model,
|
||||
"source_type": source_type,
|
||||
"content": content,
|
||||
"response": response,
|
||||
"execution_time": execution_time,
|
||||
"error_node": error_node,
|
||||
"exception": exception,
|
||||
"total_tokens": total_tokens,
|
||||
"type": "community-library"
|
||||
}
|
||||
log_event("graph_execution", properties)
|
||||
|
||||
|
||||
@ -2,8 +2,9 @@
|
||||
convert_to_md module
|
||||
"""
|
||||
import html2text
|
||||
from urllib.parse import urlparse
|
||||
|
||||
def convert_to_md(html):
|
||||
def convert_to_md(html: str, url: str = None) -> str:
|
||||
""" Convert HTML to Markdown.
|
||||
This function uses the html2text library to convert the provided HTML content to Markdown
|
||||
format.
|
||||
@ -18,6 +19,12 @@ def convert_to_md(html):
|
||||
'This is a paragraph.\n\n# This is a heading.'
|
||||
|
||||
Note: Styles are ignored during the conversion; links are kept in the Markdown output (ignore_links is set to False). """
|
||||
|
||||
if url:
|
||||
parsed_url = urlparse(url)
|
||||
domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
|
||||
h = html2text.HTML2Text()
|
||||
h.ignore_links = False
|
||||
h.baseurl = domain
|
||||
h.body_width = 0
|
||||
return h.handle(html)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user