mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-04 21:00:36 +08:00
fix(md_conversion): add absolute links md, added missing dependency
This commit is contained in:
parent
1756e8522f
commit
12b5eada6e
@ -27,8 +27,8 @@ graph_config = {
|
|||||||
# ************************************************
|
# ************************************************
|
||||||
|
|
||||||
smart_scraper_graph = SmartScraperGraph(
|
smart_scraper_graph = SmartScraperGraph(
|
||||||
prompt="Extract me the python code inside the page",
|
prompt="List me what does the company do, the name and a contact email.",
|
||||||
source="https://www.exploit-db.com/exploits/51447",
|
source="https://scrapegraphai.com/",
|
||||||
config=graph_config
|
config=graph_config
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -14,6 +14,8 @@ authors = [
|
|||||||
]
|
]
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"langchain>=0.2.10",
|
"langchain>=0.2.10",
|
||||||
|
"langchain-fireworks>=0.1.3",
|
||||||
|
"langchain_community>=0.2.9",
|
||||||
"langchain-google-genai>=1.0.7",
|
"langchain-google-genai>=1.0.7",
|
||||||
"langchain-google-vertexai",
|
"langchain-google-vertexai",
|
||||||
"langchain-openai>=0.1.17",
|
"langchain-openai>=0.1.17",
|
||||||
@ -36,7 +38,6 @@ dependencies = [
|
|||||||
"undetected-playwright>=0.3.0",
|
"undetected-playwright>=0.3.0",
|
||||||
"semchunk>=1.0.1",
|
"semchunk>=1.0.1",
|
||||||
"html2text>=2024.2.26",
|
"html2text>=2024.2.26",
|
||||||
"langchain-fireworks>=0.1.3",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
license = "MIT"
|
license = "MIT"
|
||||||
|
|||||||
@ -220,6 +220,8 @@ class BaseGraph:
|
|||||||
# Log the graph execution telemetry
|
# Log the graph execution telemetry
|
||||||
graph_execution_time = time.time() - start_time
|
graph_execution_time = time.time() - start_time
|
||||||
response = state.get("answer", None) if source_type == "url" else None
|
response = state.get("answer", None) if source_type == "url" else None
|
||||||
|
content = state.get("parsed_doc", None) if response is not None else None
|
||||||
|
|
||||||
log_graph_execution(
|
log_graph_execution(
|
||||||
graph_name=self.graph_name,
|
graph_name=self.graph_name,
|
||||||
source=source,
|
source=source,
|
||||||
@ -228,6 +230,7 @@ class BaseGraph:
|
|||||||
llm_model=llm_model,
|
llm_model=llm_model,
|
||||||
embedder_model=embedder_model,
|
embedder_model=embedder_model,
|
||||||
source_type=source_type,
|
source_type=source_type,
|
||||||
|
content=content,
|
||||||
response=response,
|
response=response,
|
||||||
execution_time=graph_execution_time,
|
execution_time=graph_execution_time,
|
||||||
total_tokens=cb_total["total_tokens"] if cb_total["total_tokens"] > 0 else None,
|
total_tokens=cb_total["total_tokens"] if cb_total["total_tokens"] > 0 else None,
|
||||||
|
|||||||
@ -185,7 +185,7 @@ class FetchNode(BaseNode):
|
|||||||
parsed_content = cleanup_html(response, source)
|
parsed_content = cleanup_html(response, source)
|
||||||
|
|
||||||
if (isinstance(self.llm_model, OpenAI) and not self.script_creator) or (self.force and not self.script_creator):
|
if (isinstance(self.llm_model, OpenAI) and not self.script_creator) or (self.force and not self.script_creator):
|
||||||
parsed_content = convert_to_md(source)
|
parsed_content = convert_to_md(source, input_data[0])
|
||||||
compressed_document = [Document(page_content=parsed_content)]
|
compressed_document = [Document(page_content=parsed_content)]
|
||||||
else:
|
else:
|
||||||
self.logger.warning(
|
self.logger.warning(
|
||||||
@ -207,7 +207,8 @@ class FetchNode(BaseNode):
|
|||||||
parsed_content = document[0].page_content
|
parsed_content = document[0].page_content
|
||||||
|
|
||||||
if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
|
if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
|
||||||
parsed_content = convert_to_md(document[0].page_content)
|
|
||||||
|
parsed_content = convert_to_md(document[0].page_content, input_data[0])
|
||||||
|
|
||||||
|
|
||||||
compressed_document = [
|
compressed_document = [
|
||||||
|
|||||||
@ -156,7 +156,7 @@ def log_event(event: str, properties: Dict[str, any]):
|
|||||||
send_event_json(event_json)
|
send_event_json(event_json)
|
||||||
|
|
||||||
|
|
||||||
def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, llm_model: str, embedder_model: str, source_type: str, execution_time: float, response: dict = None, error_node: str = None, exception: str = None, total_tokens: int = None):
|
def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, llm_model: str, embedder_model: str, source_type: str, execution_time: float, content: str = None, response: dict = None, error_node: str = None, exception: str = None, total_tokens: int = None):
|
||||||
properties = {
|
properties = {
|
||||||
"graph_name": graph_name,
|
"graph_name": graph_name,
|
||||||
"source": source,
|
"source": source,
|
||||||
@ -165,11 +165,13 @@ def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, l
|
|||||||
"llm_model": llm_model,
|
"llm_model": llm_model,
|
||||||
"embedder_model": embedder_model,
|
"embedder_model": embedder_model,
|
||||||
"source_type": source_type,
|
"source_type": source_type,
|
||||||
|
"content": content,
|
||||||
"response": response,
|
"response": response,
|
||||||
"execution_time": execution_time,
|
"execution_time": execution_time,
|
||||||
"error_node": error_node,
|
"error_node": error_node,
|
||||||
"exception": exception,
|
"exception": exception,
|
||||||
"total_tokens": total_tokens,
|
"total_tokens": total_tokens,
|
||||||
|
"type": "community-library"
|
||||||
}
|
}
|
||||||
log_event("graph_execution", properties)
|
log_event("graph_execution", properties)
|
||||||
|
|
||||||
|
|||||||
@ -2,8 +2,9 @@
|
|||||||
convert_to_md module
|
convert_to_md module
|
||||||
"""
|
"""
|
||||||
import html2text
|
import html2text
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
def convert_to_md(html):
|
def convert_to_md(html: str, url: str = None) -> str:
|
||||||
""" Convert HTML to Markdown.
|
""" Convert HTML to Markdown.
|
||||||
This function uses the html2text library to convert the provided HTML content to Markdown
|
This function uses the html2text library to convert the provided HTML content to Markdown
|
||||||
format.
|
format.
|
||||||
@ -18,6 +19,12 @@ def convert_to_md(html):
|
|||||||
'This is a paragraph.\n\n# This is a heading.'
|
'This is a paragraph.\n\n# This is a heading.'
|
||||||
|
|
||||||
Note: Styles are ignored during the conversion; links are kept in the output (ignore_links is set to False). """
|
Note: Styles are ignored during the conversion; links are kept in the output (ignore_links is set to False). """
|
||||||
|
|
||||||
|
if url:
|
||||||
|
parsed_url = urlparse(url)
|
||||||
|
domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
|
||||||
h = html2text.HTML2Text()
|
h = html2text.HTML2Text()
|
||||||
h.ignore_links = False
|
h.ignore_links = False
|
||||||
|
h.baseurl = domain
|
||||||
|
h.body_width = 0
|
||||||
return h.handle(html)
|
return h.handle(html)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user