mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-01 21:00:48 +08:00
fix: md conversion
This commit is contained in:
parent
a711186a0c
commit
1d41f6eafe
@@ -163,10 +163,10 @@ class FetchNode(BaseNode):
|
||||
if not source.strip():
|
||||
raise ValueError("No HTML body content found in the local source.")
|
||||
|
||||
parsed_content = source
|
||||
|
||||
if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator:
|
||||
if (not self.script_creator) or (self.force and not self.script_creator):
|
||||
parsed_content = convert_to_md(source)
|
||||
else:
|
||||
parsed_content = source
|
||||
|
||||
compressed_document = [
|
||||
Document(page_content=parsed_content, metadata={"source": "local_dir"})
|
||||
@@ -184,8 +184,8 @@ class FetchNode(BaseNode):
|
||||
if not self.cut:
|
||||
parsed_content = cleanup_html(response, source)
|
||||
|
||||
if (isinstance(self.llm_model, OpenAI) and not self.script_creator) or (self.force and not self.script_creator):
|
||||
parsed_content = convert_to_md(source, input_data[0])
|
||||
if (not self.script_creator) or (self.force and not self.script_creator):
|
||||
parsed_content = convert_to_md(parsed_content, source)
|
||||
compressed_document = [Document(page_content=parsed_content)]
|
||||
else:
|
||||
self.logger.warning(
|
||||
@@ -206,9 +206,9 @@ class FetchNode(BaseNode):
|
||||
raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
|
||||
parsed_content = document[0].page_content
|
||||
|
||||
if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
|
||||
if (not self.script_creator) or (self.force and not self.script_creator and not self.openai_md_enabled):
|
||||
|
||||
parsed_content = convert_to_md(document[0].page_content, input_data[0])
|
||||
parsed_content = convert_to_md(document[0].page_content, source)
|
||||
|
||||
|
||||
compressed_document = [
|
||||
|
||||
@@ -23,7 +23,7 @@ def convert_to_md(html: str, url: str = None) -> str:
|
||||
h = html2text.HTML2Text()
|
||||
h.ignore_links = False
|
||||
h.body_width = 0
|
||||
if url:
|
||||
if url is not None:
|
||||
parsed_url = urlparse(url)
|
||||
domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
|
||||
h.baseurl = domain
|
||||
|
||||
Loading…
Reference in New Issue
Block a user