mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-01 21:00:48 +08:00
fix: md conversion
This commit is contained in:
parent
a711186a0c
commit
1d41f6eafe
@@ -163,10 +163,10 @@ class FetchNode(BaseNode):
|
|||||||
if not source.strip():
|
if not source.strip():
|
||||||
raise ValueError("No HTML body content found in the local source.")
|
raise ValueError("No HTML body content found in the local source.")
|
||||||
|
|
||||||
parsed_content = source
|
if (not self.script_creator) or (self.force and not self.script_creator):
|
||||||
|
|
||||||
if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator:
|
|
||||||
parsed_content = convert_to_md(source)
|
parsed_content = convert_to_md(source)
|
||||||
|
else:
|
||||||
|
parsed_content = source
|
||||||
|
|
||||||
compressed_document = [
|
compressed_document = [
|
||||||
Document(page_content=parsed_content, metadata={"source": "local_dir"})
|
Document(page_content=parsed_content, metadata={"source": "local_dir"})
|
||||||
@@ -184,8 +184,8 @@ class FetchNode(BaseNode):
|
|||||||
if not self.cut:
|
if not self.cut:
|
||||||
parsed_content = cleanup_html(response, source)
|
parsed_content = cleanup_html(response, source)
|
||||||
|
|
||||||
if (isinstance(self.llm_model, OpenAI) and not self.script_creator) or (self.force and not self.script_creator):
|
if (not self.script_creator) or (self.force and not self.script_creator):
|
||||||
parsed_content = convert_to_md(source, input_data[0])
|
parsed_content = convert_to_md(parsed_content, source)
|
||||||
compressed_document = [Document(page_content=parsed_content)]
|
compressed_document = [Document(page_content=parsed_content)]
|
||||||
else:
|
else:
|
||||||
self.logger.warning(
|
self.logger.warning(
|
||||||
@@ -206,9 +206,9 @@ class FetchNode(BaseNode):
|
|||||||
raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
|
raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
|
||||||
parsed_content = document[0].page_content
|
parsed_content = document[0].page_content
|
||||||
|
|
||||||
if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
|
if (not self.script_creator) or (self.force and not self.script_creator and not self.openai_md_enabled):
|
||||||
|
|
||||||
parsed_content = convert_to_md(document[0].page_content, input_data[0])
|
parsed_content = convert_to_md(document[0].page_content, source)
|
||||||
|
|
||||||
|
|
||||||
compressed_document = [
|
compressed_document = [
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ def convert_to_md(html: str, url: str = None) -> str:
|
|||||||
h = html2text.HTML2Text()
|
h = html2text.HTML2Text()
|
||||||
h.ignore_links = False
|
h.ignore_links = False
|
||||||
h.body_width = 0
|
h.body_width = 0
|
||||||
if url:
|
if url is not None:
|
||||||
parsed_url = urlparse(url)
|
parsed_url = urlparse(url)
|
||||||
domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
|
domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
|
||||||
h.baseurl = domain
|
h.baseurl = domain
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user