fix: md conversion

This commit is contained in:
Marco Perini 2024-07-23 19:27:18 +02:00 committed by GitHub
parent a711186a0c
commit 1d41f6eafe
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 8 additions and 8 deletions

View File

@@ -163,10 +163,10 @@ class FetchNode(BaseNode):
if not source.strip():
raise ValueError("No HTML body content found in the local source.")
parsed_content = source
if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator:
if (not self.script_creator) or (self.force and not self.script_creator):
parsed_content = convert_to_md(source)
else:
parsed_content = source
compressed_document = [
Document(page_content=parsed_content, metadata={"source": "local_dir"})
@@ -184,8 +184,8 @@ class FetchNode(BaseNode):
if not self.cut:
parsed_content = cleanup_html(response, source)
if (isinstance(self.llm_model, OpenAI) and not self.script_creator) or (self.force and not self.script_creator):
parsed_content = convert_to_md(source, input_data[0])
if (not self.script_creator) or (self.force and not self.script_creator):
parsed_content = convert_to_md(parsed_content, source)
compressed_document = [Document(page_content=parsed_content)]
else:
self.logger.warning(
@@ -206,9 +206,9 @@ class FetchNode(BaseNode):
raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
parsed_content = document[0].page_content
if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
if (not self.script_creator) or (self.force and not self.script_creator and not self.openai_md_enabled):
parsed_content = convert_to_md(document[0].page_content, input_data[0])
parsed_content = convert_to_md(document[0].page_content, source)
compressed_document = [

View File

@@ -23,7 +23,7 @@ def convert_to_md(html: str, url: str = None) -> str:
h = html2text.HTML2Text()
h.ignore_links = False
h.body_width = 0
if url:
if url is not None:
parsed_url = urlparse(url)
domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
h.baseurl = domain