fix: md conversion

This commit is contained in:
Marco Perini 2024-07-23 19:27:18 +02:00 committed by GitHub
parent a711186a0c
commit 1d41f6eafe
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 8 additions and 8 deletions

View File

@@ -163,10 +163,10 @@ class FetchNode(BaseNode):
if not source.strip(): if not source.strip():
raise ValueError("No HTML body content found in the local source.") raise ValueError("No HTML body content found in the local source.")
parsed_content = source if (not self.script_creator) or (self.force and not self.script_creator):
if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator:
parsed_content = convert_to_md(source) parsed_content = convert_to_md(source)
else:
parsed_content = source
compressed_document = [ compressed_document = [
Document(page_content=parsed_content, metadata={"source": "local_dir"}) Document(page_content=parsed_content, metadata={"source": "local_dir"})
@@ -184,8 +184,8 @@ class FetchNode(BaseNode):
if not self.cut: if not self.cut:
parsed_content = cleanup_html(response, source) parsed_content = cleanup_html(response, source)
if (isinstance(self.llm_model, OpenAI) and not self.script_creator) or (self.force and not self.script_creator): if (not self.script_creator) or (self.force and not self.script_creator):
parsed_content = convert_to_md(source, input_data[0]) parsed_content = convert_to_md(parsed_content, source)
compressed_document = [Document(page_content=parsed_content)] compressed_document = [Document(page_content=parsed_content)]
else: else:
self.logger.warning( self.logger.warning(
@@ -206,9 +206,9 @@ class FetchNode(BaseNode):
raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.") raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
parsed_content = document[0].page_content parsed_content = document[0].page_content
if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled: if (not self.script_creator) or (self.force and not self.script_creator and not self.openai_md_enabled):
parsed_content = convert_to_md(document[0].page_content, input_data[0]) parsed_content = convert_to_md(document[0].page_content, source)
compressed_document = [ compressed_document = [

View File

@@ -23,7 +23,7 @@ def convert_to_md(html: str, url: str = None) -> str:
h = html2text.HTML2Text() h = html2text.HTML2Text()
h.ignore_links = False h.ignore_links = False
h.body_width = 0 h.body_width = 0
if url: if url is not None:
parsed_url = urlparse(url) parsed_url = urlparse(url)
domain = f"{parsed_url.scheme}://{parsed_url.netloc}" domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
h.baseurl = domain h.baseurl = domain