From 1d41f6eafe8ed0e191bb6a258d54c6388ff283c6 Mon Sep 17 00:00:00 2001 From: Marco Perini Date: Tue, 23 Jul 2024 19:27:18 +0200 Subject: [PATCH] fix: md conversion --- scrapegraphai/nodes/fetch_node.py | 14 +++++++------- scrapegraphai/utils/convert_to_md.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 56366677..5f55b6e6 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -163,10 +163,10 @@ class FetchNode(BaseNode): if not source.strip(): raise ValueError("No HTML body content found in the local source.") - parsed_content = source - - if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator: + if (not self.script_creator) or (self.force and not self.script_creator): parsed_content = convert_to_md(source) + else: + parsed_content = source compressed_document = [ Document(page_content=parsed_content, metadata={"source": "local_dir"}) @@ -184,8 +184,8 @@ class FetchNode(BaseNode): if not self.cut: parsed_content = cleanup_html(response, source) - if (isinstance(self.llm_model, OpenAI) and not self.script_creator) or (self.force and not self.script_creator): - parsed_content = convert_to_md(source, input_data[0]) + if (not self.script_creator) or (self.force and not self.script_creator): + parsed_content = convert_to_md(parsed_content, source) compressed_document = [Document(page_content=parsed_content)] else: self.logger.warning( @@ -206,9 +206,9 @@ class FetchNode(BaseNode): raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.") parsed_content = document[0].page_content - if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled: + if (not self.script_creator) or (self.force and not self.script_creator and not self.openai_md_enabled): - parsed_content = convert_to_md(document[0].page_content, input_data[0]) + parsed_content = convert_to_md(document[0].page_content, source) compressed_document = [ diff --git a/scrapegraphai/utils/convert_to_md.py b/scrapegraphai/utils/convert_to_md.py index c9961ae5..ece41c60 100644 --- a/scrapegraphai/utils/convert_to_md.py +++ b/scrapegraphai/utils/convert_to_md.py @@ -23,7 +23,7 @@ def convert_to_md(html: str, url: str = None) -> str: h = html2text.HTML2Text() h.ignore_links = False h.body_width = 0 - if url: + if url is not None: parsed_url = urlparse(url) domain = f"{parsed_url.scheme}://{parsed_url.netloc}" h.baseurl = domain