mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
136 lines
5.3 KiB
Python
136 lines
5.3 KiB
Python
"""
|
|
Module defining ParseHTMLNode, a graph node that parses HTML documents
|
|
"""
|
|
from langchain_community.document_transformers import BeautifulSoupTransformer
|
|
from langchain.docstore.document import Document
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
from langchain_community.vectorstores import FAISS
|
|
from langchain_community.document_transformers import (
|
|
Html2TextTransformer,
|
|
EmbeddingsRedundantFilter
|
|
)
|
|
from langchain_openai import OpenAIEmbeddings
|
|
from langchain.retrievers import ContextualCompressionRetriever
|
|
from langchain.retrievers.document_compressors import EmbeddingsFilter, DocumentCompressorPipeline
|
|
from .base_node import BaseNode
|
|
|
|
|
|
class ParseHTMLNode(BaseNode):
    """
    A node that converts an HTML document to text, measures it in token-sized
    chunks, and prepares the state for the nodes that follow.

    What the node writes depends on how many chunks the text splits into:

    * At most one chunk: if the state carries 'tags', the document is filtered
      with BeautifulSoupTransformer and stored under 'parsed_document';
      without tags the state is returned unchanged.
    * More than one chunk: the chunks are embedded, indexed in a FAISS
      retriever and filtered against the user's prompt; the surviving chunks
      are stored under 'relevant_chunks'.

    Attributes:
        llm: The object passed at construction time. This class only reads
            its `openai_api_key` attribute, to build OpenAI embeddings.

    Args:
        llm: Object exposing an `openai_api_key` attribute.
        node_name (str, optional): The unique identifier name for the node.
            Defaults to "ParseHTMLNode".

    Methods:
        execute(state): Parses or compresses the document held in the state
            and returns the updated state.
    """

    def __init__(self, llm, node_name="ParseHTMLNode"):
        """
        Initializes the ParseHTMLNode with an LLM handle and a node name.
        """
        # "parseHTML" is forwarded as the second BaseNode argument.
        # NOTE(review): how BaseNode interprets it is not visible here.
        super().__init__(node_name, "parseHTML")
        self.llm = llm

    def execute(self, state):
        """
        Executes the node's logic on the document stored in the state.

        The first document is converted to plain text and split into chunks of
        up to 4000 tokens. A document that fits in a single chunk (or yields
        no chunk at all) is optionally filtered by tags; a larger one is
        reduced to the chunks relevant to the user's prompt.

        Args:
            state (dict): The current state of the graph. Must contain
                'document' (a sequence of documents); may contain 'tags' for
                targeted parsing; must contain 'user_input' when the document
                splits into more than one chunk.

        Returns:
            dict: The same state object, with 'parsed_document' added when
            tags were applied, 'relevant_chunks' added when the document was
            chunked and compressed, or nothing added when the document is
            small and no tags were provided.

        Raises:
            KeyError: If 'document' is missing from the state, or if
                'user_input' is missing when the document needs compression.
        """
        print("---PARSE HTML DOCUMENT---")
        try:
            document = state["document"]
        except KeyError as e:
            print(f"Error: {e} not found in state.")
            raise

        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=4000,
            chunk_overlap=0,
        )

        # Only the first transformed document is measured and chunked.
        html_to_text = Html2TextTransformer()
        docs_transformed = html_to_text.transform_documents(document)[0]

        chunks = text_splitter.split_text(docs_transformed.page_content)
        print("----------")
        print(len(chunks))

        # An empty chunk list (blank page) has nothing to embed or index, so
        # it takes the small-document path together with the one-chunk case.
        if len(chunks) <= 1:
            return self._parse_with_tags(state, document)
        return self._store_relevant_chunks(state, chunks)

    def _parse_with_tags(self, state, document):
        """
        Stores the tag-filtered document under 'parsed_document' when the
        state provides 'tags'; otherwise returns the state unchanged.
        """
        tags = state.get("tags", None)
        if not tags:
            print("No specific tags provided; returning document as is.")
            return state

        bs_transformer = BeautifulSoupTransformer()
        parsed_document = bs_transformer.transform_documents(
            document, tags_to_extract=tags)
        print("Document parsed with specified tags.")
        state.update({"parsed_document": parsed_document})
        return state

    def _store_relevant_chunks(self, state, chunks):
        """
        Indexes the text chunks, keeps those relevant to state['user_input'],
        and stores them under 'relevant_chunks'.
        """
        try:
            user_input = state["user_input"]
        except KeyError as e:
            print(f"Error: {e} not found in state.")
            raise

        # Chunk numbers in the metadata are 1-based.
        chunked_docs = [
            Document(page_content=chunk, metadata={"chunk": number})
            for number, chunk in enumerate(chunks, start=1)
        ]

        # could be any embedding of your choice; one instance is shared by the
        # vector store and both filters since they use the same API key.
        embeddings = OpenAIEmbeddings(api_key=self.llm.openai_api_key)
        retriever = FAISS.from_documents(
            chunked_docs, embeddings).as_retriever()

        redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
        # similarity_threshold could be set, now k=20
        relevant_filter = EmbeddingsFilter(embeddings=embeddings)
        pipeline_compressor = DocumentCompressorPipeline(
            transformers=[redundant_filter, relevant_filter]
        )

        compression_retriever = ContextualCompressionRetriever(
            base_compressor=pipeline_compressor, base_retriever=retriever
        )

        compressed_docs = compression_retriever.get_relevant_documents(
            user_input)
        print("Documents compressed and stored in a vector database.")
        state.update({"relevant_chunks": compressed_docs})
        return state
|