Scrapegraph-ai/examples/example_multichunks.py
2024-02-21 18:17:34 +01:00

81 lines
2.6 KiB
Python

from langchain_community.document_transformers import Html2TextTransformer
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain_openai import OpenAI
from langchain_community.vectorstores import FAISS
# Populate the process environment from a .env file (python-dotenv) before
# any os.getenv lookups later in this script.
load_dotenv()
# Helper function for printing docs
def pretty_print_docs(docs):
    """Print each document's text, numbered from 1, separated by a dashed rule.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string
            attribute (e.g. LangChain ``Document`` instances).

    The output is a single ``print`` call: every entry is rendered as
    ``"Document <n>:"``, a blank line, then the document text; entries are
    joined by a line of 100 dashes. An empty iterable prints one empty line.
    """
    separator = f"\n{'-' * 100}\n"
    rendered = (
        f"Document {position}:\n\n" + item.page_content
        for position, item in enumerate(docs, start=1)
    )
    print(separator.join(rendered))
# --- Configuration -----------------------------------------------------------
# The OpenAI key is read from the environment under the name OPENAI_APIKEY.
openai_key = os.getenv("OPENAI_APIKEY")

# Page to scrape (AsyncHtmlLoader takes a list of URLs) and the question that
# the retrieved chunks should answer.
urls = ["https://www.mymovies.it/cinema/roma"]
query = "Dammi i nomi dei cinema in provincia di Roma"

# --- Load the page and convert HTML to plain text ----------------------------
loader = AsyncHtmlLoader(urls)
docs = loader.load()

# Html2TextTransformer relies on the `html2text` package
# (version noted by the original author: 2020.1.16).
docs_transformed = Html2TextTransformer().transform_documents(docs)

# Chunk the *transformed* document. Reading docs[0] instead would only yield
# plain text if the transformer happened to mutate its input in place; the
# returned sequence is the documented result and is correct either way.
doc = docs_transformed[0]

# --- Split the text into token-bounded chunks --------------------------------
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=4000,
    chunk_overlap=200,
)
chunks = text_splitter.split_text(doc.page_content)

# Wrap every chunk in a Document, recording its 1-based position as metadata.
chunked_docs = [
    Document(page_content=chunk_text, metadata={"chunk": chunk_number})
    for chunk_number, chunk_text in enumerate(chunks, start=1)
]

# Fail with a clear message instead of an obscure error from the vector store
# when the page produced no text at all.
if not chunked_docs:
    raise SystemExit(f"No text could be extracted from {urls[0]}")

# --- Build the retriever ------------------------------------------------------
# One embeddings client is shared by the vector store and both filters
# (could be any embedding of your choice).
embeddings = OpenAIEmbeddings(api_key=openai_key)
retriever = FAISS.from_documents(chunked_docs, embeddings).as_retriever()

# Compression pipeline: first drop near-duplicate chunks, then keep the chunks
# most similar to the query. No similarity_threshold is passed, so the filter
# uses its default top-k selection (k=20 per the original author's note).
redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
relevant_filter = EmbeddingsFilter(embeddings=embeddings)
pipeline_compressor = DocumentCompressorPipeline(
    transformers=[redundant_filter, relevant_filter]
)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=pipeline_compressor, base_retriever=retriever
)

# --- Query and display --------------------------------------------------------
compressed_docs = compression_retriever.get_relevant_documents(query)
pretty_print_docs(compressed_docs)