updated SmartScraperGraph with rag

This commit is contained in:
Perinim 2024-02-21 23:52:45 +01:00
parent a8b9b9e619
commit 62cc9170c8
4 changed files with 12 additions and 145 deletions

View File

@ -1,48 +0,0 @@
"""
Example of custom graph using existing nodes
"""
import os
from dotenv import load_dotenv
from scrapegraphai.models import OpenAI
from scrapegraphai.graphs import BaseGraph
from scrapegraphai.nodes import FetchHTMLNode, RAGNode, GenerateAnswerNode
# Load the .env file first so the OPENAI_APIKEY lookup below can succeed.
load_dotenv()

# Settings passed to the OpenAI model wrapper.
openai_key = os.getenv("OPENAI_APIKEY")
llm_config = dict(
    api_key=openai_key,
    model_name="gpt-3.5-turbo",
    temperature=0,
    streaming=True,
)
model = OpenAI(llm_config)

# Instantiate the three nodes; the RAG and answer nodes share the same model.
fetch_html_node = FetchHTMLNode("fetch_html")
rag_node = RAGNode(model, "rag")
generate_answer_node = GenerateAnswerNode(model, "generate_answer")

# Wire the nodes as fetch_html -> rag -> generate_answer, starting at the
# fetch node. Both collections are sets, exactly as BaseGraph is given them
# elsewhere in this example.
pipeline_nodes = {fetch_html_node, rag_node, generate_answer_node}
pipeline_edges = {
    (fetch_html_node, rag_node),
    (rag_node, generate_answer_node),
}
graph = BaseGraph(
    nodes=pipeline_nodes,
    edges=pipeline_edges,
    entry_point=fetch_html_node,
)

# Run the graph on a prompt/URL pair.
inputs = {
    "user_input": "Give me the summary of the page",
    "url": "https://python.langchain.com/docs/expression_language/how_to/map",
}
result = graph.execute(inputs)

# The result is read like a mapping; fall back to a fixed message when the
# "answer" key is absent.
answer = result.get("answer", "No answer found.")
print(answer)

View File

@ -1,80 +0,0 @@
from langchain_community.document_transformers import Html2TextTransformer
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain_openai import OpenAI
from langchain_community.vectorstores import FAISS
# Load the .env file into the environment before OPENAI_APIKEY is read below.
load_dotenv()
# Helper function for printing docs
def pretty_print_docs(docs):
    """Print each document's text, numbered from 1 and separated by a rule.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Every entry is written as ``Document <n>:``, a blank line, then the
    content. Consecutive entries are separated by a line of 100 dashes.
    An empty iterable prints a single empty line.
    """
    separator = f"\n{'-' * 100}\n"
    print(
        separator.join(
            # Plain concatenation (not an f-string) so non-string content
            # still raises TypeError, as before.
            f"Document {number}:\n\n" + d.page_content
            for number, d in enumerate(docs, start=1)
        )
    )
# Define the configuration for the language model
openai_key = os.getenv("OPENAI_APIKEY")

# Download the page to query.
# html2text 2020.1.16
urls = ["https://www.mymovies.it/cinema/roma"]
loader = AsyncHtmlLoader(urls)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=4000,
    chunk_overlap=200,
)

# Convert the HTML to text and chunk the *transformed* document. Reading from
# the transformer's return value (rather than the loader output) keeps this
# correct whether or not transform_documents mutates its input in place.
docs_transformed = Html2TextTransformer().transform_documents(docs)
doc = docs_transformed[0]
chunks = text_splitter.split_text(doc.page_content)

# Wrap each chunk in a Document tagged with its 1-based position.
chunked_docs = [
    Document(page_content=chunk, metadata={"chunk": position})
    for position, chunk in enumerate(chunks, start=1)
]

# One embeddings client is shared by the vector store and both filters.
embeddings = OpenAIEmbeddings(api_key=openai_key)  # could be any embedding of your choice
retriever = FAISS.from_documents(chunked_docs, embeddings).as_retriever()

redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
relevant_filter = EmbeddingsFilter(embeddings=embeddings)  # similarity_threshold could be set, now k=20

# The filters run in list order: redundant_filter first, then relevant_filter.
pipeline_compressor = DocumentCompressorPipeline(
    transformers=[redundant_filter, relevant_filter]
)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=pipeline_compressor, base_retriever=retriever
)

compressed_docs = compression_retriever.get_relevant_documents(
    "Dammi i nomi dei cinema in provincia di Roma"
)
pretty_print_docs(compressed_docs)

View File

@ -15,8 +15,12 @@ llm_config = {
"model_name": "gpt-3.5-turbo",
}
smart_scraper_graph = SmartScraperGraph("List me all the titles and project descriptions",
"https://perinim.github.io/projects/", llm_config)
# Define URL and prompt
url = "https://perinim.github.io/projects/"
prompt = "List me all the titles and project descriptions"
# Create the SmartScraperGraph instance
smart_scraper_graph = SmartScraperGraph(prompt, url, llm_config)
answer = smart_scraper_graph.run()
print(answer)

View File

@ -2,10 +2,8 @@ from ..models import OpenAI
from .base_graph import BaseGraph
from ..nodes import (
FetchHTMLNode,
ConditionalNode,
GetProbableTagsNode,
GenerateAnswerNode,
ParseHTMLNode
RAGNode,
GenerateAnswerNode
)
@ -74,25 +72,18 @@ class SmartScraperGraph:
BaseGraph: An instance of the BaseGraph class.
"""
fetch_html_node = FetchHTMLNode("fetch_html")
get_probable_tags_node = GetProbableTagsNode(
self.llm, "get_probable_tags")
parse_document_node = ParseHTMLNode("parse_document")
rag_node = RAGNode(self.llm, "rag")
generate_answer_node = GenerateAnswerNode(self.llm, "generate_answer")
conditional_node = ConditionalNode(
"conditional", [parse_document_node, generate_answer_node])
return BaseGraph(
nodes={
fetch_html_node,
get_probable_tags_node,
conditional_node,
parse_document_node,
rag_node,
generate_answer_node,
},
edges={
(fetch_html_node, get_probable_tags_node),
(get_probable_tags_node, conditional_node),
(parse_document_node, generate_answer_node)
(fetch_html_node, rag_node),
(rag_node, generate_answer_node)
},
entry_point=fetch_html_node
)