mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
updated SmartScraperGraph with rag
This commit is contained in:
parent
a8b9b9e619
commit
62cc9170c8
@ -1,48 +0,0 @@
|
||||
"""
|
||||
Example of custom graph using existing nodes
|
||||
"""
|
||||
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.models import OpenAI
|
||||
from scrapegraphai.graphs import BaseGraph
|
||||
from scrapegraphai.nodes import FetchHTMLNode, RAGNode, GenerateAnswerNode
|
||||
|
||||
load_dotenv()
|
||||
|
||||
# Language-model settings; the API key is read from the environment
# (populated by load_dotenv() above).
openai_key = os.getenv("OPENAI_APIKEY")
llm_config = dict(
    api_key=openai_key,
    model_name="gpt-3.5-turbo",
    temperature=0,
    streaming=True,
)
model = OpenAI(llm_config)

# Instantiate the three nodes that make up the pipeline.
fetch_html_node = FetchHTMLNode("fetch_html")
rag_node = RAGNode(model, "rag")
generate_answer_node = GenerateAnswerNode(model, "generate_answer")

# Wire the nodes together: fetch_html -> rag -> generate_answer.
graph_nodes = {fetch_html_node, rag_node, generate_answer_node}
graph_edges = {
    (fetch_html_node, rag_node),
    (rag_node, generate_answer_node),
}
graph = BaseGraph(
    nodes=graph_nodes,
    edges=graph_edges,
    entry_point=fetch_html_node,
)

# Run the graph on a prompt/URL pair.
inputs = {
    "user_input": "Give me the summary of the page",
    "url": "https://python.langchain.com/docs/expression_language/how_to/map",
}
result = graph.execute(inputs)

# Print the "answer" entry of the result, or a fallback message if absent.
answer = result.get("answer", "No answer found.")
print(answer)
|
||||
@ -1,80 +0,0 @@
|
||||
from langchain_community.document_transformers import Html2TextTransformer
|
||||
from langchain_community.document_loaders import AsyncHtmlLoader
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from langchain.docstore.document import Document
|
||||
from langchain_openai import OpenAIEmbeddings
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from langchain.retrievers import ContextualCompressionRetriever
|
||||
from langchain.retrievers.document_compressors import EmbeddingsFilter
|
||||
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
|
||||
from langchain_community.document_transformers import EmbeddingsRedundantFilter
|
||||
from langchain_openai import OpenAI
|
||||
from langchain_community.vectorstores import FAISS
|
||||
|
||||
load_dotenv()
|
||||
|
||||
# Helper function for printing docs
|
||||
|
||||
def pretty_print_docs(docs):
    """Print the text of each document, numbered from 1.

    Each entry is rendered as ``Document <n>:`` followed by a blank line and
    the document's ``page_content``. Consecutive entries are separated by a
    line of 100 dashes. An empty ``docs`` prints a single empty line.

    Args:
        docs: Iterable of objects exposing a string ``page_content``
            attribute.
    """
    separator = f"\n{'-' * 100}\n"
    print(
        separator.join(
            f"Document {index}:\n\n" + doc.page_content
            for index, doc in enumerate(docs, start=1)
        )
    )
|
||||
|
||||
# Define the configuration for the language model
openai_key = os.getenv("OPENAI_APIKEY")

# chroma = Chroma('test', OpenAIEmbeddings(api_key=openai_key))

# html2text 2020.1.16
urls = ["https://www.mymovies.it/cinema/roma"]
loader = AsyncHtmlLoader(urls)
docs = loader.load()
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=4000,
    chunk_overlap=200,
)

# Run the loaded pages through the HTML-to-text transformer before chunking.
docs_transformed = Html2TextTransformer().transform_documents(docs)

# Chunk the transformer's output rather than the untransformed entry in
# `docs`, so the transformation above is actually used downstream.
doc = docs_transformed[0]

chunks = text_splitter.split_text(doc.page_content)

# Wrap each chunk in a Document tagged with its 1-based position.
chunked_docs = [
    Document(page_content=chunk, metadata={"chunk": position})
    for position, chunk in enumerate(chunks, start=1)
]

# A single embeddings client is shared by the vector store and both filters.
embeddings = OpenAIEmbeddings(api_key=openai_key)  # could be any embedding of your choice
retriever = FAISS.from_documents(chunked_docs, embeddings).as_retriever()

redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
relevant_filter = EmbeddingsFilter(embeddings=embeddings)  # similarity_threshold could be set, now k=20
pipeline_compressor = DocumentCompressorPipeline(
    transformers=[redundant_filter, relevant_filter]
)

# Retrieve from the FAISS index, then pass the hits through the filter pipeline.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=pipeline_compressor, base_retriever=retriever
)

compressed_docs = compression_retriever.get_relevant_documents(
    "Dammi i nomi dei cinema in provincia di Roma"
)

pretty_print_docs(compressed_docs)
|
||||
|
||||
# db = Chroma.from_documents(chunked_docs, OpenAIEmbeddings(api_key=openai_key))
|
||||
# chroma.similarity_search_with_relevance_scores('Find the cinema name', 10)
|
||||
|
||||
@ -15,8 +15,12 @@ llm_config = {
|
||||
"model_name": "gpt-3.5-turbo",
|
||||
}
|
||||
|
||||
smart_scraper_graph = SmartScraperGraph("List me all the titles and project descriptions",
|
||||
"https://perinim.github.io/projects/", llm_config)
|
||||
# Define URL and prompt
|
||||
url = "https://perinim.github.io/projects/"
|
||||
prompt = "List me all the titles and project descriptions"
|
||||
|
||||
# Create the SmartScraperGraph instance
|
||||
smart_scraper_graph = SmartScraperGraph(prompt, url, llm_config)
|
||||
|
||||
answer = smart_scraper_graph.run()
|
||||
print(answer)
|
||||
|
||||
@ -2,10 +2,8 @@ from ..models import OpenAI
|
||||
from .base_graph import BaseGraph
|
||||
from ..nodes import (
|
||||
FetchHTMLNode,
|
||||
ConditionalNode,
|
||||
GetProbableTagsNode,
|
||||
GenerateAnswerNode,
|
||||
ParseHTMLNode
|
||||
RAGNode,
|
||||
GenerateAnswerNode
|
||||
)
|
||||
|
||||
|
||||
@ -74,25 +72,18 @@ class SmartScraperGraph:
|
||||
BaseGraph: An instance of the BaseGraph class.
|
||||
"""
|
||||
fetch_html_node = FetchHTMLNode("fetch_html")
|
||||
get_probable_tags_node = GetProbableTagsNode(
|
||||
self.llm, "get_probable_tags")
|
||||
parse_document_node = ParseHTMLNode("parse_document")
|
||||
rag_node = RAGNode(self.llm, "rag")
|
||||
generate_answer_node = GenerateAnswerNode(self.llm, "generate_answer")
|
||||
conditional_node = ConditionalNode(
|
||||
"conditional", [parse_document_node, generate_answer_node])
|
||||
|
||||
return BaseGraph(
|
||||
nodes={
|
||||
fetch_html_node,
|
||||
get_probable_tags_node,
|
||||
conditional_node,
|
||||
parse_document_node,
|
||||
rag_node,
|
||||
generate_answer_node,
|
||||
},
|
||||
edges={
|
||||
(fetch_html_node, get_probable_tags_node),
|
||||
(get_probable_tags_node, conditional_node),
|
||||
(parse_document_node, generate_answer_node)
|
||||
(fetch_html_node, rag_node),
|
||||
(rag_node, generate_answer_node)
|
||||
},
|
||||
entry_point=fetch_html_node
|
||||
)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user