mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
add example
This commit is contained in:
parent
f5cbd80c97
commit
4d42d7bfc6
47
examples/local_models/json_scraper_multi_ollama.py
Normal file
47
examples/local_models/json_scraper_multi_ollama.py
Normal file
@ -0,0 +1,47 @@
|
||||
"""
|
||||
Module for showing how PDFScraper multi works
|
||||
"""
|
||||
import os
|
||||
from scrapegraphai.graphs import PdfScraperMultiGraph
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "ollama/llama3",
|
||||
"temperature": 0,
|
||||
"format": "json", # Ollama needs the format to be specified explicitly
|
||||
"model_tokens": 4000,
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
},
|
||||
"verbose": True,
|
||||
"headless": False,
|
||||
}
|
||||
FILE_NAME = "inputs/example.json"
|
||||
curr_dir = os.path.dirname(os.path.realpath(__file__))
|
||||
file_path = os.path.join(curr_dir, FILE_NAME)
|
||||
|
||||
with open(file_path, 'r', encoding="utf-8") as file:
|
||||
text = file.read()
|
||||
|
||||
|
||||
json_scraper_graph = JSONScraperGraph(
|
||||
prompt="List me all the authors, title and genres of the books",
|
||||
source=text, # Pass the content of the file, not the file object
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
|
||||
|
||||
results = []
|
||||
for source in sources:
|
||||
pdf_scraper_graph = PdfScraperMultiGraph(
|
||||
prompt=prompt,
|
||||
source=source,
|
||||
config=graph_config
|
||||
)
|
||||
result = pdf_scraper_graph.run()
|
||||
results.append(result)
|
||||
|
||||
print(results)
|
||||
@ -1,6 +1,7 @@
|
||||
"""
|
||||
Module for showing how PDFScraper multi works
|
||||
"""
|
||||
import json
|
||||
from scrapegraphai.graphs import PdfScraperMultiGraph
|
||||
|
||||
graph_config = {
|
||||
@ -56,14 +57,16 @@ Independent Variable (IV): Exposure to social media.
|
||||
Dependent Variable (DV): Mental health outcomes.
|
||||
Exogenous Shock: staggered introduction of Facebook across U.S. colleges.
|
||||
"""
|
||||
results = []
|
||||
for source in sources:
|
||||
pdf_scraper_graph = PdfScraperMultiGraph(
|
||||
prompt=prompt,
|
||||
source=source,
|
||||
config=graph_config
|
||||
)
|
||||
result = pdf_scraper_graph.run()
|
||||
results.append(result)
|
||||
# *******************************************************
|
||||
# Create the SmartScraperMultiGraph instance and run it
|
||||
# *******************************************************
|
||||
|
||||
print(results)
|
||||
multiple_search_graph = PdfScraperMultiGraph(
|
||||
prompt=prompt,
|
||||
source= sources,
|
||||
schema=None,
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = multiple_search_graph.run()
|
||||
print(json.dumps(result, indent=4))
|
||||
|
||||
@ -2,7 +2,8 @@
|
||||
Basic example of scraping pipeline using SmartScraper
|
||||
"""
|
||||
|
||||
import os, json
|
||||
import os
|
||||
import json
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import SmartScraperMultiGraph
|
||||
|
||||
|
||||
@ -17,3 +17,4 @@ from .omni_scraper_graph import OmniScraperGraph
|
||||
from .omni_search_graph import OmniSearchGraph
|
||||
from .smart_scraper_multi_graph import SmartScraperMultiGraph
|
||||
from .pdf_scraper_multi import PdfScraperMultiGraph
|
||||
from .json_scraper_multi import JsonScraperMultiGraph
|
||||
|
||||
116
scrapegraphai/graphs/json_scraper_multi.py
Normal file
116
scrapegraphai/graphs/json_scraper_multi.py
Normal file
@ -0,0 +1,116 @@
|
||||
"""
|
||||
JsonScraperMultiGraph Module
|
||||
"""
|
||||
|
||||
from copy import copy, deepcopy
|
||||
from typing import List, Optional
|
||||
|
||||
from .base_graph import BaseGraph
|
||||
from .abstract_graph import AbstractGraph
|
||||
from .json_scraper_graph import JSONScraperGraph
|
||||
|
||||
from ..nodes import (
|
||||
GraphIteratorNode,
|
||||
MergeAnswersNode
|
||||
)
|
||||
|
||||
|
||||
class JsonScraperMultiGraph(AbstractGraph):
    """
    JsonScraperMultiGraph is a scraping pipeline that runs a JSONScraperGraph over
    every entry of a list of JSON sources and merges the individual results into a
    single answer to the given prompt.
    It only requires a user prompt and a list of JSON sources.

    Attributes:
        max_results (int): Value of the "max_results" config key (3 when absent).
            It is stored here but not read anywhere else in this class.
        copy_config (dict): Copy of the configuration taken before the base class
            is initialised; it is the config given to the inner JSONScraperGraph.

        The remaining attributes used below (prompt, source, schema, llm_model,
        graph, final_state, execution_info) are not assigned in this class apart
        from the last two; they are presumably provided by AbstractGraph.

    Args:
        prompt (str): The user prompt to answer from the JSON sources.
        source (List[str]): The JSON sources to iterate over.
        config (dict): Configuration parameters for the graph.
        schema (Optional[str]): The schema for the graph output.

    Example:
        >>> json_multi_graph = JsonScraperMultiGraph(
        ...     prompt="List me all the authors, title and genres of the books",
        ...     source=[first_json, second_json],
        ...     config={"llm": {"model": "gpt-3.5-turbo"}},
        ... )
        >>> result = json_multi_graph.run()
    """

    def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None):

        self.max_results = config.get("max_results", 3)

        # Keep a private copy of the config for the inner graph. A shallow copy is
        # enough when every top-level value is a string; otherwise nested values
        # are deep-copied so the inner graph does not share them with the caller.
        # NOTE(review): taken before super().__init__, presumably so the copy is
        # unaffected by whatever the base initialiser does -- confirm.
        if all(isinstance(value, str) for value in config.values()):
            self.copy_config = copy(config)
        else:
            self.copy_config = deepcopy(config)

        super().__init__(prompt, config, source, schema)

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for scraping
        several JSON sources and merging the answers.

        Returns:
            BaseGraph: A two-node graph: a GraphIteratorNode followed by a
            MergeAnswersNode.
        """

        # ************************************************
        # Create a JSONScraperGraph instance
        # ************************************************

        # Template graph handed to the iterator node; prompt and source are left
        # empty here because the real values are supplied as inputs in run().
        json_scraper_instance = JSONScraperGraph(
            prompt="",
            source="",
            config=self.copy_config,
        )

        # ************************************************
        # Define the graph nodes
        # ************************************************

        graph_iterator_node = GraphIteratorNode(
            input="user_prompt & jsons",
            output=["results"],
            node_config={
                "graph_instance": json_scraper_instance,
            }
        )

        merge_answers_node = MergeAnswersNode(
            input="user_prompt & results",
            output=["answer"],
            node_config={
                "llm_model": self.llm_model,
                "schema": self.schema
            }
        )

        return BaseGraph(
            nodes=[
                graph_iterator_node,
                merge_answers_node,
            ],
            edges=[
                (graph_iterator_node, merge_answers_node),
            ],
            entry_point=graph_iterator_node
        )

    def run(self) -> str:
        """
        Executes the graph on the stored prompt and JSON sources.

        Returns:
            str: The value stored under "answer" in the final state, or
            "No answer found." when that key is missing.
        """
        inputs = {"user_prompt": self.prompt, "jsons": self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")
|
||||
Loading…
Reference in New Issue
Block a user