"""
Module for showing how JsonScraperMultiGraph works with local Ollama models
"""
import json
import os

from scrapegraphai.graphs import JsonScraperMultiGraph

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "model": "ollama/llama3",
        "temperature": 0,
        "format": "json",  # Ollama needs the format to be specified explicitly
        "model_tokens": 4000,
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
    },
    "verbose": True,
    "headless": False,
}

# ************************************************
# Read the JSON document used as input
# ************************************************

FILE_NAME = "inputs/example.json"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()

prompt = "List me all the authors, title and genres of the books"

# JsonScraperMultiGraph takes a list of JSON documents (their content, not
# file objects). Only one sample file ships with the example, so it is passed
# twice to exercise the multi-source path.
sources = [text, text]

# *******************************************************
# Create the JsonScraperMultiGraph instance and run it
# *******************************************************

json_scraper_multi_graph = JsonScraperMultiGraph(
    prompt=prompt,
    source=sources,
    config=graph_config,
)

result = json_scraper_multi_graph.run()
print(json.dumps(result, indent=4))
""" -results = [] -for source in sources: - pdf_scraper_graph = PdfScraperMultiGraph( - prompt=prompt, - source=source, - config=graph_config - ) - result = pdf_scraper_graph.run() - results.append(result) +# ******************************************************* +# Create the SmartScraperMultiGraph instance and run it +# ******************************************************* -print(results) +multiple_search_graph = PdfScraperMultiGraph( + prompt=prompt, + source= sources, + schema=None, + config=graph_config +) + +result = multiple_search_graph.run() +print(json.dumps(result, indent=4)) diff --git a/examples/openai/smart_scraper_multi_openai.py b/examples/openai/smart_scraper_multi_openai.py index ddfc6239..504e00a8 100644 --- a/examples/openai/smart_scraper_multi_openai.py +++ b/examples/openai/smart_scraper_multi_openai.py @@ -2,7 +2,8 @@ Basic example of scraping pipeline using SmartScraper """ -import os, json +import os +import json from dotenv import load_dotenv from scrapegraphai.graphs import SmartScraperMultiGraph diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index b572905e..b70686a7 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -17,3 +17,4 @@ from .omni_scraper_graph import OmniScraperGraph from .omni_search_graph import OmniSearchGraph from .smart_scraper_multi_graph import SmartScraperMultiGraph from .pdf_scraper_multi import PdfScraperMultiGraph +from .json_scraper_multi import JsonScraperMultiGraph diff --git a/scrapegraphai/graphs/json_scraper_multi.py b/scrapegraphai/graphs/json_scraper_multi.py new file mode 100644 index 00000000..c7632d79 --- /dev/null +++ b/scrapegraphai/graphs/json_scraper_multi.py @@ -0,0 +1,116 @@ +""" +JsonScraperMultiGraph Module +""" + +from copy import copy, deepcopy +from typing import List, Optional + +from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph +from .json_scraper_graph import JSONScraperGraph + +from ..nodes 
from ..nodes import (
    GraphIteratorNode,
    MergeAnswersNode,
)


class JsonScraperMultiGraph(AbstractGraph):
    """
    Pipeline that answers one prompt against several JSON sources.

    The graph has two nodes: a GraphIteratorNode configured with a
    JSONScraperGraph instance, followed by a MergeAnswersNode that writes the
    final "answer" entry of the state.

    Attributes:
        max_results (int): Value of config["max_results"], 3 when absent.
        copy_config (dict): Private copy of the configuration that is handed
            to the inner JSONScraperGraph.
        (prompt, source, schema, llm_model and graph are read by this class
        but not assigned here; presumably AbstractGraph provides them.)

    Args:
        prompt (str): The question to answer from the JSON sources.
        source (List[str]): The JSON sources, forwarded to the graph under
            the "jsons" state key.
        config (dict): Configuration parameters for the graph.
        schema (Optional[str]): The schema for the graph output.

    Example:
        >>> json_multi_graph = JsonScraperMultiGraph(
        ...     prompt="List me all the authors, title and genres of the books",
        ...     source=[first_json_text, second_json_text],
        ...     config={"llm": {"model": "ollama/llama3"}},
        ... )
        >>> answer = json_multi_graph.run()
    """

    def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None):
        self.max_results = config.get("max_results", 3)

        # A shallow copy is used only when every top-level value is a plain
        # string; any other value type triggers a deep copy.
        only_string_values = all(isinstance(entry, str) for entry in config.values())
        duplicate = copy if only_string_values else deepcopy
        self.copy_config = duplicate(config)

        # NOTE(review): copy_config is assigned before the base initialiser,
        # presumably because AbstractGraph.__init__ builds the graph through
        # _create_graph, which reads it -- confirm before reordering.
        super().__init__(prompt, config, source, schema)

    def _create_graph(self) -> BaseGraph:
        """
        Assemble the iterate-then-merge workflow.

        Returns:
            BaseGraph: Graph whose entry point is the iterator node, with a
            single edge leading to the merge node.
        """

        # ************************************************
        # Create the JSONScraperGraph instance given to the iterator node
        # ************************************************

        json_scraper_template = JSONScraperGraph(
            prompt="",
            source="",
            config=self.copy_config,
        )

        # ************************************************
        # Define the graph nodes
        # ************************************************

        iterate_over_jsons = GraphIteratorNode(
            input="user_prompt & jsons",
            output=["results"],
            node_config={"graph_instance": json_scraper_template},
        )

        merge_results = MergeAnswersNode(
            input="user_prompt & results",
            output=["answer"],
            node_config={"llm_model": self.llm_model, "schema": self.schema},
        )

        # ************************************************
        # Wire the nodes together
        # ************************************************

        return BaseGraph(
            nodes=[iterate_over_jsons, merge_results],
            edges=[(iterate_over_jsons, merge_results)],
            entry_point=iterate_over_jsons,
        )

    def run(self) -> str:
        """
        Execute the graph on the stored prompt and JSON sources.

        Returns:
            str: The "answer" entry of the final state, or
            "No answer found." when that entry is missing.
        """
        initial_state = {"user_prompt": self.prompt, "jsons": self.source}
        outcome = self.graph.execute(initial_state)
        self.final_state, self.execution_info = outcome

        return self.final_state.get("answer", "No answer found.")