diff --git a/examples/openai/inputs/markdown_example.md b/examples/openai/inputs/markdown_example.md new file mode 100644 index 00000000..85088f29 --- /dev/null +++ b/examples/openai/inputs/markdown_example.md @@ -0,0 +1,35 @@ +Marco Perini Toggle navigation + + * About + * Projects(current) + +Projects + +Competitions + + * CV + * ____ + +# Projects + + ![project thumbnail Rotary Pendulum RL +Open Source project aimed at controlling a real life rotary pendulum using RL +algorithms ](/projects/rotary-pendulum-rl/) + + ![project thumbnail DQN +Implementation from scratch Developed a Deep Q-Network algorithm to train a +simple and double pendulum ](https://github.com/PeriniM/DQN-SwingUp) + + ![project thumbnail Multi Agents HAED +University project which focuses on simulating a multi-agent system to perform +environment mapping. Agents, equipped with sensors, explore and record their +surroundings, considering uncertainties in their readings. +](https://github.com/PeriniM/Multi-Agents-HAED) + + ![project thumbnail Wireless ESC for Modular +Drones Modular drone architecture proposal and proof of concept. The project +received maximum grade. ](/projects/wireless-esc-drone/) + +© Copyright 2023 Marco Perini. Powered by Jekyll with +al-folio theme. Hosted by [GitHub +Pages](https://pages.github.com/). 
\ No newline at end of file
diff --git a/examples/openai/md_scraper_openai.py b/examples/openai/md_scraper_openai.py
new file mode 100644
index 00000000..7a163137
--- /dev/null
+++ b/examples/openai/md_scraper_openai.py
@@ -0,0 +1,60 @@
+"""
+Basic example of a scraping pipeline using MDScraperGraph on a Markdown document.
+
+Reads inputs/markdown_example.md, asks the LLM about the projects it lists,
+then saves the answer as CSV and JSON.
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import MDScraperGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+load_dotenv()
+
+# ************************************************
+# Read the Markdown file
+# ************************************************
+
+FILE_NAME = "inputs/markdown_example.md"
+curr_dir = os.path.dirname(os.path.realpath(__file__))
+file_path = os.path.join(curr_dir, FILE_NAME)
+
+with open(file_path, 'r', encoding="utf-8") as file:
+    text = file.read()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+openai_key = os.getenv("OPENAI_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": openai_key,
+        "model": "gpt-3.5-turbo",
+    },
+}
+
+# ************************************************
+# Create the MDScraperGraph instance and run it
+# ************************************************
+
+md_scraper_graph = MDScraperGraph(
+    prompt="List me all the projects with their description",
+    source=text,  # Pass the content of the file, not the file object
+    config=graph_config
+)
+
+result = md_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = md_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json or csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py
index 8819811c..b1bf1242 100644
--- a/scrapegraphai/graphs/__init__.py
+++ b/scrapegraphai/graphs/__init__.py
@@ -21,3 +21,5 @@ from .json_scraper_multi_graph import JSONScraperMultiGraph
 from .csv_scraper_multi_graph import CSVScraperMultiGraph
 from .xml_scraper_multi_graph import XMLScraperMultiGraph
 from .script_creator_multi_graph import ScriptCreatorMultiGraph
+from .markdown_scraper_graph import MDScraperGraph
+from .markdown_scraper_multi_graph import MDScraperMultiGraph
diff --git a/scrapegraphai/graphs/markdown_scraper_graph.py b/scrapegraphai/graphs/markdown_scraper_graph.py
new file mode 100644
index 00000000..655aee94
--- /dev/null
+++ b/scrapegraphai/graphs/markdown_scraper_graph.py
@@ -0,0 +1,115 @@
+"""
+MDScraperGraph Module
+"""
+
+from typing import Optional
+import logging
+from pydantic import BaseModel
+from .base_graph import BaseGraph
+from .abstract_graph import AbstractGraph
+from ..nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode
+
+class MDScraperGraph(AbstractGraph):
+    """
+    MDScraperGraph is a scraping pipeline that automates the process of
+    extracting information from Markdown documents using a natural language
+    model to interpret and answer prompts.
+
+    Attributes:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        schema (BaseModel): The schema for the graph output.
+        llm_model: An instance of a language model client, configured for generating answers.
+        embedder_model: An instance of an embedding model client, configured for generating embeddings.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+        input_key (str): "md" when the source ends with ".md", "md_dir" for any other source.
+
+    Args:
+        prompt (str): The prompt for the graph.
+        source (str): Path to a ".md" file, or another source such as the Markdown text itself.
+        config (dict): Configuration parameters for the graph.
+        schema (BaseModel): The schema for the graph output.
+
+    Example:
+        >>> md_scraper = MDScraperGraph(
+        ...     "List me all the projects with their description.",
+        ...     "inputs/markdown_example.md",
+        ...     {"llm": {"model": "gpt-3.5-turbo"}}
+        ... )
+        >>> result = md_scraper.run()
+    """
+
+    def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):
+        super().__init__(prompt, config, source, schema)
+
+        # Require the dot so that text merely ending in "md" is not mistaken for a file path.
+        self.input_key = "md" if source.endswith(".md") else "md_dir"
+
+    def _create_graph(self) -> BaseGraph:
+        """
+        Creates the graph of nodes representing the workflow for Markdown scraping.
+
+        Returns:
+            BaseGraph: A graph instance chaining fetch, parse, RAG and answer generation.
+        """
+        fetch_node = FetchNode(
+            input="md | md_dir",
+            output=["doc"],
+            node_config={
+                "loader_kwargs": self.config.get("loader_kwargs", {}),
+            }
+        )
+        parse_node = ParseNode(
+            input="doc",
+            output=["parsed_doc"],
+            node_config={
+                "parse_html": False,
+                "chunk_size": self.model_token
+            }
+        )
+        rag_node = RAGNode(
+            input="user_prompt & (parsed_doc | doc)",
+            output=["relevant_chunks"],
+            node_config={
+                "llm_model": self.llm_model,
+                "embedder_model": self.embedder_model
+            }
+        )
+        generate_answer_node = GenerateAnswerNode(
+            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
+            output=["answer"],
+            node_config={
+                "llm_model": self.llm_model,
+                "schema": self.schema,
+            }
+        )
+
+        return BaseGraph(
+            nodes=[
+                fetch_node,
+                parse_node,
+                rag_node,
+                generate_answer_node,
+            ],
+            edges=[
+                (fetch_node, parse_node),
+                (parse_node, rag_node),
+                (rag_node, generate_answer_node)
+            ],
+            entry_point=fetch_node,
+            graph_name=self.__class__.__name__
+        )
+
+    def run(self) -> str:
+        """
+        Executes the scraping process and returns the answer to the prompt.
+
+        Returns:
+            str: The answer to the prompt, or "No answer found." when the final state has none.
+        """
+
+        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
+        self.final_state, self.execution_info = self.graph.execute(inputs)
+
+        return self.final_state.get("answer", "No answer found.")
diff --git a/scrapegraphai/graphs/markdown_scraper_multi_graph.py b/scrapegraphai/graphs/markdown_scraper_multi_graph.py
new file mode 100644
index 00000000..ec47f74d
--- /dev/null
+++ b/scrapegraphai/graphs/markdown_scraper_multi_graph.py
@@ -0,0 +1,113 @@
+"""
+MDScraperMultiGraph Module
+"""
+
+from copy import copy, deepcopy
+from typing import List, Optional
+from pydantic import BaseModel
+
+from .base_graph import BaseGraph
+from .abstract_graph import AbstractGraph
+from .markdown_scraper_graph import MDScraperGraph
+
+from ..nodes import (
+    GraphIteratorNode,
+    MergeAnswersNode
+)
+
+
+class MDScraperMultiGraph(AbstractGraph):
+    """
+    MDScraperMultiGraph is a scraping pipeline that scrapes a list of Markdown sources and
+    generates answers to a given prompt. It only requires a user prompt and a list of sources.
+
+    Attributes:
+        prompt (str): The user prompt to answer from the sources.
+        llm_model (dict): The configuration for the language model.
+        embedder_model (dict): The configuration for the embedder model.
+        copy_config (dict): A copy of the configuration, handed to the inner MDScraperGraph.
+        verbose (bool): A flag to display the execution information.
+        model_token (int): The token limit for the language model.
+
+    Args:
+        prompt (str): The user prompt to answer from the sources.
+        source (List[str]): The list of Markdown sources to scrape.
+        config (dict): Configuration parameters for the graph.
+        schema (Optional[BaseModel]): The schema for the graph output.
+
+    Example:
+        >>> md_multi_graph = MDScraperMultiGraph(
+        ...     "List me all the projects with their description.",
+        ...     ["inputs/first_example.md", "inputs/second_example.md"],
+        ...     {"llm": {"model": "gpt-3.5-turbo"}}
+        ... )
+        >>> result = md_multi_graph.run()
+    """
+
+    def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
+        if all(isinstance(value, str) for value in config.values()):
+            self.copy_config = copy(config)
+        else:
+            self.copy_config = deepcopy(config)
+
+        self.copy_schema = deepcopy(schema)
+
+        super().__init__(prompt, config, source, schema)
+
+    def _create_graph(self) -> BaseGraph:
+        """
+        Creates the graph of nodes representing the multi-source Markdown scraping workflow.
+
+        Returns:
+            BaseGraph: A graph instance that feeds the iterator node into the merge node.
+        """
+        # Create the MDScraperGraph instance handed to the iterator node
+        md_scraper_instance = MDScraperGraph(
+            prompt="",
+            source="",
+            config=self.copy_config,
+            schema=self.copy_schema
+        )
+
+        # Define the graph nodes
+        graph_iterator_node = GraphIteratorNode(
+            input="user_prompt & mds",
+            output=["results"],
+            node_config={
+                "graph_instance": md_scraper_instance,
+            }
+        )
+
+        merge_answers_node = MergeAnswersNode(
+            input="user_prompt & results",
+            output=["answer"],
+            node_config={
+                "llm_model": self.llm_model,
+                "schema": self.schema
+            }
+        )
+
+        return BaseGraph(
+            nodes=[
+                graph_iterator_node,
+                merge_answers_node,
+            ],
+            edges=[
+                (graph_iterator_node, merge_answers_node),
+            ],
+            entry_point=graph_iterator_node,
+            graph_name=self.__class__.__name__
+        )
+
+    def run(self) -> str:
+        """
+        Executes the graph on the configured sources and returns the merged answer.
+
+        Returns:
+            str: The answer to the prompt, or "No answer found." when the final state has none.
+        """
+        # The state key must match the one named in the GraphIteratorNode input expression.
+        inputs = {"user_prompt": self.prompt, "mds": self.source}
+        self.final_state, self.execution_info = self.graph.execute(inputs)
+
+        return self.final_state.get("answer", "No answer found.")
diff --git a/scrapegraphai/graphs/pdf_scraper_multi_graph.py b/scrapegraphai/graphs/pdf_scraper_multi_graph.py
index 86b2477f..f9b3061b 100644
--- a/scrapegraphai/graphs/pdf_scraper_multi_graph.py
+++ b/scrapegraphai/graphs/pdf_scraper_multi_graph.py
@@ -46,8 +46,6 @@ class PdfScraperMultiGraph(AbstractGraph):
 
     def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
 
-        self.max_results = config.get("max_results", 3)
-
         if all(isinstance(value, str) for value in config.values()):
             self.copy_config = copy(config)
         else:
diff --git a/scrapegraphai/graphs/xml_scraper_multi_graph.py b/scrapegraphai/graphs/xml_scraper_multi_graph.py
index da772647..a6f90bea 100644
--- a/scrapegraphai/graphs/xml_scraper_multi_graph.py
+++ b/scrapegraphai/graphs/xml_scraper_multi_graph.py
@@ -46,8 +46,6 @@ class XMLScraperMultiGraph(AbstractGraph):
 
     def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
 
-        self.max_results = config.get("max_results", 3)
-
         if all(isinstance(value, str) for value in config.values()):
             self.copy_config = copy(config)
         else:
@@ -116,7 +114,7 @@ class XMLScraperMultiGraph(AbstractGraph):
         Returns:
             str: The answer to the prompt.
""" - inputs = {"user_prompt": self.prompt, "jsons": self.source} + inputs = {"user_prompt": self.prompt, "xmls": self.source} self.final_state, self.execution_info = self.graph.execute(inputs) return self.final_state.get("answer", "No answer found.") diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 681ce6fd..638c590c 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -51,8 +51,8 @@ class FetchNode(BaseNode): self.verbose = ( False if node_config is None else node_config.get("verbose", False) ) - self.useSoup = ( - False if node_config is None else node_config.get("useSoup", False) + self.use_soup = ( + False if node_config is None else node_config.get("use_soup", False) ) self.loader_kwargs = ( {} if node_config is None else node_config.get("loader_kwargs", {}) @@ -88,17 +88,17 @@ class FetchNode(BaseNode): or input_keys[0] == "xml_dir" or input_keys[0] == "csv_dir" or input_keys[0] == "pdf_dir" + or input_keys[0] == "md_dir" ): compressed_document = [ source ] - + state.update({self.output[0]: compressed_document}) return state # handling pdf elif input_keys[0] == "pdf": - - # TODO: fix bytes content issue + loader = PyPDFLoader(source) compressed_document = loader.load() state.update({self.output[0]: compressed_document}) @@ -128,6 +128,14 @@ class FetchNode(BaseNode): ] state.update({self.output[0]: compressed_document}) return state + elif input_keys[0] == "md": + with open(source, "r", encoding="utf-8") as f: + data = f.read() + compressed_document = [ + Document(page_content=data, metadata={"source": "md"}) + ] + state.update({self.output[0]: compressed_document}) + return state elif self.input == "pdf_dir": pass @@ -142,7 +150,7 @@ class FetchNode(BaseNode): Document(page_content=parsed_content, metadata={"source": "local_dir"}) ] - elif self.useSoup: + elif self.use_soup: self.logger.info(f"--- (Fetching HTML from: {source}) ---") response = requests.get(source) if 
response.status_code == 200: @@ -169,12 +177,14 @@ class FetchNode(BaseNode): document = loader.load() if not document or not document[0].page_content.strip(): - raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.") + raise ValueError("""No HTML body content found in the + document fetched by ChromiumLoader.""") title, minimized_body, link_urls, image_urls = cleanup_html( str(document[0].page_content), source ) - parsed_content = f"Title: {title}, Body: {minimized_body}, Links: {link_urls}, Images: {image_urls}" + parsed_content = f"""Title: {title}, Body: {minimized_body}, + Links: {link_urls}, Images: {image_urls}""" compressed_document = [ Document(page_content=parsed_content, metadata={"source": source}) diff --git a/scrapegraphai/utils/cleanup_html.py b/scrapegraphai/utils/cleanup_html.py index 3dac0efb..a2bea856 100644 --- a/scrapegraphai/utils/cleanup_html.py +++ b/scrapegraphai/utils/cleanup_html.py @@ -56,4 +56,3 @@ def cleanup_html(html_content: str, base_url: str) -> str: else: raise ValueError(f"No HTML body content found, please try setting the 'headless' flag to False in the graph configuration. HTML content: {html_content}") -