feat: add integrations for markdown files
Some checks are pending
/ build (3.10) (push) Waiting to run

This commit is contained in:
Marco Vinciguerra 2024-06-29 13:35:39 +02:00
parent f3cbbcee92
commit 2804434a9e
9 changed files with 335 additions and 14 deletions

View File

@ -0,0 +1,35 @@
Marco Perini Toggle navigation
* About
* Projects(current)
Projects
Competitions
* CV
* ____
# Projects
![project thumbnail Rotary Pendulum RL
Open Source project aimed at controlling a real life rotary pendulum using RL
algorithms ](/projects/rotary-pendulum-rl/)
![project thumbnail DQN
Implementation from scratch Developed a Deep Q-Network algorithm to train a
simple and double pendulum ](https://github.com/PeriniM/DQN-SwingUp)
![project thumbnail Multi Agents HAED
University project which focuses on simulating a multi-agent system to perform
environment mapping. Agents, equipped with sensors, explore and record their
surroundings, considering uncertainties in their readings.
](https://github.com/PeriniM/Multi-Agents-HAED)
![project thumbnail Wireless ESC for Modular
Drones Modular drone architecture proposal and proof of concept. The project
received maximum grade. ](/projects/wireless-esc-drone/)
© Copyright 2023 Marco Perini. Powered by Jekyll with
al-folio theme. Hosted by [GitHub
Pages](https://pages.github.com/).

View File

@ -0,0 +1,57 @@
"""
Basic example of scraping pipeline using MDScraperGraph from Markdown documents
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import MDScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# Load environment variables (e.g. the OpenAI key) from a local .env file.
load_dotenv()

# ************************************************
# Read the Markdown file
# ************************************************

FILE_NAME = "inputs/markdown_example.md"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

with open(file_path, 'r', encoding="utf-8") as file:
    text = file.read()

# ************************************************
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "gpt-3.5-turbo",
    },
}

# ************************************************
# Create the MDScraperGraph instance and run it
# ************************************************

md_scraper_graph = MDScraperGraph(
    # The prompt must match the content of the Markdown input, which lists
    # projects (not books).
    prompt="List me all the projects with their description",
    source=text,  # Pass the content of the file, not the file object
    config=graph_config
)

result = md_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = md_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")

View File

@ -21,3 +21,5 @@ from .json_scraper_multi_graph import JSONScraperMultiGraph
from .csv_scraper_multi_graph import CSVScraperMultiGraph
from .xml_scraper_multi_graph import XMLScraperMultiGraph
from .script_creator_multi_graph import ScriptCreatorMultiGraph
from .markdown_scraper_graph import MDScraperGraph
from .markdown_scraper_multi_graph import MDScraperMultiGraph

View File

@ -0,0 +1,110 @@
from typing import Optional
import logging
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode
class MDScraperGraph(AbstractGraph):
    """
    MDScraperGraph is a scraping pipeline that answers a natural language
    prompt from Markdown content.

    The workflow is a linear chain of four nodes:
    fetch -> parse -> RAG -> generate answer.

    Attributes:
        prompt (str): The prompt for the graph.
        source (str): The Markdown source handed to the graph.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel): The schema for the graph output.
        input_key (str): State key under which ``source`` is passed to the
            graph: "md" when ``source`` ends with "md", "md_dir" otherwise.

    Note:
        ``llm_model``, ``embedder_model``, ``model_token`` and ``graph`` are
        read here but never assigned in this class; presumably they are set
        up by AbstractGraph (confirm against the base class).

    Args:
        prompt (str): The prompt for the graph.
        source (str): The Markdown source handed to the graph.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel): The schema for the graph output.

    Example:
        >>> md_scraper = MDScraperGraph(
        ...     "List me all the projects with their description.",
        ...     "inputs/markdown_example.md",
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = md_scraper.run()
    """

    def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):
        super().__init__(prompt, config, source, schema)

        # Choose the state key from the source suffix.
        if source.endswith("md"):
            self.input_key = "md"
        else:
            self.input_key = "md_dir"

    def _create_graph(self) -> BaseGraph:
        """
        Builds the four-node Markdown scraping workflow.

        Returns:
            BaseGraph: A graph whose nodes are chained in order, starting
            from the fetch node.
        """
        loader_options = self.config.get("loader_kwargs", {})
        fetcher = FetchNode(
            input="md | md_dir",
            output=["doc"],
            node_config={"loader_kwargs": loader_options},
        )

        # Markdown is not HTML, so HTML parsing is switched off.
        parser = ParseNode(
            input="doc",
            output=["parsed_doc"],
            node_config={"parse_html": False, "chunk_size": self.model_token},
        )

        retriever = RAGNode(
            input="user_prompt & (parsed_doc | doc)",
            output=["relevant_chunks"],
            node_config={
                "llm_model": self.llm_model,
                "embedder_model": self.embedder_model,
            },
        )

        answerer = GenerateAnswerNode(
            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
            output=["answer"],
            node_config={"llm_model": self.llm_model, "schema": self.schema},
        )

        # Each node feeds the next one, so edges are the consecutive pairs.
        pipeline = [fetcher, parser, retriever, answerer]
        return BaseGraph(
            nodes=pipeline,
            edges=list(zip(pipeline, pipeline[1:])),
            entry_point=fetcher,
            graph_name=type(self).__name__,
        )

    def run(self) -> str:
        """
        Executes the graph and returns the answer to the prompt.

        Stores the final state and the execution info on the instance.

        Returns:
            str: The "answer" entry of the final state, or
            "No answer found." when that entry is missing.
        """
        initial_state = {"user_prompt": self.prompt}
        initial_state[self.input_key] = self.source

        self.final_state, self.execution_info = self.graph.execute(initial_state)
        return self.final_state.get("answer", "No answer found.")

View File

@ -0,0 +1,112 @@
"""
MDScraperMultiGraph Module
"""
from copy import copy, deepcopy
from typing import List, Optional
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from .markdown_scraper_graph import MDScraperGraph
from ..nodes import (
GraphIteratorNode,
MergeAnswersNode
)
class MDScraperMultiGraph(AbstractGraph):
    """
    MDScraperMultiGraph is a scraping pipeline that runs an MDScraperGraph
    over a list of Markdown sources and merges the individual results into a
    single answer to the user prompt.

    Attributes:
        prompt (str): The user prompt to answer.
        source (List[str]): The list of Markdown sources to scrape.
        copy_config (dict): Private copy of ``config`` handed to the inner
            MDScraperGraph.
        copy_schema (Optional[BaseModel]): Deep copy of ``schema`` handed to
            the inner MDScraperGraph.

    Note:
        ``llm_model`` and ``graph`` are read here but never assigned in this
        class; presumably they are set up by AbstractGraph (confirm against
        the base class).

    Args:
        prompt (str): The user prompt to answer.
        source (List[str]): The list of Markdown sources to scrape.
        config (dict): Configuration parameters for the graph.
        schema (Optional[BaseModel]): The schema for the graph output.

    Example:
        >>> md_multi_graph = MDScraperMultiGraph(
        ...     "List me all the projects with their description.",
        ...     ["inputs/first.md", "inputs/second.md"],
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = md_multi_graph.run()
    """

    def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
        # Snapshot config and schema before the base initializer runs, so the
        # inner graph gets its own copies. A shallow copy is used when every
        # config value is a plain string; otherwise a deep copy is taken.
        if all(isinstance(value, str) for value in config.values()):
            self.copy_config = copy(config)
        else:
            self.copy_config = deepcopy(config)

        self.copy_schema = deepcopy(schema)

        super().__init__(prompt, config, source, schema)

    def _create_graph(self) -> BaseGraph:
        """
        Creates the two-node workflow: iterate the inner graph over the
        sources, then merge the results.

        Returns:
            BaseGraph: A graph with the iterator node as entry point,
            followed by the merge node.
        """
        # Template MDScraperGraph; prompt and source are left empty here.
        md_scraper_instance = MDScraperGraph(
            prompt="",
            source="",
            config=self.copy_config,
            schema=self.copy_schema
        )

        # Define the graph nodes.
        # NOTE: the second key of this input expression ("jsons") must be the
        # key that run() puts into the initial state.
        graph_iterator_node = GraphIteratorNode(
            input="user_prompt & jsons",
            output=["results"],
            node_config={
                "graph_instance": md_scraper_instance,
            }
        )

        merge_answers_node = MergeAnswersNode(
            input="user_prompt & results",
            output=["answer"],
            node_config={
                "llm_model": self.llm_model,
                "schema": self.schema
            }
        )

        return BaseGraph(
            nodes=[
                graph_iterator_node,
                merge_answers_node,
            ],
            edges=[
                (graph_iterator_node, merge_answers_node),
            ],
            entry_point=graph_iterator_node,
            graph_name=self.__class__.__name__
        )

    def run(self) -> str:
        """
        Executes the graph over all sources and returns the merged answer.

        Stores the final state and the execution info on the instance.

        Returns:
            str: The "answer" entry of the final state, or
            "No answer found." when that entry is missing.
        """
        # The sources are stored under "jsons" because that is the key the
        # iterator node's input expression requires; the previous "xmls" key
        # did not match it.
        inputs = {"user_prompt": self.prompt, "jsons": self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")

View File

@ -46,8 +46,6 @@ class PdfScraperMultiGraph(AbstractGraph):
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
self.max_results = config.get("max_results", 3)
if all(isinstance(value, str) for value in config.values()):
self.copy_config = copy(config)
else:

View File

@ -46,8 +46,6 @@ class XMLScraperMultiGraph(AbstractGraph):
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
self.max_results = config.get("max_results", 3)
if all(isinstance(value, str) for value in config.values()):
self.copy_config = copy(config)
else:
@ -116,7 +114,7 @@ class XMLScraperMultiGraph(AbstractGraph):
Returns:
str: The answer to the prompt.
"""
inputs = {"user_prompt": self.prompt, "jsons": self.source}
inputs = {"user_prompt": self.prompt, "xmls": self.source}
self.final_state, self.execution_info = self.graph.execute(inputs)
return self.final_state.get("answer", "No answer found.")

View File

@ -51,8 +51,8 @@ class FetchNode(BaseNode):
self.verbose = (
False if node_config is None else node_config.get("verbose", False)
)
self.useSoup = (
False if node_config is None else node_config.get("useSoup", False)
self.use_soup = (
False if node_config is None else node_config.get("use_soup", False)
)
self.loader_kwargs = (
{} if node_config is None else node_config.get("loader_kwargs", {})
@ -88,17 +88,17 @@ class FetchNode(BaseNode):
or input_keys[0] == "xml_dir"
or input_keys[0] == "csv_dir"
or input_keys[0] == "pdf_dir"
or input_keys[0] == "md_dir"
):
compressed_document = [
source
]
state.update({self.output[0]: compressed_document})
return state
# handling pdf
elif input_keys[0] == "pdf":
# TODO: fix bytes content issue
loader = PyPDFLoader(source)
compressed_document = loader.load()
state.update({self.output[0]: compressed_document})
@ -128,6 +128,14 @@ class FetchNode(BaseNode):
]
state.update({self.output[0]: compressed_document})
return state
elif input_keys[0] == "md":
with open(source, "r", encoding="utf-8") as f:
data = f.read()
compressed_document = [
Document(page_content=data, metadata={"source": "md"})
]
state.update({self.output[0]: compressed_document})
return state
elif self.input == "pdf_dir":
pass
@ -142,7 +150,7 @@ class FetchNode(BaseNode):
Document(page_content=parsed_content, metadata={"source": "local_dir"})
]
elif self.useSoup:
elif self.use_soup:
self.logger.info(f"--- (Fetching HTML from: {source}) ---")
response = requests.get(source)
if response.status_code == 200:
@ -169,12 +177,14 @@ class FetchNode(BaseNode):
document = loader.load()
if not document or not document[0].page_content.strip():
raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
raise ValueError("""No HTML body content found in the
document fetched by ChromiumLoader.""")
title, minimized_body, link_urls, image_urls = cleanup_html(
str(document[0].page_content), source
)
parsed_content = f"Title: {title}, Body: {minimized_body}, Links: {link_urls}, Images: {image_urls}"
parsed_content = f"""Title: {title}, Body: {minimized_body},
Links: {link_urls}, Images: {image_urls}"""
compressed_document = [
Document(page_content=parsed_content, metadata={"source": source})

View File

@ -56,4 +56,3 @@ def cleanup_html(html_content: str, base_url: str) -> str:
else:
raise ValueError(f"No HTML body content found, please try setting the 'headless' flag to False in the graph configuration. HTML content: {html_content}")