mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-01 21:00:48 +08:00
feat: add integrations for markdown files
Some checks are pending
/ build (3.10) (push) Waiting to run
Some checks are pending
/ build (3.10) (push) Waiting to run
This commit is contained in:
parent
f3cbbcee92
commit
2804434a9e
35
examples/openai/inputs/markdown_example.md
Normal file
35
examples/openai/inputs/markdown_example.md
Normal file
@ -0,0 +1,35 @@
|
||||
Marco Perini Toggle navigation
|
||||
|
||||
* About
|
||||
* Projects(current)
|
||||
|
||||
Projects
|
||||
|
||||
Competitions
|
||||
|
||||
* CV
|
||||
* ____
|
||||
|
||||
# Projects
|
||||
|
||||

|
||||
|
||||

|
||||
|
||||

|
||||
|
||||

|
||||
|
||||
© Copyright 2023 Marco Perini. Powered by Jekyll with
|
||||
al-folio theme. Hosted by [GitHub
|
||||
Pages](https://pages.github.com/).
|
||||
57
examples/openai/md_scraper_openai.py
Normal file
57
examples/openai/md_scraper_openai.py
Normal file
@ -0,0 +1,57 @@
|
||||
"""
|
||||
Basic example of a scraping pipeline using MDScraperGraph on Markdown documents
|
||||
"""
|
||||
|
||||
import os

from dotenv import load_dotenv

from scrapegraphai.graphs import MDScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info

load_dotenv()

# ************************************************
# Read the Markdown file
# ************************************************

FILE_NAME = "inputs/markdown_example.md"
# Resolve the input relative to this script so the example can be launched
# from any working directory.
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()

# ************************************************
# Define the configuration for the graph
# ************************************************

# The API key is read from the environment (populated by load_dotenv above).
openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "gpt-3.5-turbo",
    },
}

# ************************************************
# Create the MDScraperGraph instance and run it
# ************************************************

md_scraper_graph = MDScraperGraph(
    # The bundled Markdown input is a "Projects" page, so ask about projects.
    prompt="List me all the projects",
    source=text,  # Pass the content of the file, not the file object
    config=graph_config,
)

result = md_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = md_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")
|
||||
@ -21,3 +21,5 @@ from .json_scraper_multi_graph import JSONScraperMultiGraph
|
||||
from .csv_scraper_multi_graph import CSVScraperMultiGraph
|
||||
from .xml_scraper_multi_graph import XMLScraperMultiGraph
|
||||
from .script_creator_multi_graph import ScriptCreatorMultiGraph
|
||||
from .markdown_scraper_graph import MDScraperGraph
|
||||
from .markdown_scraper_multi_graph import MDScraperMultiGraph
|
||||
|
||||
110
scrapegraphai/graphs/markdown_scraper_graph.py
Normal file
110
scrapegraphai/graphs/markdown_scraper_graph.py
Normal file
@ -0,0 +1,110 @@
|
||||
from typing import Optional
|
||||
import logging
|
||||
from pydantic import BaseModel
|
||||
from .base_graph import BaseGraph
|
||||
from .abstract_graph import AbstractGraph
|
||||
from ..nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode
|
||||
|
||||
class MDScraperGraph(AbstractGraph):
    """
    MDScraperGraph is a scraping pipeline that extracts information from
    Markdown content, using a natural language model to interpret the
    document and answer a prompt.

    Attributes:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph: either a path to a ``.md`` file
            or the Markdown text itself.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel): The schema for the graph output.
        llm_model: An instance of a language model client, configured for generating answers.
        embedder_model: An instance of an embedding model client, configured for generating embeddings.
        verbose (bool): A flag indicating whether to show print statements during execution.
        headless (bool): A flag indicating whether to run the graph in headless mode.
        input_key (str): State key under which ``source`` is handed to the
            fetch node: ``"md"`` for a ``.md`` path, ``"md_dir"`` otherwise.

    Args:
        prompt (str): The prompt for the graph.
        source (str): Path to a Markdown file, or raw Markdown text.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel): The schema for the graph output.

    Example:
        >>> md_scraper = MDScraperGraph(
        ...     "List me all the projects.",
        ...     "inputs/markdown_example.md",
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = md_scraper.run()
    """

    def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):
        super().__init__(prompt, config, source, schema)

        # Only a real ".md" suffix (any letter case) marks the source as a
        # file path. A bare "md" suffix check would also match raw text that
        # merely ends in those two letters (e.g. "... cmd").
        self.input_key = "md" if source.lower().endswith(".md") else "md_dir"

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the Markdown scraping workflow.

        The nodes are chained linearly: fetch -> parse -> RAG -> generate answer.

        Returns:
            BaseGraph: A graph instance representing the Markdown scraping workflow.
        """
        fetch_node = FetchNode(
            input="md | md_dir",
            output=["doc"],
            node_config={
                "loader_kwargs": self.config.get("loader_kwargs", {}),
            }
        )
        parse_node = ParseNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                # The document is Markdown, so HTML parsing is switched off.
                "parse_html": False,
                "chunk_size": self.model_token
            }
        )
        rag_node = RAGNode(
            input="user_prompt & (parsed_doc | doc)",
            output=["relevant_chunks"],
            node_config={
                "llm_model": self.llm_model,
                "embedder_model": self.embedder_model
            }
        )
        generate_answer_node = GenerateAnswerNode(
            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
            output=["answer"],
            node_config={
                "llm_model": self.llm_model,
                "schema": self.schema,
            }
        )

        return BaseGraph(
            nodes=[
                fetch_node,
                parse_node,
                rag_node,
                generate_answer_node,
            ],
            edges=[
                (fetch_node, parse_node),
                (parse_node, rag_node),
                (rag_node, generate_answer_node)
            ],
            entry_point=fetch_node,
            graph_name=self.__class__.__name__
        )

    def run(self) -> str:
        """
        Executes the scraping process and returns the answer to the prompt.

        Returns:
            str: The answer to the prompt, or "No answer found." when the
            final state holds no "answer" entry.
        """

        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")
|
||||
112
scrapegraphai/graphs/markdown_scraper_multi_graph.py
Normal file
112
scrapegraphai/graphs/markdown_scraper_multi_graph.py
Normal file
@ -0,0 +1,112 @@
|
||||
"""
|
||||
MDScraperMultiGraph Module
|
||||
"""
|
||||
|
||||
from copy import copy, deepcopy
|
||||
from typing import List, Optional
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .base_graph import BaseGraph
|
||||
from .abstract_graph import AbstractGraph
|
||||
from .markdown_scraper_graph import MDScraperGraph
|
||||
|
||||
from ..nodes import (
|
||||
GraphIteratorNode,
|
||||
MergeAnswersNode
|
||||
)
|
||||
|
||||
|
||||
class MDScraperMultiGraph(AbstractGraph):
    """
    MDScraperMultiGraph is a scraping pipeline that runs an MDScraperGraph over
    a list of Markdown sources and merges the per-source answers into a single
    answer to the given prompt.

    Attributes:
        prompt (str): The user prompt to answer.
        llm_model (dict): The configuration for the language model.
        embedder_model (dict): The configuration for the embedder model.
        headless (bool): A flag to run the browser in headless mode.
        verbose (bool): A flag to display the execution information.
        model_token (int): The token limit for the language model.
        copy_config (dict): Private copy of ``config`` handed to the inner
            MDScraperGraph.
        copy_schema: Deep copy of ``schema`` handed to the inner MDScraperGraph.

    Args:
        prompt (str): The user prompt to answer.
        source (List[str]): The list of Markdown sources to scrape.
        config (dict): Configuration parameters for the graph.
        schema (Optional[BaseModel]): The schema for the graph output.

    Example:
        >>> md_multi_graph = MDScraperMultiGraph(
        ...     "List me all the projects.",
        ...     ["inputs/first.md", "inputs/second.md"],
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = md_multi_graph.run()
    """

    # State key carrying the list of sources. The iterator node's input
    # expression and the initial state built in run() must use the same key,
    # otherwise the entry node cannot find its input.
    _SOURCES_KEY = "mds"

    def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
        # Keep an independent copy of the configuration for the inner graph.
        # A shallow copy is enough when every value is a plain string.
        if all(isinstance(value, str) for value in config.values()):
            self.copy_config = copy(config)
        else:
            self.copy_config = deepcopy(config)

        self.copy_schema = deepcopy(schema)

        super().__init__(prompt, config, source, schema)

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes: an iterator that runs an MDScraperGraph on
        every source, followed by a node that merges the individual answers.

        Returns:
            BaseGraph: A graph instance representing the multi-source
            Markdown scraping workflow.
        """
        # Template MDScraperGraph instance; prompt and source are left empty
        # here and are presumably filled in per item by the iterator node.
        md_scraper_instance = MDScraperGraph(
            prompt="",
            source="",
            config=self.copy_config,
            schema=self.copy_schema
        )

        # Define the graph nodes
        graph_iterator_node = GraphIteratorNode(
            input=f"user_prompt & {self._SOURCES_KEY}",
            output=["results"],
            node_config={
                "graph_instance": md_scraper_instance,
            }
        )

        merge_answers_node = MergeAnswersNode(
            input="user_prompt & results",
            output=["answer"],
            node_config={
                "llm_model": self.llm_model,
                "schema": self.schema
            }
        )

        return BaseGraph(
            nodes=[
                graph_iterator_node,
                merge_answers_node,
            ],
            edges=[
                (graph_iterator_node, merge_answers_node),
            ],
            entry_point=graph_iterator_node,
            graph_name=self.__class__.__name__
        )

    def run(self) -> str:
        """
        Executes the scraping process over all sources and merges the answers.

        Returns:
            str: The answer to the prompt, or "No answer found." when the
            final state holds no "answer" entry.
        """
        inputs = {"user_prompt": self.prompt, self._SOURCES_KEY: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")
|
||||
@ -46,8 +46,6 @@ class PdfScraperMultiGraph(AbstractGraph):
|
||||
|
||||
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
|
||||
|
||||
self.max_results = config.get("max_results", 3)
|
||||
|
||||
if all(isinstance(value, str) for value in config.values()):
|
||||
self.copy_config = copy(config)
|
||||
else:
|
||||
|
||||
@ -46,8 +46,6 @@ class XMLScraperMultiGraph(AbstractGraph):
|
||||
|
||||
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
|
||||
|
||||
self.max_results = config.get("max_results", 3)
|
||||
|
||||
if all(isinstance(value, str) for value in config.values()):
|
||||
self.copy_config = copy(config)
|
||||
else:
|
||||
@ -116,7 +114,7 @@ class XMLScraperMultiGraph(AbstractGraph):
|
||||
Returns:
|
||||
str: The answer to the prompt.
|
||||
"""
|
||||
inputs = {"user_prompt": self.prompt, "jsons": self.source}
|
||||
inputs = {"user_prompt": self.prompt, "xmls": self.source}
|
||||
self.final_state, self.execution_info = self.graph.execute(inputs)
|
||||
|
||||
return self.final_state.get("answer", "No answer found.")
|
||||
|
||||
@ -51,8 +51,8 @@ class FetchNode(BaseNode):
|
||||
self.verbose = (
|
||||
False if node_config is None else node_config.get("verbose", False)
|
||||
)
|
||||
self.useSoup = (
|
||||
False if node_config is None else node_config.get("useSoup", False)
|
||||
self.use_soup = (
|
||||
False if node_config is None else node_config.get("use_soup", False)
|
||||
)
|
||||
self.loader_kwargs = (
|
||||
{} if node_config is None else node_config.get("loader_kwargs", {})
|
||||
@ -88,17 +88,17 @@ class FetchNode(BaseNode):
|
||||
or input_keys[0] == "xml_dir"
|
||||
or input_keys[0] == "csv_dir"
|
||||
or input_keys[0] == "pdf_dir"
|
||||
or input_keys[0] == "md_dir"
|
||||
):
|
||||
compressed_document = [
|
||||
source
|
||||
]
|
||||
|
||||
|
||||
state.update({self.output[0]: compressed_document})
|
||||
return state
|
||||
# handling pdf
|
||||
elif input_keys[0] == "pdf":
|
||||
|
||||
# TODO: fix bytes content issue
|
||||
|
||||
loader = PyPDFLoader(source)
|
||||
compressed_document = loader.load()
|
||||
state.update({self.output[0]: compressed_document})
|
||||
@ -128,6 +128,14 @@ class FetchNode(BaseNode):
|
||||
]
|
||||
state.update({self.output[0]: compressed_document})
|
||||
return state
|
||||
elif input_keys[0] == "md":
|
||||
with open(source, "r", encoding="utf-8") as f:
|
||||
data = f.read()
|
||||
compressed_document = [
|
||||
Document(page_content=data, metadata={"source": "md"})
|
||||
]
|
||||
state.update({self.output[0]: compressed_document})
|
||||
return state
|
||||
|
||||
elif self.input == "pdf_dir":
|
||||
pass
|
||||
@ -142,7 +150,7 @@ class FetchNode(BaseNode):
|
||||
Document(page_content=parsed_content, metadata={"source": "local_dir"})
|
||||
]
|
||||
|
||||
elif self.useSoup:
|
||||
elif self.use_soup:
|
||||
self.logger.info(f"--- (Fetching HTML from: {source}) ---")
|
||||
response = requests.get(source)
|
||||
if response.status_code == 200:
|
||||
@ -169,12 +177,14 @@ class FetchNode(BaseNode):
|
||||
document = loader.load()
|
||||
|
||||
if not document or not document[0].page_content.strip():
|
||||
raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
|
||||
raise ValueError("""No HTML body content found in the
|
||||
document fetched by ChromiumLoader.""")
|
||||
|
||||
title, minimized_body, link_urls, image_urls = cleanup_html(
|
||||
str(document[0].page_content), source
|
||||
)
|
||||
parsed_content = f"Title: {title}, Body: {minimized_body}, Links: {link_urls}, Images: {image_urls}"
|
||||
parsed_content = f"""Title: {title}, Body: {minimized_body},
|
||||
Links: {link_urls}, Images: {image_urls}"""
|
||||
|
||||
compressed_document = [
|
||||
Document(page_content=parsed_content, metadata={"source": source})
|
||||
|
||||
@ -56,4 +56,3 @@ def cleanup_html(html_content: str, base_url: str) -> str:
|
||||
|
||||
else:
|
||||
raise ValueError(f"No HTML body content found, please try setting the 'headless' flag to False in the graph configuration. HTML content: {html_content}")
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user