mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-01 21:00:48 +08:00
feat: add integrations for markdown files
Some checks are pending
/ build (3.10) (push) Waiting to run
Some checks are pending
/ build (3.10) (push) Waiting to run
This commit is contained in:
parent
f3cbbcee92
commit
2804434a9e
35
examples/openai/inputs/markdown_example.md
Normal file
35
examples/openai/inputs/markdown_example.md
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
Marco Perini Toggle navigation
|
||||||
|
|
||||||
|
* About
|
||||||
|
* Projects(current)
|
||||||
|
|
||||||
|
Projects
|
||||||
|
|
||||||
|
Competitions
|
||||||
|
|
||||||
|
* CV
|
||||||
|
* ____
|
||||||
|
|
||||||
|
# Projects
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
© Copyright 2023 Marco Perini. Powered by Jekyll with
|
||||||
|
al-folio theme. Hosted by [GitHub
|
||||||
|
Pages](https://pages.github.com/).
|
||||||
57
examples/openai/md_scraper_openai.py
Normal file
57
examples/openai/md_scraper_openai.py
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
"""
|
||||||
|
Basic example of scraping pipeline using MDScraperGraph from Markdown documents
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import MDScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info

# Load environment variables (e.g. OPENAI_APIKEY) from a local .env file.
load_dotenv()

# ************************************************
# Read the Markdown file
# ************************************************

FILE_NAME = "inputs/markdown_example.md"
# Resolve the input relative to this script so the example works from any CWD.
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()

# ************************************************
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "gpt-3.5-turbo",
    },
}

# ************************************************
# Create the MDScraperGraph instance and run it
# ************************************************

md_scraper_graph = MDScraperGraph(
    # The prompt must match the input document: markdown_example.md is a
    # personal "Projects" page, not a book catalogue.
    prompt="List me all the projects",
    source=text,  # Pass the content of the file, not the file object
    config=graph_config
)

result = md_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = md_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")
|
||||||
@ -21,3 +21,5 @@ from .json_scraper_multi_graph import JSONScraperMultiGraph
|
|||||||
from .csv_scraper_multi_graph import CSVScraperMultiGraph
|
from .csv_scraper_multi_graph import CSVScraperMultiGraph
|
||||||
from .xml_scraper_multi_graph import XMLScraperMultiGraph
|
from .xml_scraper_multi_graph import XMLScraperMultiGraph
|
||||||
from .script_creator_multi_graph import ScriptCreatorMultiGraph
|
from .script_creator_multi_graph import ScriptCreatorMultiGraph
|
||||||
|
from .markdown_scraper_graph import MDScraperGraph
|
||||||
|
from .markdown_scraper_multi_graph import MDScraperMultiGraph
|
||||||
|
|||||||
110
scrapegraphai/graphs/markdown_scraper_graph.py
Normal file
110
scrapegraphai/graphs/markdown_scraper_graph.py
Normal file
@ -0,0 +1,110 @@
|
|||||||
|
from typing import Optional
|
||||||
|
import logging
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from .base_graph import BaseGraph
|
||||||
|
from .abstract_graph import AbstractGraph
|
||||||
|
from ..nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode
|
||||||
|
|
||||||
|
class MDScraperGraph(AbstractGraph):
    """
    MDScraperGraph is a scraping pipeline that automates the process of
    extracting information from Markdown content using a natural language
    model to interpret and answer prompts.

    Attributes:
        prompt (str): The prompt for the graph.
        source (str): Either a path to a ``.md`` file or the Markdown text itself.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel): The schema for the graph output.
        llm_model: An instance of a language model client, configured for generating answers.
        embedder_model: An instance of an embedding model client, configured for generating embeddings.
        verbose (bool): A flag indicating whether to show print statements during execution.
        input_key (str): State key under which ``source`` is handed to the fetch
            node: ``"md"`` for a ``.md`` file path, ``"md_dir"`` otherwise.

    Args:
        prompt (str): The prompt for the graph.
        source (str): Either a path to a ``.md`` file or the Markdown text itself.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel): The schema for the graph output.

    Example:
        >>> md_scraper = MDScraperGraph(
        ...     "List me all the projects.",
        ...     "inputs/markdown_example.md",
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = md_scraper.run()
    """

    def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):
        super().__init__(prompt, config, source, schema)

        # Match on the ".md" extension rather than a bare "md" suffix so that
        # inline Markdown text that merely happens to end in the letters "md"
        # is not mistaken for a file path.
        self.input_key = "md" if source.endswith(".md") else "md_dir"

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for Markdown scraping.

        The pipeline is linear: fetch -> parse -> RAG -> generate answer.

        Returns:
            BaseGraph: A graph instance representing the Markdown scraping workflow.
        """
        # Accepts either state key; which one is present is decided by
        # ``self.input_key`` when ``run`` builds the initial state.
        fetch_node = FetchNode(
            input="md | md_dir",
            output=["doc"],
            node_config={
                "loader_kwargs": self.config.get("loader_kwargs", {}),
            }
        )
        parse_node = ParseNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                # The input is Markdown, so HTML parsing is disabled.
                "parse_html": False,
                "chunk_size": self.model_token
            }
        )
        rag_node = RAGNode(
            input="user_prompt & (parsed_doc | doc)",
            output=["relevant_chunks"],
            node_config={
                "llm_model": self.llm_model,
                "embedder_model": self.embedder_model
            }
        )
        generate_answer_node = GenerateAnswerNode(
            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
            output=["answer"],
            node_config={
                "llm_model": self.llm_model,
                "schema": self.schema,
            }
        )

        return BaseGraph(
            nodes=[
                fetch_node,
                parse_node,
                rag_node,
                generate_answer_node,
            ],
            edges=[
                (fetch_node, parse_node),
                (parse_node, rag_node),
                (rag_node, generate_answer_node)
            ],
            entry_point=fetch_node,
            graph_name=self.__class__.__name__
        )

    def run(self) -> str:
        """
        Executes the scraping process and returns the answer to the prompt.

        Returns:
            str: The answer to the prompt, or "No answer found." when the final
            state contains no "answer" entry.
        """

        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")
|
||||||
112
scrapegraphai/graphs/markdown_scraper_multi_graph.py
Normal file
112
scrapegraphai/graphs/markdown_scraper_multi_graph.py
Normal file
@ -0,0 +1,112 @@
|
|||||||
|
"""
|
||||||
|
MDScraperMultiGraph Module
|
||||||
|
"""
|
||||||
|
|
||||||
|
from copy import copy, deepcopy
|
||||||
|
from typing import List, Optional
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from .base_graph import BaseGraph
|
||||||
|
from .abstract_graph import AbstractGraph
|
||||||
|
from .markdown_scraper_graph import MDScraperGraph
|
||||||
|
|
||||||
|
from ..nodes import (
|
||||||
|
GraphIteratorNode,
|
||||||
|
MergeAnswersNode
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class MDScraperMultiGraph(AbstractGraph):
    """
    MDScraperMultiGraph is a scraping pipeline that runs an MDScraperGraph over a
    list of Markdown sources and merges the individual answers into a single
    answer to the given prompt.

    Attributes:
        prompt (str): The user prompt to answer.
        llm_model (dict): The configuration for the language model.
        embedder_model (dict): The configuration for the embedder model.
        verbose (bool): A flag to display the execution information.
        model_token (int): The token limit for the language model.
        copy_config (dict): Private copy of ``config`` handed to the inner graph.
        copy_schema (Optional[BaseModel]): Private copy of ``schema`` handed to
            the inner graph.

    Args:
        prompt (str): The user prompt to answer.
        source (List[str]): The list of Markdown sources to scrape.
        config (dict): Configuration parameters for the graph.
        schema (Optional[BaseModel]): The schema for the graph output.

    Example:
        >>> md_multi_graph = MDScraperMultiGraph(
        ...     "What is Chioggia famous for?",
        ...     ["inputs/page1.md", "inputs/page2.md"],
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = md_multi_graph.run()
    """

    def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
        # Give the inner graph its own configuration object. A shallow copy is
        # enough when every value is a plain string; otherwise nested values
        # are deep-copied.
        if all(isinstance(value, str) for value in config.values()):
            self.copy_config = copy(config)
        else:
            self.copy_config = deepcopy(config)

        self.copy_schema = deepcopy(schema)

        super().__init__(prompt, config, source, schema)

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the multi-document Markdown
        scraping workflow: iterate an MDScraperGraph over every source, then
        merge the per-source answers.

        Returns:
            BaseGraph: A graph instance representing the workflow.
        """
        # Create the MDScraperGraph instance handed to the iterator node.
        # Prompt and source are left empty here.
        md_scraper_instance = MDScraperGraph(
            prompt="",
            source="",
            config=self.copy_config,
            schema=self.copy_schema
        )

        # Define the graph nodes
        graph_iterator_node = GraphIteratorNode(
            # NOTE: "jsons" is the state key holding the list of sources; it
            # must match the key that ``run`` places in the initial state.
            input="user_prompt & jsons",
            output=["results"],
            node_config={
                "graph_instance": md_scraper_instance,
            }
        )

        merge_answers_node = MergeAnswersNode(
            input="user_prompt & results",
            output=["answer"],
            node_config={
                "llm_model": self.llm_model,
                "schema": self.schema
            }
        )

        return BaseGraph(
            nodes=[
                graph_iterator_node,
                merge_answers_node,
            ],
            edges=[
                (graph_iterator_node, merge_answers_node),
            ],
            entry_point=graph_iterator_node,
            graph_name=self.__class__.__name__
        )

    def run(self) -> str:
        """
        Executes the multi-document Markdown scraping process.

        Returns:
            str: The answer to the prompt, or "No answer found." when the final
            state contains no "answer" entry.
        """
        # The sources must be stored under "jsons", the key the iterator node
        # declares as its input in ``_create_graph``. The previous "xmls" key
        # did not match it, so the node's input expression could not be
        # resolved from the state.
        inputs = {"user_prompt": self.prompt, "jsons": self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")
|
||||||
@ -46,8 +46,6 @@ class PdfScraperMultiGraph(AbstractGraph):
|
|||||||
|
|
||||||
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
|
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
|
||||||
|
|
||||||
self.max_results = config.get("max_results", 3)
|
|
||||||
|
|
||||||
if all(isinstance(value, str) for value in config.values()):
|
if all(isinstance(value, str) for value in config.values()):
|
||||||
self.copy_config = copy(config)
|
self.copy_config = copy(config)
|
||||||
else:
|
else:
|
||||||
|
|||||||
@ -46,8 +46,6 @@ class XMLScraperMultiGraph(AbstractGraph):
|
|||||||
|
|
||||||
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
|
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
|
||||||
|
|
||||||
self.max_results = config.get("max_results", 3)
|
|
||||||
|
|
||||||
if all(isinstance(value, str) for value in config.values()):
|
if all(isinstance(value, str) for value in config.values()):
|
||||||
self.copy_config = copy(config)
|
self.copy_config = copy(config)
|
||||||
else:
|
else:
|
||||||
@ -116,7 +114,7 @@ class XMLScraperMultiGraph(AbstractGraph):
|
|||||||
Returns:
|
Returns:
|
||||||
str: The answer to the prompt.
|
str: The answer to the prompt.
|
||||||
"""
|
"""
|
||||||
inputs = {"user_prompt": self.prompt, "jsons": self.source}
|
inputs = {"user_prompt": self.prompt, "xmls": self.source}
|
||||||
self.final_state, self.execution_info = self.graph.execute(inputs)
|
self.final_state, self.execution_info = self.graph.execute(inputs)
|
||||||
|
|
||||||
return self.final_state.get("answer", "No answer found.")
|
return self.final_state.get("answer", "No answer found.")
|
||||||
|
|||||||
@ -51,8 +51,8 @@ class FetchNode(BaseNode):
|
|||||||
self.verbose = (
|
self.verbose = (
|
||||||
False if node_config is None else node_config.get("verbose", False)
|
False if node_config is None else node_config.get("verbose", False)
|
||||||
)
|
)
|
||||||
self.useSoup = (
|
self.use_soup = (
|
||||||
False if node_config is None else node_config.get("useSoup", False)
|
False if node_config is None else node_config.get("use_soup", False)
|
||||||
)
|
)
|
||||||
self.loader_kwargs = (
|
self.loader_kwargs = (
|
||||||
{} if node_config is None else node_config.get("loader_kwargs", {})
|
{} if node_config is None else node_config.get("loader_kwargs", {})
|
||||||
@ -88,17 +88,17 @@ class FetchNode(BaseNode):
|
|||||||
or input_keys[0] == "xml_dir"
|
or input_keys[0] == "xml_dir"
|
||||||
or input_keys[0] == "csv_dir"
|
or input_keys[0] == "csv_dir"
|
||||||
or input_keys[0] == "pdf_dir"
|
or input_keys[0] == "pdf_dir"
|
||||||
|
or input_keys[0] == "md_dir"
|
||||||
):
|
):
|
||||||
compressed_document = [
|
compressed_document = [
|
||||||
source
|
source
|
||||||
]
|
]
|
||||||
|
|
||||||
state.update({self.output[0]: compressed_document})
|
state.update({self.output[0]: compressed_document})
|
||||||
return state
|
return state
|
||||||
# handling pdf
|
# handling pdf
|
||||||
elif input_keys[0] == "pdf":
|
elif input_keys[0] == "pdf":
|
||||||
|
|
||||||
# TODO: fix bytes content issue
|
|
||||||
loader = PyPDFLoader(source)
|
loader = PyPDFLoader(source)
|
||||||
compressed_document = loader.load()
|
compressed_document = loader.load()
|
||||||
state.update({self.output[0]: compressed_document})
|
state.update({self.output[0]: compressed_document})
|
||||||
@ -128,6 +128,14 @@ class FetchNode(BaseNode):
|
|||||||
]
|
]
|
||||||
state.update({self.output[0]: compressed_document})
|
state.update({self.output[0]: compressed_document})
|
||||||
return state
|
return state
|
||||||
|
elif input_keys[0] == "md":
|
||||||
|
with open(source, "r", encoding="utf-8") as f:
|
||||||
|
data = f.read()
|
||||||
|
compressed_document = [
|
||||||
|
Document(page_content=data, metadata={"source": "md"})
|
||||||
|
]
|
||||||
|
state.update({self.output[0]: compressed_document})
|
||||||
|
return state
|
||||||
|
|
||||||
elif self.input == "pdf_dir":
|
elif self.input == "pdf_dir":
|
||||||
pass
|
pass
|
||||||
@ -142,7 +150,7 @@ class FetchNode(BaseNode):
|
|||||||
Document(page_content=parsed_content, metadata={"source": "local_dir"})
|
Document(page_content=parsed_content, metadata={"source": "local_dir"})
|
||||||
]
|
]
|
||||||
|
|
||||||
elif self.useSoup:
|
elif self.use_soup:
|
||||||
self.logger.info(f"--- (Fetching HTML from: {source}) ---")
|
self.logger.info(f"--- (Fetching HTML from: {source}) ---")
|
||||||
response = requests.get(source)
|
response = requests.get(source)
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
@ -169,12 +177,14 @@ class FetchNode(BaseNode):
|
|||||||
document = loader.load()
|
document = loader.load()
|
||||||
|
|
||||||
if not document or not document[0].page_content.strip():
|
if not document or not document[0].page_content.strip():
|
||||||
raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
|
raise ValueError("""No HTML body content found in the
|
||||||
|
document fetched by ChromiumLoader.""")
|
||||||
|
|
||||||
title, minimized_body, link_urls, image_urls = cleanup_html(
|
title, minimized_body, link_urls, image_urls = cleanup_html(
|
||||||
str(document[0].page_content), source
|
str(document[0].page_content), source
|
||||||
)
|
)
|
||||||
parsed_content = f"Title: {title}, Body: {minimized_body}, Links: {link_urls}, Images: {image_urls}"
|
parsed_content = f"""Title: {title}, Body: {minimized_body},
|
||||||
|
Links: {link_urls}, Images: {image_urls}"""
|
||||||
|
|
||||||
compressed_document = [
|
compressed_document = [
|
||||||
Document(page_content=parsed_content, metadata={"source": source})
|
Document(page_content=parsed_content, metadata={"source": source})
|
||||||
|
|||||||
@ -56,4 +56,3 @@ def cleanup_html(html_content: str, base_url: str) -> str:
|
|||||||
|
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"No HTML body content found, please try setting the 'headless' flag to False in the graph configuration. HTML content: {html_content}")
|
raise ValueError(f"No HTML body content found, please try setting the 'headless' flag to False in the graph configuration. HTML content: {html_content}")
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user