mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-04 21:00:36 +08:00
Merge branch 'pre/beta' into temp
This commit is contained in:
commit
0cda020d48
11
CHANGELOG.md
11
CHANGELOG.md
@ -25,6 +25,8 @@
|
||||
* conditional node ([ce00345](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ce003454953e5785d4746223c252de38cd5d07ea))
|
||||
|
||||
## [1.13.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.12.2...v1.13.0) (2024-08-09)
|
||||
## [1.13.0-beta.9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.13.0-beta.8...v1.13.0-beta.9) (2024-08-10)
|
||||
|
||||
|
||||
|
||||
### Features
|
||||
@ -65,6 +67,15 @@
|
||||
* **release:** 1.13.0-beta.5 [skip ci] ([2eba73b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2eba73b784ee443260117e98ab7c943934b3018d)), closes [#513](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/513)
|
||||
* **release:** 1.13.0-beta.6 [skip ci] ([e75b574](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e75b574b67040e127599da9ee1b0eee13d234cb9))
|
||||
* **release:** 1.13.0-beta.7 [skip ci] ([6e56925](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/6e56925355c424edae290c70fd98646ab5f420ee))
|
||||
* add refactoring of default temperature ([6c3b37a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/6c3b37ab001b80c09ea9ffb56d4c3df338e33a7a))
|
||||
|
||||
## [1.13.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.13.0-beta.7...v1.13.0-beta.8) (2024-08-09)
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* broken node ([1272273](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/127227349915deeb0dede34aa575ad269ed7cbe3))
|
||||
|
||||
|
||||
## [1.13.0-beta.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.13.0-beta.6...v1.13.0-beta.7) (2024-08-09)
|
||||
|
||||
|
||||
@ -14,7 +14,6 @@ graph_config = {
|
||||
"format": "json", # Ollama needs the format to be specified explicitly
|
||||
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
},
|
||||
|
||||
"verbose": True,
|
||||
"headless": False
|
||||
}
|
||||
@ -23,8 +22,13 @@ graph_config = {
|
||||
# Create the SmartScraperGraph instance and run it
|
||||
# ************************************************
|
||||
smart_scraper_graph = SmartScraperGraph(
|
||||
<<<<<<< Updated upstream
|
||||
prompt="Find some information about what does the company do, the name and a contact email.",
|
||||
source="https://scrapegraphai.com/",
|
||||
=======
|
||||
prompt="List all the projects with their descriptions",
|
||||
source="https://perinim.github.io/projects/",
|
||||
>>>>>>> Stashed changes
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
|
||||
@ -3,6 +3,8 @@ name = "scrapegraphai"
|
||||
|
||||
|
||||
version = "1.13.3"
|
||||
version = "1.13.0b9"
|
||||
|
||||
|
||||
|
||||
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
|
||||
|
||||
@ -7,11 +7,9 @@ from typing import Optional
|
||||
import uuid
|
||||
import warnings
|
||||
from pydantic import BaseModel
|
||||
|
||||
from langchain_community.chat_models import ErnieBotChat
|
||||
from langchain_nvidia_ai_endpoints import ChatNVIDIA
|
||||
from langchain.chat_models import init_chat_model
|
||||
|
||||
from ..helpers import models_tokens
|
||||
from ..models import (
|
||||
OneApi,
|
||||
@ -19,8 +17,6 @@ from ..models import (
|
||||
)
|
||||
from ..utils.logging import set_verbosity_warning, set_verbosity_info
|
||||
|
||||
|
||||
|
||||
class AbstractGraph(ABC):
|
||||
"""
|
||||
Scaffolding class for creating a graph representation and executing it.
|
||||
@ -53,6 +49,9 @@ class AbstractGraph(ABC):
|
||||
def __init__(self, prompt: str, config: dict,
|
||||
source: Optional[str] = None, schema: Optional[BaseModel] = None):
|
||||
|
||||
if config.get("llm").get("temperature") is None:
|
||||
config["llm"]["temperature"] = 0
|
||||
|
||||
self.prompt = prompt
|
||||
self.source = source
|
||||
self.config = config
|
||||
@ -212,7 +211,7 @@ class AbstractGraph(ABC):
|
||||
print("model not found, using default token size (8192)")
|
||||
self.model_token = 8192
|
||||
return ErnieBotChat(llm_params)
|
||||
|
||||
|
||||
if "oneapi" in llm_params["model"]:
|
||||
# take the model name after the last slash
|
||||
llm_params["model"] = llm_params["model"].split("/")[-1]
|
||||
@ -221,7 +220,7 @@ class AbstractGraph(ABC):
|
||||
except KeyError as exc:
|
||||
raise KeyError("Model not supported") from exc
|
||||
return OneApi(llm_params)
|
||||
|
||||
|
||||
if "nvidia" in llm_params["model"]:
|
||||
try:
|
||||
self.model_token = models_tokens["nvidia"][llm_params["model"].split("/")[-1]]
|
||||
|
||||
@ -6,9 +6,7 @@ import warnings
|
||||
from typing import Tuple
|
||||
from langchain_community.callbacks import get_openai_callback
|
||||
from ..integrations import BurrBridge
|
||||
|
||||
# Import telemetry functions
|
||||
from ..telemetry import log_graph_execution, log_event
|
||||
from ..telemetry import log_graph_execution
|
||||
|
||||
class BaseGraph:
|
||||
"""
|
||||
|
||||
@ -4,16 +4,13 @@ Module for creating the smart scraper
|
||||
|
||||
from typing import Optional
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .base_graph import BaseGraph
|
||||
from .abstract_graph import AbstractGraph
|
||||
|
||||
from ..nodes import (
|
||||
FetchNode,
|
||||
GenerateAnswerCSVNode
|
||||
)
|
||||
|
||||
|
||||
class CSVScraperGraph(AbstractGraph):
|
||||
"""
|
||||
SmartScraper is a comprehensive web scraping tool that automates the process of extracting
|
||||
|
||||
@ -4,22 +4,19 @@ CSVScraperMultiGraph Module
|
||||
|
||||
from copy import copy, deepcopy
|
||||
from typing import List, Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .base_graph import BaseGraph
|
||||
from .abstract_graph import AbstractGraph
|
||||
from .csv_scraper_graph import CSVScraperGraph
|
||||
|
||||
from ..nodes import (
|
||||
GraphIteratorNode,
|
||||
MergeAnswersNode
|
||||
)
|
||||
|
||||
|
||||
class CSVScraperMultiGraph(AbstractGraph):
|
||||
"""
|
||||
CSVScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and generates answers to a given prompt.
|
||||
CSVScraperMultiGraph is a scraping pipeline that
|
||||
scrapes a list of URLs and generates answers to a given prompt.
|
||||
It only requires a user prompt and a list of URLs.
|
||||
|
||||
Attributes:
|
||||
@ -44,7 +41,8 @@ class CSVScraperMultiGraph(AbstractGraph):
|
||||
>>> result = search_graph.run()
|
||||
"""
|
||||
|
||||
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
|
||||
def __init__(self, prompt: str, source: List[str],
|
||||
config: dict, schema: Optional[BaseModel] = None):
|
||||
|
||||
self.max_results = config.get("max_results", 3)
|
||||
|
||||
|
||||
@ -4,10 +4,8 @@ DeepScraperGraph Module
|
||||
|
||||
from typing import Optional
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .base_graph import BaseGraph
|
||||
from .abstract_graph import AbstractGraph
|
||||
|
||||
from ..nodes import (
|
||||
FetchNode,
|
||||
SearchLinkNode,
|
||||
@ -18,7 +16,6 @@ from ..nodes import (
|
||||
MergeAnswersNode
|
||||
)
|
||||
|
||||
|
||||
class DeepScraperGraph(AbstractGraph):
|
||||
"""
|
||||
[WIP]
|
||||
@ -87,7 +84,6 @@ class DeepScraperGraph(AbstractGraph):
|
||||
output=["relevant_chunks"],
|
||||
node_config={
|
||||
"llm_model": self.llm_model,
|
||||
"embedder_model": self.embedder_model
|
||||
}
|
||||
)
|
||||
generate_answer_node = GenerateAnswerNode(
|
||||
@ -104,7 +100,6 @@ class DeepScraperGraph(AbstractGraph):
|
||||
output=["relevant_links"],
|
||||
node_config={
|
||||
"llm_model": self.llm_model,
|
||||
"embedder_model": self.embedder_model
|
||||
}
|
||||
)
|
||||
graph_iterator_node = GraphIteratorNode(
|
||||
|
||||
@ -4,16 +4,13 @@ JSONScraperGraph Module
|
||||
|
||||
from typing import Optional
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .base_graph import BaseGraph
|
||||
from .abstract_graph import AbstractGraph
|
||||
|
||||
from ..nodes import (
|
||||
FetchNode,
|
||||
GenerateAnswerNode
|
||||
)
|
||||
|
||||
|
||||
class JSONScraperGraph(AbstractGraph):
|
||||
"""
|
||||
JSONScraperGraph defines a scraping pipeline for JSON files.
|
||||
|
||||
@ -5,20 +5,18 @@ JSONScraperMultiGraph Module
|
||||
from copy import copy, deepcopy
|
||||
from typing import List, Optional
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .base_graph import BaseGraph
|
||||
from .abstract_graph import AbstractGraph
|
||||
from .json_scraper_graph import JSONScraperGraph
|
||||
|
||||
from ..nodes import (
|
||||
GraphIteratorNode,
|
||||
MergeAnswersNode
|
||||
)
|
||||
|
||||
|
||||
class JSONScraperMultiGraph(AbstractGraph):
|
||||
"""
|
||||
JSONScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and generates answers to a given prompt.
|
||||
JSONScraperMultiGraph is a scraping pipeline that scrapes a
|
||||
list of URLs and generates answers to a given prompt.
|
||||
It only requires a user prompt and a list of URLs.
|
||||
|
||||
Attributes:
|
||||
|
||||
@ -5,17 +5,14 @@ MDScraperMultiGraph Module
|
||||
from copy import copy, deepcopy
|
||||
from typing import List, Optional
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .base_graph import BaseGraph
|
||||
from .abstract_graph import AbstractGraph
|
||||
from .markdown_scraper_graph import MDScraperGraph
|
||||
|
||||
from ..nodes import (
|
||||
GraphIteratorNode,
|
||||
MergeAnswersNode
|
||||
)
|
||||
|
||||
|
||||
class MDScraperMultiGraph(AbstractGraph):
|
||||
"""
|
||||
MDScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and
|
||||
|
||||
@ -4,17 +4,14 @@ OmniScraperGraph Module
|
||||
|
||||
from typing import Optional
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .base_graph import BaseGraph
|
||||
from .abstract_graph import AbstractGraph
|
||||
|
||||
from ..nodes import (
|
||||
FetchNode,
|
||||
ParseNode,
|
||||
ImageToTextNode,
|
||||
GenerateAnswerOmniNode
|
||||
)
|
||||
|
||||
from ..models import OpenAIImageToText
|
||||
|
||||
class OmniScraperGraph(AbstractGraph):
|
||||
|
||||
@ -5,17 +5,14 @@ PDFScraperGraph Module
|
||||
|
||||
from typing import Optional
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .base_graph import BaseGraph
|
||||
from .abstract_graph import AbstractGraph
|
||||
|
||||
from ..nodes import (
|
||||
FetchNode,
|
||||
ParseNode,
|
||||
GenerateAnswerPDFNode
|
||||
)
|
||||
|
||||
|
||||
class PDFScraperGraph(AbstractGraph):
|
||||
"""
|
||||
PDFScraperGraph is a scraping pipeline that extracts information from pdf files using a natural
|
||||
|
||||
@ -5,17 +5,14 @@ PdfScraperMultiGraph Module
|
||||
from copy import copy, deepcopy
|
||||
from typing import List, Optional
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .base_graph import BaseGraph
|
||||
from .abstract_graph import AbstractGraph
|
||||
from .pdf_scraper_graph import PDFScraperGraph
|
||||
|
||||
from ..nodes import (
|
||||
GraphIteratorNode,
|
||||
MergeAnswersNode
|
||||
)
|
||||
|
||||
|
||||
class PdfScraperMultiGraph(AbstractGraph):
|
||||
"""
|
||||
PdfScraperMultiGraph is a scraping pipeline that scrapes a
|
||||
|
||||
@ -4,17 +4,14 @@ ScriptCreatorGraph Module
|
||||
|
||||
from typing import Optional
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .base_graph import BaseGraph
|
||||
from .abstract_graph import AbstractGraph
|
||||
|
||||
from ..nodes import (
|
||||
FetchNode,
|
||||
ParseNode,
|
||||
GenerateScraperNode
|
||||
)
|
||||
|
||||
|
||||
class ScriptCreatorGraph(AbstractGraph):
|
||||
"""
|
||||
ScriptCreatorGraph defines a scraping pipeline for generating web scraping scripts.
|
||||
|
||||
@ -16,10 +16,10 @@ from ..nodes import (
|
||||
MergeGeneratedScriptsNode
|
||||
)
|
||||
|
||||
|
||||
class ScriptCreatorMultiGraph(AbstractGraph):
|
||||
"""
|
||||
ScriptCreatorMultiGraph is a scraping pipeline that scrapes a list of URLs generating web scraping scripts.
|
||||
ScriptCreatorMultiGraph is a scraping pipeline that scrapes a list
|
||||
of URLs generating web scraping scripts.
|
||||
It only requires a user prompt and a list of URLs.
|
||||
Attributes:
|
||||
prompt (str): The user prompt to search the internet.
|
||||
|
||||
@ -16,8 +16,6 @@ from ..nodes import (
|
||||
MergeAnswersNode
|
||||
)
|
||||
|
||||
|
||||
|
||||
class SearchGraph(AbstractGraph):
|
||||
"""
|
||||
SearchGraph is a scraping pipeline that searches the internet for answers to a given prompt.
|
||||
|
||||
@ -4,13 +4,13 @@ import logging
|
||||
from pydantic import BaseModel
|
||||
from .base_graph import BaseGraph
|
||||
from .abstract_graph import AbstractGraph
|
||||
|
||||
|
||||
from ..nodes import ( FetchNode, ParseNode, SearchLinkNode )
|
||||
|
||||
class SearchLinkGraph(AbstractGraph):
|
||||
"""
|
||||
SearchLinkGraph is a scraping pipeline that automates the process of extracting information from web pages using a natural language model to interpret and answer prompts.
|
||||
SearchLinkGraph is a scraping pipeline that automates the process of
|
||||
extracting information from web pages using a natural language model
|
||||
to interpret and answer prompts.
|
||||
|
||||
Attributes:
|
||||
prompt (str): The prompt for the graph.
|
||||
|
||||
@ -14,7 +14,6 @@ from ..nodes import (
|
||||
GenerateAnswerNode
|
||||
)
|
||||
|
||||
|
||||
class SmartScraperGraph(AbstractGraph):
|
||||
"""
|
||||
SmartScraper is a scraping pipeline that automates the process of
|
||||
|
||||
@ -15,10 +15,10 @@ from ..nodes import (
|
||||
MergeAnswersNode
|
||||
)
|
||||
|
||||
|
||||
class SmartScraperMultiGraph(AbstractGraph):
|
||||
"""
|
||||
SmartScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and generates answers to a given prompt.
|
||||
SmartScraperMultiGraph is a scraping pipeline that scrapes a
|
||||
list of URLs and generates answers to a given prompt.
|
||||
It only requires a user prompt and a list of URLs.
|
||||
|
||||
Attributes:
|
||||
@ -43,7 +43,8 @@ class SmartScraperMultiGraph(AbstractGraph):
|
||||
>>> result = search_graph.run()
|
||||
"""
|
||||
|
||||
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
|
||||
def __init__(self, prompt: str, source: List[str],
|
||||
config: dict, schema: Optional[BaseModel] = None):
|
||||
|
||||
self.max_results = config.get("max_results", 3)
|
||||
|
||||
@ -51,7 +52,7 @@ class SmartScraperMultiGraph(AbstractGraph):
|
||||
self.copy_config = copy(config)
|
||||
else:
|
||||
self.copy_config = deepcopy(config)
|
||||
|
||||
|
||||
self.copy_schema = deepcopy(schema)
|
||||
|
||||
super().__init__(prompt, config, source, schema)
|
||||
|
||||
@ -18,10 +18,10 @@ from ..nodes import (
|
||||
from ..utils.save_audio_from_bytes import save_audio_from_bytes
|
||||
from ..models import OpenAITextToSpeech
|
||||
|
||||
|
||||
class SpeechGraph(AbstractGraph):
|
||||
"""
|
||||
SpeechyGraph is a scraping pipeline that scrapes the web, provide an answer to a given prompt, and generate an audio file.
|
||||
SpeechGraph is a scraping pipeline that scrapes the web, provides an answer
|
||||
to a given prompt, and generates an audio file.
|
||||
|
||||
Attributes:
|
||||
prompt (str): The prompt for the graph.
|
||||
|
||||
@ -13,7 +13,6 @@ from ..nodes import (
|
||||
GenerateAnswerNode
|
||||
)
|
||||
|
||||
|
||||
class XMLScraperGraph(AbstractGraph):
|
||||
"""
|
||||
XMLScraperGraph is a scraping pipeline that extracts information from XML files using a natural
|
||||
|
||||
@ -15,7 +15,6 @@ from ..nodes import (
|
||||
MergeAnswersNode
|
||||
)
|
||||
|
||||
|
||||
class XMLScraperMultiGraph(AbstractGraph):
|
||||
"""
|
||||
XMLScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and
|
||||
|
||||
@ -32,7 +32,10 @@ class ConditionalNode(BaseNode):
|
||||
"""
|
||||
Initializes an empty ConditionalNode.
|
||||
"""
|
||||
pass
|
||||
|
||||
#super().__init__(node_name, "node", input, output, 2, node_config)
|
||||
pass
|
||||
|
||||
|
||||
def execute(self, state: dict) -> dict:
|
||||
"""
|
||||
|
||||
@ -1,17 +1,12 @@
|
||||
"""
|
||||
GenerateAnswerNode Module
|
||||
"""
|
||||
|
||||
# Imports from standard library
|
||||
from typing import List, Optional
|
||||
|
||||
# Imports from Langchain
|
||||
from langchain.prompts import PromptTemplate
|
||||
from langchain_core.output_parsers import JsonOutputParser
|
||||
from langchain_core.runnables import RunnableParallel
|
||||
from tqdm import tqdm
|
||||
from langchain_community.chat_models import ChatOllama
|
||||
# Imports from the library
|
||||
from .base_node import BaseNode
|
||||
from ..helpers.generate_answer_node_omni_prompts import template_no_chunk_omni, template_chunks_omni, template_merge_omni
|
||||
|
||||
|
||||
@ -4,16 +4,11 @@ GenerateScraperNode Module
|
||||
|
||||
# Imports from standard library
|
||||
from typing import List, Optional
|
||||
|
||||
# Imports from Langchain
|
||||
from langchain.prompts import PromptTemplate
|
||||
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
|
||||
from ..utils.logging import get_logger
|
||||
|
||||
# Imports from the library
|
||||
from .base_node import BaseNode
|
||||
|
||||
|
||||
class GenerateScraperNode(BaseNode):
|
||||
"""
|
||||
Generates a python script for scraping a website using the specified library.
|
||||
|
||||
@ -11,7 +11,6 @@ from .base_node import BaseNode
|
||||
|
||||
DEFAULT_BATCHSIZE = 16
|
||||
|
||||
|
||||
class GraphIteratorNode(BaseNode):
|
||||
"""
|
||||
A node responsible for instantiating and running multiple graph instances in parallel.
|
||||
|
||||
@ -9,7 +9,6 @@ from ..utils.logging import get_logger
|
||||
from .base_node import BaseNode
|
||||
from ..helpers import template_combined
|
||||
|
||||
|
||||
class MergeAnswersNode(BaseNode):
|
||||
"""
|
||||
A node responsible for merging the answers from multiple graph instances into a single answer.
|
||||
|
||||
@ -10,7 +10,6 @@ from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
|
||||
from ..utils.logging import get_logger
|
||||
from .base_node import BaseNode
|
||||
|
||||
|
||||
class MergeGeneratedScriptsNode(BaseNode):
|
||||
"""
|
||||
A node responsible for merging scripts generated.
|
||||
|
||||
@ -9,7 +9,6 @@ from langchain_core.documents import Document
|
||||
from ..utils.logging import get_logger
|
||||
from .base_node import BaseNode
|
||||
|
||||
|
||||
class ParseNode(BaseNode):
|
||||
"""
|
||||
A node responsible for parsing HTML content from a document.
|
||||
@ -91,7 +90,7 @@ class ParseNode(BaseNode):
|
||||
chunk_size=self.node_config.get("chunk_size", 4096)-250,
|
||||
token_counter=lambda text: len(text.split()),
|
||||
memoize=False)
|
||||
|
||||
|
||||
state.update({self.output[0]: chunks})
|
||||
|
||||
return state
|
||||
|
||||
@ -13,7 +13,6 @@ from langchain.retrievers.document_compressors import (
|
||||
)
|
||||
from langchain_community.document_transformers import EmbeddingsRedundantFilter
|
||||
from langchain_community.vectorstores import FAISS
|
||||
|
||||
from langchain_community.chat_models import ChatOllama
|
||||
from langchain_aws import BedrockEmbeddings, ChatBedrock
|
||||
from langchain_huggingface import ChatHuggingFace, HuggingFaceEmbeddings
|
||||
@ -23,7 +22,6 @@ from langchain_google_vertexai import ChatVertexAI, VertexAIEmbeddings
|
||||
from langchain_fireworks import FireworksEmbeddings, ChatFireworks
|
||||
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings, ChatOpenAI, AzureChatOpenAI
|
||||
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings, ChatNVIDIA
|
||||
|
||||
from ..utils.logging import get_logger
|
||||
from .base_node import BaseNode
|
||||
from ..helpers import models_tokens
|
||||
|
||||
@ -7,20 +7,23 @@ from urllib.parse import urljoin
|
||||
|
||||
def cleanup_html(html_content: str, base_url: str) -> str:
|
||||
"""
|
||||
Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.
|
||||
Processes HTML content by removing unnecessary tags,
|
||||
minifying the HTML, and extracting the title and body content.
|
||||
|
||||
Args:
|
||||
html_content (str): The HTML content to be processed.
|
||||
|
||||
Returns:
|
||||
str: A string combining the parsed title and the minified body content. If no body content is found, it indicates so.
|
||||
str: A string combining the parsed title and the minified body content.
|
||||
If no body content is found, it indicates so.
|
||||
|
||||
Example:
|
||||
>>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
|
||||
>>> remover(html_content)
|
||||
'Title: Example, Body: <body><p>Hello World!</p></body>'
|
||||
|
||||
This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
|
||||
This function is particularly useful for preparing HTML content for
|
||||
environments where bandwidth usage needs to be minimized.
|
||||
"""
|
||||
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
@ -55,4 +58,5 @@ def cleanup_html(html_content: str, base_url: str) -> str:
|
||||
return title, minimized_body, link_urls, image_urls
|
||||
|
||||
else:
|
||||
raise ValueError(f"No HTML body content found, please try setting the 'headless' flag to False in the graph configuration. HTML content: {html_content}")
|
||||
raise ValueError(f"""No HTML body content found, please try setting the 'headless'
|
||||
flag to False in the graph configuration. HTML content: {html_content}""")
|
||||
|
||||
@ -5,7 +5,6 @@ import os
|
||||
import sys
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def convert_to_csv(data: dict, filename: str, position: str = None) -> None:
|
||||
"""
|
||||
Converts a dictionary to a CSV file and saves it at a specified location.
|
||||
|
||||
@ -5,7 +5,6 @@ import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
def convert_to_json(data: dict, filename: str, position: str = None) -> None:
|
||||
"""
|
||||
Converts a dictionary to a JSON file and saves it at a specified location.
|
||||
|
||||
@ -27,5 +27,5 @@ def convert_to_md(html: str, url: str = None) -> str:
|
||||
parsed_url = urlparse(url)
|
||||
domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
|
||||
h.baseurl = domain
|
||||
|
||||
|
||||
return h.handle(html)
|
||||
|
||||
@ -17,7 +17,6 @@ _default_logging_level = logging.WARNING
|
||||
|
||||
_semaphore = threading.Lock()
|
||||
|
||||
|
||||
def _get_library_root_logger() -> logging.Logger:
|
||||
return logging.getLogger(_library_name)
|
||||
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
"""
|
||||
Prettify the execution information of the graph.
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
|
||||
@ -10,7 +10,6 @@ import requests
|
||||
from fp.errors import FreeProxyException
|
||||
from fp.fp import FreeProxy
|
||||
|
||||
|
||||
class ProxyBrokerCriteria(TypedDict, total=False):
|
||||
"""proxy broker criteria"""
|
||||
|
||||
|
||||
@ -11,7 +11,8 @@ def save_audio_from_bytes(byte_response: bytes, output_path: Union[str, Path]) -
|
||||
|
||||
Args:
|
||||
byte_response (bytes): The byte array containing audio data.
|
||||
output_path (Union[str, Path]): The destination file path where the audio file will be saved.
|
||||
output_path (Union[str, Path]): The destination
|
||||
file path where the audio file will be saved.
|
||||
|
||||
Example:
|
||||
>>> save_audio_from_bytes(b'audio data', 'path/to/audio.mp3')
|
||||
|
||||
@ -10,7 +10,6 @@ import importlib.util # noqa: F401
|
||||
if typing.TYPE_CHECKING:
|
||||
import types
|
||||
|
||||
|
||||
def srcfile_import(modpath: str, modname: str) -> "types.ModuleType":
|
||||
"""imports a python module from its srcfile
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user