Merge branch 'pre/beta' into temp

This commit is contained in:
Marco Vinciguerra 2024-08-10 17:45:52 +02:00 committed by GitHub
commit 0cda020d48
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
40 changed files with 58 additions and 92 deletions

View File

@ -25,6 +25,8 @@
* conditional node ([ce00345](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ce003454953e5785d4746223c252de38cd5d07ea))
## [1.13.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.12.2...v1.13.0) (2024-08-09)
## [1.13.0-beta.9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.13.0-beta.8...v1.13.0-beta.9) (2024-08-10)
### Features
@ -65,6 +67,15 @@
* **release:** 1.13.0-beta.5 [skip ci] ([2eba73b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2eba73b784ee443260117e98ab7c943934b3018d)), closes [#513](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/513)
* **release:** 1.13.0-beta.6 [skip ci] ([e75b574](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e75b574b67040e127599da9ee1b0eee13d234cb9))
* **release:** 1.13.0-beta.7 [skip ci] ([6e56925](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/6e56925355c424edae290c70fd98646ab5f420ee))
* add refactoring of default temperature ([6c3b37a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/6c3b37ab001b80c09ea9ffb56d4c3df338e33a7a))
## [1.13.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.13.0-beta.7...v1.13.0-beta.8) (2024-08-09)
### Bug Fixes
* broken node ([1272273](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/127227349915deeb0dede34aa575ad269ed7cbe3))
## [1.13.0-beta.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.13.0-beta.6...v1.13.0-beta.7) (2024-08-09)

View File

@ -14,7 +14,6 @@ graph_config = {
"format": "json", # Ollama needs the format to be specified explicitly
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
},
"verbose": True,
"headless": False
}
@ -23,8 +22,13 @@ graph_config = {
# Create the SmartScraperGraph instance and run it
# ************************************************
smart_scraper_graph = SmartScraperGraph(
<<<<<<< Updated upstream
prompt="Find some information about what does the company do, the name and a contact email.",
source="https://scrapegraphai.com/",
=======
prompt="List all the projects with their descriptions",
source="https://perinim.github.io/projects/",
>>>>>>> Stashed changes
config=graph_config
)

View File

@ -3,6 +3,8 @@ name = "scrapegraphai"
version = "1.13.3"
version = "1.13.0b9"
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."

View File

@ -7,11 +7,9 @@ from typing import Optional
import uuid
import warnings
from pydantic import BaseModel
from langchain_community.chat_models import ErnieBotChat
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain.chat_models import init_chat_model
from ..helpers import models_tokens
from ..models import (
OneApi,
@ -19,8 +17,6 @@ from ..models import (
)
from ..utils.logging import set_verbosity_warning, set_verbosity_info
class AbstractGraph(ABC):
"""
Scaffolding class for creating a graph representation and executing it.
@ -53,6 +49,9 @@ class AbstractGraph(ABC):
def __init__(self, prompt: str, config: dict,
source: Optional[str] = None, schema: Optional[BaseModel] = None):
if config.get("llm").get("temperature") is None:
config["llm"]["temperature"] = 0
self.prompt = prompt
self.source = source
self.config = config
@ -212,7 +211,7 @@ class AbstractGraph(ABC):
print("model not found, using default token size (8192)")
self.model_token = 8192
return ErnieBotChat(llm_params)
if "oneapi" in llm_params["model"]:
# take the model after the last slash
llm_params["model"] = llm_params["model"].split("/")[-1]
@ -221,7 +220,7 @@ class AbstractGraph(ABC):
except KeyError as exc:
raise KeyError("Model not supported") from exc
return OneApi(llm_params)
if "nvidia" in llm_params["model"]:
try:
self.model_token = models_tokens["nvidia"][llm_params["model"].split("/")[-1]]

View File

@ -6,9 +6,7 @@ import warnings
from typing import Tuple
from langchain_community.callbacks import get_openai_callback
from ..integrations import BurrBridge
# Import telemetry functions
from ..telemetry import log_graph_execution, log_event
from ..telemetry import log_graph_execution
class BaseGraph:
"""

View File

@ -4,16 +4,13 @@ Module for creating the smart scraper
from typing import Optional
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
GenerateAnswerCSVNode
)
class CSVScraperGraph(AbstractGraph):
"""
SmartScraper is a comprehensive web scraping tool that automates the process of extracting

View File

@ -4,22 +4,19 @@ CSVScraperMultiGraph Module
from copy import copy, deepcopy
from typing import List, Optional
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from .csv_scraper_graph import CSVScraperGraph
from ..nodes import (
GraphIteratorNode,
MergeAnswersNode
)
class CSVScraperMultiGraph(AbstractGraph):
"""
CSVScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and generates answers to a given prompt.
CSVScraperMultiGraph is a scraping pipeline that
scrapes a list of URLs and generates answers to a given prompt.
It only requires a user prompt and a list of URLs.
Attributes:
@ -44,7 +41,8 @@ class CSVScraperMultiGraph(AbstractGraph):
>>> result = search_graph.run()
"""
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
def __init__(self, prompt: str, source: List[str],
config: dict, schema: Optional[BaseModel] = None):
self.max_results = config.get("max_results", 3)

View File

@ -4,10 +4,8 @@ DeepScraperGraph Module
from typing import Optional
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
SearchLinkNode,
@ -18,7 +16,6 @@ from ..nodes import (
MergeAnswersNode
)
class DeepScraperGraph(AbstractGraph):
"""
[WIP]
@ -87,7 +84,6 @@ class DeepScraperGraph(AbstractGraph):
output=["relevant_chunks"],
node_config={
"llm_model": self.llm_model,
"embedder_model": self.embedder_model
}
)
generate_answer_node = GenerateAnswerNode(
@ -104,7 +100,6 @@ class DeepScraperGraph(AbstractGraph):
output=["relevant_links"],
node_config={
"llm_model": self.llm_model,
"embedder_model": self.embedder_model
}
)
graph_iterator_node = GraphIteratorNode(

View File

@ -4,16 +4,13 @@ JSONScraperGraph Module
from typing import Optional
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
GenerateAnswerNode
)
class JSONScraperGraph(AbstractGraph):
"""
JSONScraperGraph defines a scraping pipeline for JSON files.

View File

@ -5,20 +5,18 @@ JSONScraperMultiGraph Module
from copy import copy, deepcopy
from typing import List, Optional
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from .json_scraper_graph import JSONScraperGraph
from ..nodes import (
GraphIteratorNode,
MergeAnswersNode
)
class JSONScraperMultiGraph(AbstractGraph):
"""
JSONScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and generates answers to a given prompt.
JSONScraperMultiGraph is a scraping pipeline that scrapes a
list of URLs and generates answers to a given prompt.
It only requires a user prompt and a list of URLs.
Attributes:

View File

@ -5,17 +5,14 @@ MDScraperMultiGraph Module
from copy import copy, deepcopy
from typing import List, Optional
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from .markdown_scraper_graph import MDScraperGraph
from ..nodes import (
GraphIteratorNode,
MergeAnswersNode
)
class MDScraperMultiGraph(AbstractGraph):
"""
MDScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and

View File

@ -4,17 +4,14 @@ OmniScraperGraph Module
from typing import Optional
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
ParseNode,
ImageToTextNode,
GenerateAnswerOmniNode
)
from ..models import OpenAIImageToText
class OmniScraperGraph(AbstractGraph):

View File

@ -5,17 +5,14 @@ PDFScraperGraph Module
from typing import Optional
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
ParseNode,
GenerateAnswerPDFNode
)
class PDFScraperGraph(AbstractGraph):
"""
PDFScraperGraph is a scraping pipeline that extracts information from pdf files using a natural

View File

@ -5,17 +5,14 @@ PdfScraperMultiGraph Module
from copy import copy, deepcopy
from typing import List, Optional
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from .pdf_scraper_graph import PDFScraperGraph
from ..nodes import (
GraphIteratorNode,
MergeAnswersNode
)
class PdfScraperMultiGraph(AbstractGraph):
"""
PdfScraperMultiGraph is a scraping pipeline that scrapes a

View File

@ -4,17 +4,14 @@ ScriptCreatorGraph Module
from typing import Optional
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
ParseNode,
GenerateScraperNode
)
class ScriptCreatorGraph(AbstractGraph):
"""
ScriptCreatorGraph defines a scraping pipeline for generating web scraping scripts.

View File

@ -16,10 +16,10 @@ from ..nodes import (
MergeGeneratedScriptsNode
)
class ScriptCreatorMultiGraph(AbstractGraph):
"""
ScriptCreatorMultiGraph is a scraping pipeline that scrapes a list of URLs generating web scraping scripts.
ScriptCreatorMultiGraph is a scraping pipeline that scrapes a list
of URLs generating web scraping scripts.
It only requires a user prompt and a list of URLs.
Attributes:
prompt (str): The user prompt to search the internet.

View File

@ -16,8 +16,6 @@ from ..nodes import (
MergeAnswersNode
)
class SearchGraph(AbstractGraph):
"""
SearchGraph is a scraping pipeline that searches the internet for answers to a given prompt.

View File

@ -4,13 +4,13 @@ import logging
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import ( FetchNode, ParseNode, SearchLinkNode )
class SearchLinkGraph(AbstractGraph):
"""
SearchLinkGraph is a scraping pipeline that automates the process of extracting information from web pages using a natural language model to interpret and answer prompts.
SearchLinkGraph is a scraping pipeline that automates the process of
extracting information from web pages using a natural language model
to interpret and answer prompts.
Attributes:
prompt (str): The prompt for the graph.

View File

@ -14,7 +14,6 @@ from ..nodes import (
GenerateAnswerNode
)
class SmartScraperGraph(AbstractGraph):
"""
SmartScraper is a scraping pipeline that automates the process of

View File

@ -15,10 +15,10 @@ from ..nodes import (
MergeAnswersNode
)
class SmartScraperMultiGraph(AbstractGraph):
"""
SmartScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and generates answers to a given prompt.
SmartScraperMultiGraph is a scraping pipeline that scrapes a
list of URLs and generates answers to a given prompt.
It only requires a user prompt and a list of URLs.
Attributes:
@ -43,7 +43,8 @@ class SmartScraperMultiGraph(AbstractGraph):
>>> result = search_graph.run()
"""
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
def __init__(self, prompt: str, source: List[str],
config: dict, schema: Optional[BaseModel] = None):
self.max_results = config.get("max_results", 3)
@ -51,7 +52,7 @@ class SmartScraperMultiGraph(AbstractGraph):
self.copy_config = copy(config)
else:
self.copy_config = deepcopy(config)
self.copy_schema = deepcopy(schema)
super().__init__(prompt, config, source, schema)

View File

@ -18,10 +18,10 @@ from ..nodes import (
from ..utils.save_audio_from_bytes import save_audio_from_bytes
from ..models import OpenAITextToSpeech
class SpeechGraph(AbstractGraph):
"""
SpeechyGraph is a scraping pipeline that scrapes the web, provide an answer to a given prompt, and generate an audio file.
SpeechGraph is a scraping pipeline that scrapes the web, provides an answer
to a given prompt, and generates an audio file.
Attributes:
prompt (str): The prompt for the graph.

View File

@ -13,7 +13,6 @@ from ..nodes import (
GenerateAnswerNode
)
class XMLScraperGraph(AbstractGraph):
"""
XMLScraperGraph is a scraping pipeline that extracts information from XML files using a natural

View File

@ -15,7 +15,6 @@ from ..nodes import (
MergeAnswersNode
)
class XMLScraperMultiGraph(AbstractGraph):
"""
XMLScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and

View File

@ -32,7 +32,10 @@ class ConditionalNode(BaseNode):
"""
Initializes an empty ConditionalNode.
"""
pass
#super().__init__(node_name, "node", input, output, 2, node_config)
pass
def execute(self, state: dict) -> dict:
"""

View File

@ -1,17 +1,12 @@
"""
GenerateAnswerNode Module
"""
# Imports from standard library
from typing import List, Optional
# Imports from Langchain
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.runnables import RunnableParallel
from tqdm import tqdm
from langchain_community.chat_models import ChatOllama
# Imports from the library
from .base_node import BaseNode
from ..helpers.generate_answer_node_omni_prompts import template_no_chunk_omni, template_chunks_omni, template_merge_omni

View File

@ -4,16 +4,11 @@ GenerateScraperNode Module
# Imports from standard library
from typing import List, Optional
# Imports from Langchain
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from ..utils.logging import get_logger
# Imports from the library
from .base_node import BaseNode
class GenerateScraperNode(BaseNode):
"""
Generates a python script for scraping a website using the specified library.

View File

@ -11,7 +11,6 @@ from .base_node import BaseNode
DEFAULT_BATCHSIZE = 16
class GraphIteratorNode(BaseNode):
"""
A node responsible for instantiating and running multiple graph instances in parallel.

View File

@ -9,7 +9,6 @@ from ..utils.logging import get_logger
from .base_node import BaseNode
from ..helpers import template_combined
class MergeAnswersNode(BaseNode):
"""
A node responsible for merging the answers from multiple graph instances into a single answer.

View File

@ -10,7 +10,6 @@ from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
from ..utils.logging import get_logger
from .base_node import BaseNode
class MergeGeneratedScriptsNode(BaseNode):
"""
A node responsible for merging scripts generated.

View File

@ -9,7 +9,6 @@ from langchain_core.documents import Document
from ..utils.logging import get_logger
from .base_node import BaseNode
class ParseNode(BaseNode):
"""
A node responsible for parsing HTML content from a document.
@ -91,7 +90,7 @@ class ParseNode(BaseNode):
chunk_size=self.node_config.get("chunk_size", 4096)-250,
token_counter=lambda text: len(text.split()),
memoize=False)
state.update({self.output[0]: chunks})
return state

View File

@ -13,7 +13,6 @@ from langchain.retrievers.document_compressors import (
)
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain_community.vectorstores import FAISS
from langchain_community.chat_models import ChatOllama
from langchain_aws import BedrockEmbeddings, ChatBedrock
from langchain_huggingface import ChatHuggingFace, HuggingFaceEmbeddings
@ -23,7 +22,6 @@ from langchain_google_vertexai import ChatVertexAI, VertexAIEmbeddings
from langchain_fireworks import FireworksEmbeddings, ChatFireworks
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings, ChatOpenAI, AzureChatOpenAI
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings, ChatNVIDIA
from ..utils.logging import get_logger
from .base_node import BaseNode
from ..helpers import models_tokens

View File

@ -7,20 +7,23 @@ from urllib.parse import urljoin
def cleanup_html(html_content: str, base_url: str) -> str:
"""
Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.
Processes HTML content by removing unnecessary tags,
minifying the HTML, and extracting the title and body content.
Args:
html_content (str): The HTML content to be processed.
Returns:
str: A string combining the parsed title and the minified body content. If no body content is found, it indicates so.
str: A string combining the parsed title and the minified body content.
If no body content is found, it indicates so.
Example:
>>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
>>> remover(html_content)
'Title: Example, Body: <body><p>Hello World!</p></body>'
This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
This function is particularly useful for preparing HTML content for
environments where bandwidth usage needs to be minimized.
"""
soup = BeautifulSoup(html_content, 'html.parser')
@ -55,4 +58,5 @@ def cleanup_html(html_content: str, base_url: str) -> str:
return title, minimized_body, link_urls, image_urls
else:
raise ValueError(f"No HTML body content found, please try setting the 'headless' flag to False in the graph configuration. HTML content: {html_content}")
raise ValueError(f"""No HTML body content found, please try setting the 'headless'
flag to False in the graph configuration. HTML content: {html_content}""")

View File

@ -5,7 +5,6 @@ import os
import sys
import pandas as pd
def convert_to_csv(data: dict, filename: str, position: str = None) -> None:
"""
Converts a dictionary to a CSV file and saves it at a specified location.

View File

@ -5,7 +5,6 @@ import json
import os
import sys
def convert_to_json(data: dict, filename: str, position: str = None) -> None:
"""
Converts a dictionary to a JSON file and saves it at a specified location.

View File

@ -27,5 +27,5 @@ def convert_to_md(html: str, url: str = None) -> str:
parsed_url = urlparse(url)
domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
h.baseurl = domain
return h.handle(html)

View File

@ -17,7 +17,6 @@ _default_logging_level = logging.WARNING
_semaphore = threading.Lock()
def _get_library_root_logger() -> logging.Logger:
return logging.getLogger(_library_name)

View File

@ -1,7 +1,6 @@
"""
Prettify the execution information of the graph.
"""
import pandas as pd

View File

@ -10,7 +10,6 @@ import requests
from fp.errors import FreeProxyException
from fp.fp import FreeProxy
class ProxyBrokerCriteria(TypedDict, total=False):
"""proxy broker criteria"""

View File

@ -11,7 +11,8 @@ def save_audio_from_bytes(byte_response: bytes, output_path: Union[str, Path]) -
Args:
byte_response (bytes): The byte array containing audio data.
output_path (Union[str, Path]): The destination file path where the audio file will be saved.
output_path (Union[str, Path]): The destination
file path where the audio file will be saved.
Example:
>>> save_audio_from_bytes(b'audio data', 'path/to/audio.mp3')

View File

@ -10,7 +10,6 @@ import importlib.util # noqa: F401
if typing.TYPE_CHECKING:
import types
def srcfile_import(modpath: str, modname: str) -> "types.ModuleType":
"""imports a python module from its srcfile