fix: robot node and proxies

This commit is contained in:
VinciGit00 2024-04-27 19:07:37 +02:00
parent 4bc727412f
commit adbc08f27b
6 changed files with 17 additions and 25 deletions

View File

@ -10,7 +10,7 @@ from scrapegraphai.utils import prettify_exec_info
graph_config = { graph_config = {
"llm": { "llm": {
"model": "ollama/mistral", "model": "ollama/mistral",
"temperature": 1, "temperature": 0,
"format": "json", # Ollama needs the format to be specified explicitly "format": "json", # Ollama needs the format to be specified explicitly
# "model_tokens": 2000, # set context length arbitrarily, # "model_tokens": 2000, # set context length arbitrarily,
"base_url": "http://localhost:11434", # set ollama URL arbitrarily "base_url": "http://localhost:11434", # set ollama URL arbitrarily

View File

View File

@ -2,22 +2,16 @@
Example of custom graph using existing nodes Example of custom graph using existing nodes
""" """
import os from scrapegraphai.models import Ollama
from dotenv import load_dotenv
from scrapegraphai.models import OpenAI
from scrapegraphai.nodes import RobotsNode from scrapegraphai.nodes import RobotsNode
load_dotenv()
# ************************************************ # ************************************************
# Define the configuration for the graph # Define the configuration for the graph
# ************************************************ # ************************************************
openai_key = os.getenv("OPENAI_APIKEY")
graph_config = { graph_config = {
"llm": { "llm": {
"api_key": openai_key, "model": "ollama/llama3",
"model": "gpt-3.5-turbo",
"temperature": 0, "temperature": 0,
"streaming": True "streaming": True
}, },
@ -27,7 +21,7 @@ graph_config = {
# Define the node # Define the node
# ************************************************ # ************************************************
llm_model = OpenAI(graph_config["llm"]) llm_model = Ollama(graph_config["llm"])
robots_node = RobotsNode( robots_node = RobotsNode(
input="url", input="url",

View File

@ -7,7 +7,6 @@ from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_core.documents import Document from langchain_core.documents import Document
from .base_node import BaseNode from .base_node import BaseNode
from ..utils.remover import remover from ..utils.remover import remover
from ..utils.proxy_generator import proxy_generator
class FetchNode(BaseNode): class FetchNode(BaseNode):
@ -38,8 +37,7 @@ class FetchNode(BaseNode):
to succeed. to succeed.
""" """
def __init__(self, input: str, output: List[str], num_prox: int = True, def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
node_name: str = "Fetch"):
""" """
Initializes the FetchHTMLNode with a node name and node type. Initializes the FetchHTMLNode with a node name and node type.
Arguments: Arguments:
@ -47,7 +45,6 @@ class FetchNode(BaseNode):
prox_rotation (bool): if you want to rotate proxies prox_rotation (bool): if you want to rotate proxies
""" """
super().__init__(node_name, "node", input, output, 1) super().__init__(node_name, "node", input, output, 1)
self.num_prox = num_prox
def execute(self, state): def execute(self, state):
""" """
@ -80,13 +77,13 @@ class FetchNode(BaseNode):
"source": "local_dir" "source": "local_dir"
})] })]
# if it is a URL
else: else:
if self.num_prox > 1: if self.node_config.get("endpoint") is not None:
loader = AsyncHtmlLoader( loader = AsyncHtmlLoader(
source, proxies=proxy_generator(self.num_prox)) source, proxies={"http": self.node_config["endpoint"]})
else: else:
loader = AsyncHtmlLoader(source) loader = AsyncHtmlLoader(source)
document = loader.load() document = loader.load()
compressed_document = [ compressed_document = [
Document(page_content=remover(str(document)))] Document(page_content=remover(str(document)))]

View File

@ -1,5 +1,5 @@
""" """
Module for checking if a website is scrapable or not Module for checking if a website is scrapable or not
""" """
from typing import List from typing import List
from urllib.parse import urlparse from urllib.parse import urlparse
@ -12,7 +12,7 @@ from ..helpers import robots_dictionary
class RobotsNode(BaseNode): class RobotsNode(BaseNode):
""" """
A node responsible for checking if a website is scrapable or not. A node responsible for checking if a website is scrapable or not.
It uses the AsyncHtmlLoader for asynchronous It uses the AsyncHtmlLoader for asynchronous
document loading. document loading.
@ -59,7 +59,7 @@ class RobotsNode(BaseNode):
node_config (dict): Configuration parameters for the node. node_config (dict): Configuration parameters for the node.
force_scraping (bool): A flag indicating whether scraping should be enforced even force_scraping (bool): A flag indicating whether scraping should be enforced even
if disallowed by robots.txt. Defaults to True. if disallowed by robots.txt. Defaults to True.
node_name (str, optional): The unique identifier name for the node. node_name (str, optional): The unique identifier name for the node.
Defaults to "Robots". Defaults to "Robots".
""" """
super().__init__(node_name, "node", input, output, 1) super().__init__(node_name, "node", input, output, 1)
@ -112,11 +112,12 @@ class RobotsNode(BaseNode):
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
loader = AsyncHtmlLoader(f"{base_url}/robots.txt") loader = AsyncHtmlLoader(f"{base_url}/robots.txt")
document = loader.load() document = loader.load()
model = self.llm_model.model_name if "ollama" in self.llm_model.model:
self.llm_model.model = self.llm_model.model.split("/")[-1]
if "ollama" in model: model = self.llm_model.model.split("/")[-1]
model = model.split("/", maxsplit=1)[-1]
else:
model = self.llm_model.model_name
try: try:
agent = robots_dictionary[model] agent = robots_dictionary[model]

View File

@ -5,4 +5,4 @@ from .save_audio_from_bytes import save_audio_from_bytes
from .convert_to_csv import convert_to_csv from .convert_to_csv import convert_to_csv
from .convert_to_json import convert_to_json from .convert_to_json import convert_to_json
from .prettify_exec_info import prettify_exec_info from .prettify_exec_info import prettify_exec_info
from .proxy_generator import proxy_generator from .proxy_rotation import proxy_generator