Scrapegraph-ai/scrapegraphai/nodes/robots_node.py

"""
RobotsNode Module
"""

from typing import List, Optional
from urllib.parse import urlparse

from langchain_community.document_loaders import AsyncChromiumLoader
from langchain.prompts import PromptTemplate
from langchain.output_parsers import CommaSeparatedListOutputParser

from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import AsyncChromiumLoader

from ..helpers import robots_dictionary
from ..utils.logging import get_logger
from .base_node import BaseNode

class RobotsNode(BaseNode):
    """
    A node responsible for checking if a website is scrapeable or not based on the robots.txt file.
    It uses a language model to determine if the website allows scraping of the provided path.

    This node acts as a starting point in many scraping workflows, preparing the state
    with the necessary HTML content for further processing by subsequent nodes in the graph.

    Attributes:
        llm_model: An instance of the language model client used for checking scrapeability.
        force_scraping (bool): A flag indicating whether scraping should be enforced even
                               if disallowed by robots.txt.
        verbose (bool): A flag indicating whether to show print statements during execution.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node.
        force_scraping (bool): A flag indicating whether scraping should be enforced even
                                 if disallowed by robots.txt. Defaults to True.
        node_name (str): The unique identifier name for the node, defaulting to "Robots".
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "Robots",
    ):
        super().__init__(node_name, "node", input, output, 1)

        self.llm_model = node_config["llm_model"]

        self.force_scraping = (
            False if node_config is None else node_config.get("force_scraping", False)
        )
        self.verbose = (
            True if node_config is None else node_config.get("verbose", False)
        )

    def execute(self, state: dict) -> dict:
        """
        Checks if a website is scrapeable based on the robots.txt file and updates the state
        with the scrapeability status. The method constructs a prompt for the language model,
        submits it, and parses the output to determine if scraping is allowed.

        Args:
            state (dict): The current state of the graph. The input keys will be used to fetch the

        Returns:
            dict: The updated state with the output key containing the scrapeability status.

        Raises:
            KeyError: If the input keys are not found in the state, indicating that the
                        necessary information for checking scrapeability is missing.
            KeyError: If the large language model is not found in the robots_dictionary.
            ValueError: If the website is not scrapeable based on the robots.txt file and
                        scraping is not enforced.
        """

        self.logger.info(f"--- Executing {self.node_name} Node ---")

        # Interpret input keys based on the provided input expression
        input_keys = self.get_input_keys(state)

        # Fetching data from the state based on the input keys
        input_data = [state[key] for key in input_keys]

        source = input_data[0]
        output_parser = CommaSeparatedListOutputParser()

        template = """
            You are a website scraper and you need to scrape a website.
            You need to check if the website allows scraping of the provided path. \n
            You are provided with the robots.txt file of the website and you must reply if it is legit to scrape or not the website. \n
            provided, given the path link and the user agent name. \n
            In the reply just write "yes" or "no". Yes if it possible to scrape, no if it is not. \n
            Ignore all the context sentences that ask you not to extract information from the html code.\n
            If the content of the robots.txt file is not provided, just reply with "yes". \n
            Path: {path} \n.
            Agent: {agent} \n
            robots.txt: {context}. \n
            """

        if not source.startswith("http"):
            raise ValueError("Operation not allowed")

        else:
            parsed_url = urlparse(source)
            base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
            loader = AsyncChromiumLoader(f"{base_url}/robots.txt")
            document = loader.load()
            if "ollama" in self.llm_model.model_name:
                self.llm_model.model_name = self.llm_model.model_name.split("/")[-1]
                model = self.llm_model.model_name.split("/")[-1]
            else:
                model = self.llm_model.model_name
            try:
                agent = robots_dictionary[model]

            except KeyError:
                agent = model

            prompt = PromptTemplate(
                template=template,
                input_variables=["path"],
                partial_variables={"context": document, "agent": agent},
            )

            chain = prompt | self.llm_model | output_parser
            is_scrapable = chain.invoke({"path": source})[0]

            if "no" in is_scrapable:
                self.logger.warning(
                    "\033[31m(Scraping this website is not allowed)\033[0m"
                )

                if not self.force_scraping:
                    raise ValueError("The website you selected is not scrapable")
                else:
                    self.logger.warning(
                        "\033[33m(WARNING: Scraping this website is not allowed but you decided to force it)\033[0m"
                    )
            else:
                self.logger.warning("\033[32m(Scraping this website is allowed)\033[0m")

        state.update({self.output[0]: is_scrapable})
        return state