feat(multiple_search): working multiple-search example; schema is now passed via the graph config

2026-06-25 21:11:11 +08:00 · 2024-05-18 01:51:12 +02:00 · 2024-05-18 01:51:12 +02:00 · bed3eed50c
commit bed3eed50c
parent 05e511e36f
6 changed files with 53 additions and 39 deletions
--- a/examples/openai/multiple_search_openai.py
+++ b/examples/openai/multiple_search_openai.py
@ -10,6 +10,36 @@ from scrapegraphai.utils import prettify_exec_info
 load_dotenv()


+schema= """{ 
+    "Job Postings": { 
+        "Company x": [ 
+            { 
+                "title": "...", 
+                "description": "...", 
+                "location": "...", 
+                "date_posted": "..", 
+                "requirements": ["...", "...", "..."] 
+            }, 
+            { 
+                "title": "...", 
+                "description": "...", 
+                "location": "...", 
+                "date_posted": "..", 
+                "requirements": ["...", "...", "..."] 
+            } 
+        ], 
+        "Company y": [ 
+            { 
+                "title": "...", 
+                "description": "...", 
+                "location": "...", 
+                "date_posted": "..", 
+                "requirements": ["...", "...", "..."] 
+            } 
+        ] 
+    } 
+}"""
+
 # ************************************************
 # Define the configuration for the graph
 # ************************************************
@ -19,47 +49,23 @@ openai_key = os.getenv("OPENAI_APIKEY")
 graph_config = {
    "llm": {
        "api_key": openai_key,
-        "model": "gpt-4o",
+        "model": "gpt-3.5-turbo",
    },
    "verbose": True,
    "headless": False,
+    "schema": schema,
 }

-schema= """{ 
-    "Job Postings": { 
-        "Company A": [ 
-            { 
-                "title": "Software Engineer", 
-                "description": "Develop and maintain software applications.", 
-                "location": "New York, NY", 
-                "date_posted": "2024-05-01", 
-                "requirements": ["Python", "Django", "REST APIs"] 
-            }, 
-            { 
-                "title": "Data Scientist", 
-                "description": "Analyze and interpret complex data.", 
-                "location": "San Francisco, CA", 
-                "date_posted": "2024-05-05", 
-                "requirements": ["Python", "Machine Learning", "SQL"] 
-            } 
-        ], 
-        "Company B": [ 
-            { 
-                "title": "Project Manager", 
-                "description": "Manage software development projects.", 
-                "location": "Boston, MA", 
-                "date_posted": "2024-04-20", 
-                "requirements": ["Project Management", "Agile", "Scrum"] 
-            } 
-        ] 
-    } 
-}"""
+

 multiple_search_graph = MultipleSearchGraph(
    prompt="List me all the projects with their description",
-    source= ["https://perinim.github.io/projects/", "https://perinim.github.io/projects/"],
+    source= [
+        "https://www.linkedin.com/jobs/machine-learning-engineer-offerte-di-lavoro/?currentJobId=3889037104&originalSubdomain=it",
+        "https://www.glassdoor.com/Job/italy-machine-learning-engineer-jobs-SRCH_IL.0,5_IN120_KO6,31.html",
+        "https://it.indeed.com/jobs?q=ML+engineer&vjk=3c2e6d27601ffaaa"
+        ],
    config=graph_config,
-    schema = schema
 )

 result = multiple_search_graph.run()
--- a/scrapegraphai/graphs/abstract_graph.py
+++ b/scrapegraphai/graphs/abstract_graph.py
@ -40,12 +40,11 @@ class AbstractGraph(ABC):
        >>> result = my_graph.run()
    """

-    def __init__(self, prompt: str, config: dict, source: Optional[str] = None, schema: Optional[dict]=None):
+    def __init__(self, prompt: str, config: dict, source: Optional[str] = None):

        self.prompt = prompt
        self.source = source
        self.config = config
-        self.schema = schema
        self.llm_model = self._create_llm(config["llm"], chat=True)
        self.embedder_model = self._create_default_embedder(llm_config=config["llm"]
                                                            ) if "embeddings" not in config else self._create_embedder(
@ -62,6 +61,7 @@ class AbstractGraph(ABC):
        self.headless = True if config is None else config.get(
            "headless", True)
        self.loader_kwargs = config.get("loader_kwargs", {})
+        self.schema = config.get("schema", None)

        common_params = {"headless": self.headless,
                         "verbose": self.verbose,
@ -69,6 +69,7 @@ class AbstractGraph(ABC):
                         "llm_model": self.llm_model,
                         "embedder_model": self.embedder_model,
                         "schema": self.schema}
+        
        self.set_common_params(common_params, overwrite=False)

    def set_common_params(self, params: dict, overwrite=False):
--- a/scrapegraphai/graphs/multiple_search_graph.py
+++ b/scrapegraphai/graphs/multiple_search_graph.py
@ -14,6 +14,8 @@ from .abstract_graph import AbstractGraph
 from .smart_scraper_graph import SmartScraperGraph

 from typing import List, Optional
+
+
 class MultipleSearchGraph(AbstractGraph):
    """ 
    MultipleSearchGraph is a scraping pipeline that searches the internet for answers to a given prompt.
@ -39,7 +41,7 @@ class MultipleSearchGraph(AbstractGraph):
        >>> result = search_graph.run()
    """

-    def __init__(self, prompt: str, source: List[str], config: dict, schema:Optional[dict]= None):
+    def __init__(self, prompt: str, source: List[str], config: dict):

        self.max_results = config.get("max_results", 3)

@ -48,7 +50,7 @@ class MultipleSearchGraph(AbstractGraph):
        else:
            self.copy_config = deepcopy(config)

-        super().__init__(prompt, config)
+        super().__init__(prompt, config, source)

    def _create_graph(self) -> BaseGraph:
        """
@ -65,7 +67,7 @@ class MultipleSearchGraph(AbstractGraph):
        smart_scraper_instance = SmartScraperGraph(
            prompt="",
            source="",
-            config=self.copy_config
+            config=self.copy_config,
        )

        # ************************************************
@ -85,6 +87,7 @@ class MultipleSearchGraph(AbstractGraph):
            output=["answer"],
            node_config={
                "llm_model": self.llm_model,
+                "schema": self.config.get("schema", None),
            }
        )

--- a/scrapegraphai/graphs/smart_scraper_graph.py
+++ b/scrapegraphai/graphs/smart_scraper_graph.py
@ -81,7 +81,8 @@ class SmartScraperGraph(AbstractGraph):
            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
            output=["answer"],
            node_config={
-                "llm_model": self.llm_model
+                "llm_model": self.llm_model,
+                "schema": self.config.get("schema", None),
            }
        )

--- a/scrapegraphai/nodes/generate_answer_node.py
+++ b/scrapegraphai/nodes/generate_answer_node.py
@ -35,7 +35,7 @@ class GenerateAnswerNode(BaseNode):

    def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None,
                 node_name: str = "GenerateAnswer"):
-        print(node_config)
+
        super().__init__(node_name, "node", input, output, 2, node_config)

        self.llm_model = node_config["llm_model"]
--- a/scrapegraphai/nodes/merge_answers_node.py
+++ b/scrapegraphai/nodes/merge_answers_node.py
@ -79,6 +79,8 @@ class MergeAnswersNode(BaseNode):
        You need to merge the content from the different websites into a single answer without repetitions (if there are any). \n
        The scraped contents are in a JSON format and you need to merge them based on the context and providing a correct JSON structure.\n
        OUTPUT INSTRUCTIONS: {format_instructions}\n
+        You must format the output with the following schema, if not None:\n
+        SCHEMA: {schema}\n
        USER PROMPT: {user_prompt}\n
        WEBSITE CONTENT: {website_content}
        """
@ -89,6 +91,7 @@ class MergeAnswersNode(BaseNode):
            partial_variables={
                "format_instructions": format_instructions,
                "website_content": answers_str,
+                "schema": self.node_config.get("schema", None),
            },
        )