diff --git a/examples/knowledge_graph/input/job_postings.json b/examples/knowledge_graph/input/job_postings.json new file mode 100644 index 00000000..10367a1a --- /dev/null +++ b/examples/knowledge_graph/input/job_postings.json @@ -0,0 +1,704 @@ +{ + "Job Postings":{ + "Netflix":[ + { + "title":"Machine Learning Engineer (L4) - Infrastructure Algorithms and ML", + "description":"NA", + "location":"Los Gatos, CA", + "date_posted":"2 weeks ago", + "requirements":[ + "NA" + ] + }, + { + "title":"Machine Learning Engineer L4, Algorithms Engineering", + "description":"NA", + "location":"Los Gatos, CA", + "date_posted":"18 hours ago", + "requirements":[ + "NA" + ] + } + ], + "Rose AI":[ + { + "title":"Machine Learning Engineer Intern", + "description":"NA", + "location":"New York, NY", + "date_posted":"2 weeks ago", + "requirements":[ + "NA" + ] + } + ], + "Team Remotely Inc":[ + { + "title":"Junior Machine Learning Engineer", + "description":"NA", + "location":"Wilmington, DE", + "date_posted":"14 hours ago", + "requirements":[ + "NA" + ] + } + ], + "Zuma":[ + { + "title":"Machine Learning Engineer Intern", + "description":"NA", + "location":"San Francisco Bay Area", + "date_posted":"11 hours ago", + "requirements":[ + "NA" + ] + } + ], + "Tinder":[ + { + "title":"Data Scientist I", + "description":"NA", + "location":"West Hollywood, CA", + "date_posted":"23 hours ago", + "requirements":[ + "NA" + ] + } + ], + "Moveworks":[ + { + "title":"Machine Learning Engineer Intern - NLU & ML Infra", + "description":"NA", + "location":"Mountain View, CA", + "date_posted":"1 month ago", + "requirements":[ + "NA" + ] + } + ], + "Cognitiv":[ + { + "title":"Machine Learning Engineer Intern", + "description":"NA", + "location":"Berkeley, CA", + "date_posted":"1 month ago", + "requirements":[ + "NA" + ] + } + ], + "DoorDash":[ + { + "title":"Machine Learning Engineer, Forecast Platform", + "description":"NA", + "location":"San Francisco, CA", + "date_posted":"1 month ago", + 
"requirements":[ + "NA" + ] + }, + { + "title":"Machine Learning Engineer, Forecast Platform", + "description":"NA", + "location":"Sunnyvale, CA", + "date_posted":"2 months ago", + "requirements":[ + "NA" + ] + }, + { + "title":"Machine Learning Engineer - New Verticals", + "description":"NA", + "location":"New York, NY", + "date_posted":"2 months ago", + "requirements":[ + "NA" + ] + } + ], + "PipeIQ":[ + { + "title":"Machine Learning Engineer Intern (NLP)", + "description":"NA", + "location":"Palo Alto, CA", + "date_posted":"1 month ago", + "requirements":[ + "NA" + ] + } + ], + "Fractal":[ + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"California, United States", + "date_posted":"3 weeks ago", + "requirements":[ + "NA" + ] + } + ], + "Accroid Inc":[ + { + "title":"Machine Learning Engineer/Python", + "description":"NA", + "location":"Austin, TX", + "date_posted":"3 weeks ago", + "requirements":[ + "NA" + ] + } + ], + "Notion":[ + { + "title":"Software Engineer, Machine Learning", + "description":"NA", + "location":"San Francisco, CA", + "date_posted":"2 months ago", + "requirements":[ + "NA" + ] + }, + { + "title":"Software Engineer, Machine Learning", + "description":"NA", + "location":"New York, NY", + "date_posted":"2 months ago", + "requirements":[ + "NA" + ] + } + ], + "PhysicsX":[ + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"New York, United States", + "date_posted":"1 week ago", + "requirements":[ + "NA" + ] + } + ], + "HireIO, Inc.":[ + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"San Francisco, CA", + "date_posted":"1 month ago", + "requirements":[ + "NA" + ] + } + ], + "Dexian Inc":[ + { + "title":"Junior Machine Learning Engineer", + "description":"NA", + "location":"Columbia, MD", + "date_posted":"4 days ago", + "requirements":[ + "NA" + ] + } + ], + "Google":[ + { + "title":"Software Engineer, Early Career", + "description":"NA", + "location":"New York, 
NY", + "date_posted":"11 hours ago", + "requirements":[ + "NA" + ] + }, + { + "title":"Software Engineer, Early Career", + "description":"NA", + "location":"San Francisco, CA", + "date_posted":"11 hours ago", + "requirements":[ + "NA" + ] + }, + { + "title":"Software Engineer, Early Career", + "description":"NA", + "location":"Mountain View, CA", + "date_posted":"11 hours ago", + "requirements":[ + "NA" + ] + }, + { + "title":"Software Engineer, Early Career", + "description":"NA", + "location":"Sunnyvale, CA", + "date_posted":"11 hours ago", + "requirements":[ + "NA" + ] + }, + { + "title":"Customer Engineering, AI/ML (English, Italian)", + "description":"Candidates will typically have 6 years of experience as a technical sales engineer in a cloud computing environment.", + "location":"Milano, Lombardia", + "date_posted":"15 giorni fa", + "requirements":[ + "NA" + ] + } + ], + "Unreal Staffing, Inc":[ + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"San Francisco, CA", + "date_posted":"1 month ago", + "requirements":[ + "NA" + ] + }, + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"San Francisco, CA", + "date_posted":"1 month ago", + "requirements":[ + "NA" + ] + } + ], + "Reveal HealthTech":[ + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"Boston, MA", + "date_posted":"3 days ago", + "requirements":[ + "NA" + ] + } + ], + "Replicate":[ + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"San Francisco, CA", + "date_posted":"4 weeks ago", + "requirements":[ + "NA" + ] + } + ], + "Truveta":[ + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"Greater Seattle Area", + "date_posted":"3 days ago", + "requirements":[ + "NA" + ] + } + ], + "Atlassian":[ + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"San Francisco, CA", + "date_posted":"2 months ago", + "requirements":[ + "NA" + ] + }, + { + 
"title":"Machine Learning Engineer", + "description":"NA", + "location":"United States", + "date_posted":"2 months ago", + "requirements":[ + "NA" + ] + } + ], + "Continua AI, Inc.":[ + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"San Francisco, CA", + "date_posted":"2 months ago", + "requirements":[ + "NA" + ] + }, + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"New York, NY", + "date_posted":"2 months ago", + "requirements":[ + "NA" + ] + }, + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"Seattle, WA", + "date_posted":"2 months ago", + "requirements":[ + "NA" + ] + } + ], + "Software Technology Inc.":[ + { + "title":"Data Scientist/ ML Engineer | Remote | Long Term", + "description":"NA", + "location":"United States", + "date_posted":"1 month ago", + "requirements":[ + "NA" + ] + }, + { + "title":"Data Scientist/ ML Engineer | Remote | Long Term", + "description":"NA", + "location":"United States", + "date_posted":"1 month ago", + "requirements":[ + "NA" + ] + } + ], + "Neptune Technologies LLC":[ + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"United States", + "date_posted":"1 day ago", + "requirements":[ + "NA" + ] + } + ], + "Zoom":[ + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"San Jose, CA", + "date_posted":"4 weeks ago", + "requirements":[ + "NA" + ] + }, + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"California, United States", + "date_posted":"4 weeks ago", + "requirements":[ + "NA" + ] + } + ], + "HP":[ + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"Palo Alto, CA", + "date_posted":"2 weeks ago", + "requirements":[ + "NA" + ] + } + ], + "Enterprise Minds, Inc":[ + { + "title":"Machine Learning Software Engineer", + "description":"NA", + "location":"Mountain View, CA", + "date_posted":"1 week ago", + "requirements":[ + "NA" + ] + } + ], 
+ "Celonis":[ + { + "title":"Machine Learning Engineer Intern", + "description":"NA", + "location":"New York, NY", + "date_posted":"3 weeks ago", + "requirements":[ + "NA" + ] + }, + { + "title":"Machine Learning Engineer Intern", + "description":"NA", + "location":"Palo Alto, CA", + "date_posted":"3 weeks ago", + "requirements":[ + "NA" + ] + } + ], + "Lockheed Martin":[ + { + "title":"A/AI Machine Learning Engineer", + "description":"NA", + "location":"Littleton, CO", + "date_posted":"1 month ago", + "requirements":[ + "NA" + ] + } + ], + "Two Dots":[ + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"Los Angeles, CA", + "date_posted":"2 weeks ago", + "requirements":[ + "NA" + ] + } + ], + "Verneek":[ + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"New York, NY", + "date_posted":"1 week ago", + "requirements":[ + "NA" + ] + } + ], + "Rivian":[ + { + "title":"Machine Learning Software Engineer", + "description":"NA", + "location":"Palo Alto, CA", + "date_posted":"1 month ago", + "requirements":[ + "NA" + ] + } + ], + "Impax Recruitment":[ + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"United States", + "date_posted":"2 weeks ago", + "requirements":[ + "NA" + ] + } + ], + "Stripe":[ + { + "title":"Machine Learning Engineer, Risk", + "description":"NA", + "location":"United States", + "date_posted":"3 weeks ago", + "requirements":[ + "NA" + ] + } + ], + "Adobe":[ + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"San Jose, CA", + "date_posted":"2 months ago", + "requirements":[ + "NA" + ] + } + ], + "Javelin":[ + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"New York City Metropolitan Area", + "date_posted":"1 week ago", + "requirements":[ + "NA" + ] + } + ], + "Ultralytics":[ + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"New York, NY", + "date_posted":"2 weeks ago", + "requirements":[ 
+ "NA" + ] + } + ], + "Supernormal":[ + { + "title":"Machine Learning Engineer (with a focus on modeling)", + "description":"NA", + "location":"Seattle, WA", + "date_posted":"1 month ago", + "requirements":[ + "NA" + ] + } + ], + "Samsung Electronics America":[ + { + "title":"Machine Learning Engineer – Data Science", + "description":"NA", + "location":"Mountain View, CA", + "date_posted":"4 weeks ago", + "requirements":[ + "NA" + ] + } + ], + "Skale":[ + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"San Francisco, CA", + "date_posted":"2 weeks ago", + "requirements":[ + "NA" + ] + } + ], + "Steneral Consulting":[ + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"United States", + "date_posted":"1 month ago", + "requirements":[ + "NA" + ] + } + ], + "Movable Ink":[ + { + "title":"Machine Learning Engineer", + "description":"NA", + "location":"United States", + "date_posted":"2 months ago", + "requirements":[ + "NA" + ] + } + ], + "LHH":[ + { + "title":"DevOps Engineer", + "description":"Per azienda cliente Fit2you, siamo alla ricerca di un DevOps Engineer presso la sede di Milano che possa operare all'intersezione di Fit2you Broker e Air, guidando l'innovazione tecnologica e l'efficienza operativa in entrambi i contesti. 
Questo ruolo unico offre l'opportunità di influenzare significativamente due diversi, ma complementari, settori dell'industria automotive, dal brokeraggio assicurativo ai big data e alle auto connesse.", + "location":"Italy", + "date_posted":"15d", + "requirements":[ + "CI/CD", + "DevOps", + "AWS", + "JavaScript", + "Integrazione continua" + ] + } + ], + "Deloitte":[ + { + "title":"Experienced - Cloud Test Engineer - Cloud Native Development & Migration - NextHub Bari", + "description":"Scopri di più sulle nostre strategie di Corporate Sustainability, tra cui Well-being, la strategia volta a migliorare il benessere fisico, mentale e sociale.", + "location":"Bari", + "date_posted":"14d", + "requirements":[ + "ASP.NET", + "Azure", + "DevOps", + "C#", + "Automazione dei test" + ] + } + ], + "MACMARK":[ + { + "title":"MID/SENIOR BACKEND DEVELOPER IN PRESENZA", + "description":"Sarà possibile solo lavorare in presenza, pertanto sei disponibile a lavorare nella sede di Rende (CS)? Buona propensione nel lavorare in Team.", + "location":"Rende", + "date_posted":"7d", + "requirements":[ + "Infrastrutture cloud", + "Azure", + "CSS", + "Git", + "Google Cloud Platform" + ] + }, + { + "title":"MID/SENIOR FRONTEND DEVELOPER IN PRESENZA", + "description":"Buona propensione nel lavorare in Team. O Laura in informativa ed almeno 1/2 anni di esperienza in un contesto di sviluppo software.", + "location":"Rende", + "date_posted":"7d", + "requirements":[ + "Infrastrutture cloud", + "CSS", + "React", + "Git", + "Google Cloud Platform" + ] + } + ], + "Assist Digital Spa":[ + { + "title":"System & Networking Engineer", + "description":"Eu. 
Il Trattamento è realizzato, con il suo consenso, per realizzare processi di ricerca, selezione e valutazione del personale svolti per conto proprio, per.", + "location":"Roma", + "date_posted":"30d+", + "requirements":[ + "Inglese", + "Windows", + "Sistemi di sicurezza", + "AWS", + "Virtualizzazione" + ] + }, + { + "title":"Prompt Engineer", + "description":"You, as data subject of the processing of personal data, may exercise at any time the rights expressly granted by the European Regulation, and in particular.", + "location":"Roma", + "date_posted":"30d+", + "requirements":[ + "Strutture dati", + "Inglese", + "Google Cloud Platform", + "AWS", + "C" + ] + } + ], + "TOOLS FOR SMART MINDS S.r.l.":[ + { + "title":"Sviluppatore software", + "description":"predisposizione a lavorare in team. La nostra missione è creare valore per le aziende che vogliono intraprendere la trasformazione 4.0 con soluzioni su misura.", + "location":"Castel Mella", + "date_posted":"30d+", + "requirements":[ + "Inglese", + "Machine learning", + "Intelligenza artificiale" + ] + }, + { + "title":"Sviluppatore software - linguaggio OWL e SPARQL", + "description":"predisposizione a lavorare in team. La nostra missione è creare valore per le aziende che vogliono intraprendere la trasformazione 4.0 con soluzioni su misura." 
"""Knowledge-graph retrieval example.

Loads the FAISS index stored under ``output/faiss_index`` (the directory
``save_vector.py`` writes to), runs a similarity search for a free-text
prompt, and renders the job-postings graph with the retrieved companies
highlighted.
"""
import json
import os

from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

from scrapegraphai.utils import create_graph, create_interactive_graph_retrieval

load_dotenv()

# Load the OpenAI API key and the embeddings model
openai_key = os.getenv("OPENAI_APIKEY")
embeddings_model = OpenAIEmbeddings(api_key=openai_key)

# Paths (resolved relative to this script so it can be run from any cwd)
curr_dir = os.path.dirname(os.path.realpath(__file__))
json_file_path = os.path.join(curr_dir, 'input', 'job_postings.json')
vector_store_output_path = os.path.join(curr_dir, 'output', 'faiss_index')
retrieval_graph_output_path = os.path.join(curr_dir, 'output', 'job_postings_retrieval.html')

# Load the job postings JSON file.
# The file contains non-ASCII text, so decode it explicitly as UTF-8 instead
# of relying on the platform's default encoding.
with open(json_file_path, 'r', encoding='utf-8') as f:
    job_postings = json.load(f)

# Load the vector store.
# NOTE(review): the index directory includes a pickle file (index.pkl), which
# is why allow_dangerous_deserialization is needed; only load an index that
# you created yourself or otherwise trust.
db = FAISS.load_local(
    vector_store_output_path,
    embeddings_model,
    allow_dangerous_deserialization=True
)

# User prompt for similarity search
user_prompt = "Company based United States with job title Software Engineer"

# Similarity search on the vector store.
# NOTE(review): fetch_k presumably only matters when a filter is supplied; if
# the intent is "return the 10 best matches", k=10 is likely what is wanted.
# Confirm against the FAISS vector store documentation before changing it.
result = db.similarity_search_with_score(user_prompt, fetch_k=10)

# Each hit is a (document, score) pair whose page_content is a company name.
# The index holds one entry per job, so the same company can be returned
# several times: keep the first occurrence of each, preserving rank order.
found_companies = list(dict.fromkeys(doc.page_content for doc, _score in result))

# Build the graph
graph = create_graph(job_postings)

# Create the interactive graph
create_interactive_graph_retrieval(graph, found_companies, output_file=retrieval_graph_output_path)
"""Build the FAISS vector store for the knowledge-graph retrieval example.

Reads ``input/job_postings.json``, embeds one text per job posting (the text
is the company name; the job fields travel along as metadata), and saves the
resulting index to ``output/faiss_index``.
"""
import json
import os

from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

load_dotenv()

# Load the OpenAI API key and the embeddings model
openai_key = os.getenv("OPENAI_APIKEY")
embeddings_model = OpenAIEmbeddings(api_key=openai_key)

# Paths (resolved relative to this script so it can be run from any cwd)
curr_dir = os.path.dirname(os.path.realpath(__file__))
json_file_path = os.path.join(curr_dir, 'input', 'job_postings.json')
vector_store_output_path = os.path.join(curr_dir, 'output', 'faiss_index')

# Load the job postings JSON file.
# The file contains non-ASCII text, so decode it explicitly as UTF-8 instead
# of relying on the platform's default encoding.
with open(json_file_path, 'r', encoding='utf-8') as f:
    job_postings = json.load(f)

texts = []
metadata = []

# Extract company names and job details.
# texts[i] and metadata[i] describe the same posting, so the two lists must
# stay index-aligned. Postings may omit fields, hence the .get() defaults.
for company, jobs in job_postings["Job Postings"].items():
    for job in jobs:
        texts.append(company)
        metadata.append({
            "title": job.get("title", "N/A"),
            "description": job.get("description", "N/A"),
            "location": job.get("location", "N/A"),
            "date_posted": job.get("date_posted", "N/A"),
            "requirements": job.get("requirements", [])
        })

# Create the vector store
db = FAISS.from_texts(texts=texts, embedding=embeddings_model, metadatas=metadata)

# Save the embeddings locally
db.save_local(vector_store_output_path)
def add_customizations_retrieval(net, graph, found_companies):
    """Copy ``graph`` into ``net``, highlighting the retrieved companies.

    For every company in ``found_companies`` that exists in the graph, the
    company node, the chain of first predecessors leading towards the
    "Job Postings" root, the company's ``"<company>-Job<i>"`` nodes and the
    edges connecting them are drawn in red. Everything else is drawn in a
    translucent light grey.

    Args:
        net: Object exposing ``add_node(node, label=, color=, size=, title=)``
            and ``add_edge(source, target, color=)``.
        graph: Directed graph exposing ``nodes``, ``edges``, ``predecessors``,
            ``successors`` and ``out_degree``. Node identifiers are assumed to
            be strings (they are split on ``'-'`` to build default labels).
        found_companies: Iterable of company node names to highlight. Names
            missing from the graph and repeated names are ignored; ``None``
            is treated as empty.

    Returns:
        The same ``net`` object, with all nodes and edges added.
    """
    root = "Job Postings"
    highlight = 'red'
    dimmed = 'rgba(211, 211, 211, 0.5)'  # light grey with transparency

    # Custom color and size for the root. Note that the root is re-styled as
    # a highlighted node as soon as one highlighted path reaches it.
    node_colors = {root: '#8470FF'}
    node_sizes = {root: 50}

    highlighted_nodes = set()
    highlighted_edges = set()

    # dict.fromkeys de-duplicates while keeping the caller's order.
    for company in dict.fromkeys(found_companies or ()):
        # A retrieved name may not be part of this graph (e.g. a stale
        # index); skip it instead of letting the graph lookups raise.
        if company not in graph.nodes:
            continue

        highlighted_nodes.add(company)
        node_colors[company] = highlight
        node_sizes[company] = 30

        # Highlight the path to the root, following the first predecessor.
        # ``visited`` guards against looping forever on a cyclic graph.
        node = company
        visited = {company}
        while node != root:
            parent = next(iter(graph.predecessors(node)), None)
            if parent is None or parent in visited:
                break
            visited.add(parent)
            highlighted_nodes.add(parent)
            node_colors[parent] = highlight
            node_sizes[parent] = 30
            highlighted_edges.add((parent, node))
            node = parent

        # Highlight job nodes and edges. Job nodes are looked up by their
        # "<company>-Job<i>" name, for i in 1..out_degree(company).
        for idx in range(1, graph.out_degree(company) + 1):
            job_node = f"{company}-Job{idx}"
            if job_node not in graph.nodes:
                continue
            highlighted_nodes.add(job_node)
            node_colors[job_node] = highlight
            node_sizes[job_node] = 20
            highlighted_edges.add((company, job_node))

            # Job detail nodes stay dimmed; only the edges to them are red.
            for detail in graph.successors(job_node):
                if detail not in highlighted_nodes:
                    node_colors[detail] = dimmed
                    node_sizes[detail] = 10
                highlighted_edges.add((job_node, detail))

    # Dim every node that received no explicit style. Names containing '-'
    # (job / job-detail nodes) are drawn smaller than the others.
    for node in graph.nodes:
        if node not in node_colors:
            node_colors[node] = dimmed
            node_sizes[node] = 10 if '-' in node else 15

    # Add nodes and edges to the network with customized styles
    for node in graph.nodes:
        attrs = graph.nodes[node]
        net.add_node(node,
                     label=attrs.get('label', node.split('-')[-1]),
                     color=node_colors[node],
                     size=node_sizes[node],
                     title=attrs.get('title', ''))
    for edge in graph.edges:
        edge_color = highlight if edge in highlighted_edges else dimmed
        net.add_edge(edge[0], edge[1], color=edge_color)

    return net


def create_interactive_graph_retrieval(graph, found_companies, output_file='interactive_graph.html'):
    """Render ``graph`` with retrieved companies highlighted and open it.

    Builds a ``Network``, styles it with ``add_customizations_retrieval``,
    saves it as HTML to ``output_file`` and opens that file in the default
    web browser.

    Args:
        graph: Directed graph of job postings.
        found_companies: Iterable of company node names to highlight.
        output_file: Path of the HTML file to write.
    """
    from pathlib import Path

    net = Network(notebook=False, height='1000px', width='100%', bgcolor='white', font_color='black')
    net = add_customizations_retrieval(net, graph, found_companies)
    net.save_graph(output_file)

    # Automatically open the generated HTML file in the default web browser.
    # as_uri() yields a well-formed, percent-encoded file:// URL on every
    # platform, which plain "file://" + path concatenation does not.
    webbrowser.open(Path(os.path.realpath(output_file)).as_uri())