stack/apps/dashboard/src/lib/classify-query.ts
2026-01-30 17:56:10 -08:00

288 lines
11 KiB
TypeScript

/**
* Classifies whether user input is a ClickHouse SQL query or a natural language prompt.
* Used to prioritize "Run Query" vs "Ask AI" in the command center.
*
* @param input - The user's input string
* @param options.readonlyOnly - If true, only readonly queries (SELECT, SHOW, DESCRIBE, EXPLAIN) are considered SQL.
* Mutating statements (INSERT, UPDATE, DELETE, etc.) are treated as prompts.
*/
export function classifyClickHouseSqlVsPrompt(
input: unknown,
options: { readonlyOnly?: boolean } = {}
): {
kind: "sql" | "prompt",
confidence: number,
reasons: string[],
score?: { sql: number, prompt: number, margin: number },
} {
const { readonlyOnly = false } = options;
const raw = (input ?? "").toString();
const s = raw.trim();
if (!s) {
return { kind: "prompt", confidence: 0.5, reasons: ["empty"] };
}
// Strip code fences if someone pasted markdown
const unfenced = s.replace(/^```[\w-]*\n([\s\S]*?)\n```$/m, "$1").trim();
const lower = unfenced.toLowerCase();
const words = lower.match(/[a-z_]+/g) ?? [];
const wordSet = new Set(words);
let sql = 0;
let prompt = 0;
const reasons: string[] = [];
// Helper: check if keyword appears in UPPERCASE (stronger signal)
const hasUppercaseKeyword = (keyword: string) => {
const regex = new RegExp(`\\b${keyword.toUpperCase()}\\b`);
return regex.test(unfenced);
};
// Define readonly vs mutating keywords
const readonlyKeywords = ["select", "show", "describe", "desc", "explain", "with"];
const mutatingKeywords = ["insert", "update", "delete", "alter", "create", "drop", "truncate", "use", "set"];
// 1) Check for mutating statements when in readonly mode
const startsWithMutating = new RegExp(`^(${mutatingKeywords.join("|")})\\b`, "i").test(unfenced);
if (readonlyOnly && startsWithMutating) {
// In readonly mode, mutating statements are not valid SQL for our purposes
prompt += 4;
reasons.push("mutating-statement-in-readonly-mode");
}
// 2) Strong starts - uppercase keywords are much stronger signals
const readonlyStartRegex = new RegExp(`^(${readonlyKeywords.join("|")})\\b`, "i");
const startsWithReadonly = readonlyStartRegex.test(unfenced);
const allSqlKeywords = readonlyOnly ? readonlyKeywords : [...readonlyKeywords, ...mutatingKeywords];
const sqlStartRegex = new RegExp(`^(${allSqlKeywords.join("|")})\\b`, "i");
const startsWithSqlKeyword = sqlStartRegex.test(unfenced);
// SHOW special handling - "show me" is English, "SHOW TABLES" is SQL
const showEnglishLead = /^show\s+(me|us|my|our|your|them|him|her)\b/i.test(unfenced);
const showHasSqlTarget = /^show\s+(tables?|databases?|columns?|create|processlist|functions?|settings|grants|roles|quotas|dictionary|dictionaries|clusters|indexes|partitions|privileges|users?)\b/i.test(unfenced);
const startsWithShowSql = /^show\b/i.test(unfenced) && showHasSqlTarget && !showEnglishLead;
// Extra points for SHOW/DESCRIBE commands that clearly target SQL objects
if (startsWithShowSql) {
sql += hasUppercaseKeyword("SHOW") ? 3 : 1;
reasons.push("show-sql-target");
}
const startsWithSql = (startsWithSqlKeyword && !startsWithMutating) || startsWithShowSql ||
(startsWithMutating && !readonlyOnly);
if (startsWithSql) {
// Uppercase start is much stronger
const firstWord = unfenced.split(/\s+/)[0];
if (firstWord === firstWord.toUpperCase() && /^[A-Z]+$/.test(firstWord)) {
sql += 4;
reasons.push("starts-with-uppercase-sql-keyword");
} else {
sql += 1; // Lowercase start is weak - could be English
reasons.push("starts-with-lowercase-sql-keyword");
}
}
// 3) Structural patterns - these are strong signals
// SELECT ... FROM pattern - but check for SQL-like structure vs English
const selectFromMatch = /\bselect\b([\s\S]{0,300})\bfrom\b/i.exec(unfenced);
if (selectFromMatch) {
const between = selectFromMatch[1];
// Check for SQL-specific patterns between SELECT and FROM
const hasStar = /\*/.test(between);
const hasComma = /,/.test(between);
const hasDotNotation = /\w+\.\w+/.test(between);
const hasFunction = /\w+\s*\(/.test(between);
const isUppercase = /\bSELECT\b/.test(unfenced) && /\bFROM\b/.test(unfenced);
if (hasStar || hasDotNotation || hasFunction || isUppercase) {
sql += isUppercase ? 5 : 3;
reasons.push("select-from-structure-with-sql-syntax");
} else if (hasComma) {
sql += 2;
reasons.push("select-from-structure-with-comma");
} else {
// "select all events from the table" - looks like English
sql += 1;
prompt += 2;
reasons.push("select-from-looks-like-english");
}
}
// INSERT INTO pattern (only if not readonly mode)
if (!readonlyOnly) {
const hasInsertInto = /\binsert\b[\s\S]{0,80}\binto\b/i.test(unfenced);
if (hasInsertInto) {
const isUppercase = /\bINSERT\b/.test(unfenced) && /\bINTO\b/.test(unfenced);
sql += isUppercase ? 5 : 3;
reasons.push("insert-into-structure");
}
}
// 4) SQL-specific operators and punctuation (rare in English)
// * is very SQL-specific when not preceded by space (not "user * activity")
const hasSqlStar = /SELECT\s+\*|,\s*\*|^\s*\*\s*,|\w\.\*/.test(unfenced);
if (hasSqlStar) {
sql += 3;
reasons.push("sql-star-operator");
}
// Dot notation without space after (table.column) - very SQL
const dotNotationCount = (unfenced.match(/\b\w+\.\w+\b/g) ?? []).length;
if (dotNotationCount >= 2) {
sql += 3;
reasons.push("multiple-dot-notations");
} else if (dotNotationCount === 1) {
sql += 1;
reasons.push("dot-notation");
}
// Comparison operators with = (rare in natural language)
const hasEquals = /[^!<>=]=[^=]/.test(unfenced);
const hasComparison = /(<=|>=|!=|<>)/.test(unfenced);
if (hasComparison) {
sql += 3;
reasons.push("sql-comparison-operators");
} else if (hasEquals) {
sql += 2;
reasons.push("equals-operator");
}
// Parentheses with content (function calls, subqueries)
const parenContent = unfenced.match(/\w+\s*\([^)]+\)/g) ?? [];
if (parenContent.length >= 2) {
sql += 2;
reasons.push("multiple-function-calls");
} else if (parenContent.length === 1) {
sql += 1;
reasons.push("function-call");
}
// Semicolon at end - very SQL
if (/;\s*$/.test(unfenced)) {
sql += 2;
reasons.push("ends-with-semicolon");
}
// 5) SQL clauses - uppercase is stronger
const clauses = ["where", "group", "order", "limit", "having", "join", "union", "distinct", "format", "settings"];
const clauseHits: string[] = [];
let uppercaseClauseCount = 0;
for (const clause of clauses) {
if (wordSet.has(clause)) {
clauseHits.push(clause);
if (hasUppercaseKeyword(clause)) {
uppercaseClauseCount++;
}
}
}
if (clauseHits.length) {
// Uppercase clauses are much stronger
sql += uppercaseClauseCount * 2 + Math.min(2, clauseHits.length - uppercaseClauseCount);
reasons.push(`sql-clauses:${clauseHits.join(",")}`);
if (uppercaseClauseCount > 0) {
reasons.push(`uppercase-clauses:${uppercaseClauseCount}`);
}
}
// 6) ClickHouse-specific tokens
const chTokens = [
"prewhere", "final", "sample", "engine", "partition", "ttl", "distributed", "merge", "replacing", "collapsing",
"materialized", "interval"
];
const chHits = chTokens.filter(k => wordSet.has(k));
if (chHits.length) {
sql += chHits.length >= 2 ? 3 : 2;
reasons.push("clickhouse-ish:" + chHits.join(","));
}
// 7) Quoted identifiers (very SQL-specific)
if (/`[^`]+`/.test(unfenced)) {
sql += 2;
reasons.push("backtick-identifiers");
}
if (/"[^"]+"/.test(unfenced) && !/"\s/.test(unfenced)) {
sql += 1;
reasons.push("quoted-identifiers");
}
// SQL comments
if (/--|\/\*/.test(unfenced)) {
sql += 2;
reasons.push("sql-comments");
}
// ===== PROMPT SIGNALS =====
// Question mark at end - strong prompt signal
if (/[?]\s*$/.test(unfenced)) {
prompt += 3;
reasons.push("ends-with-question-mark");
}
// Conversational/prompt words
const promptWords = ["please", "could", "can", "would", "what", "why", "how", "explain", "help", "tell", "show me", "get me", "find me", "give me"];
const promptWordHits = promptWords.filter(w => lower.includes(w));
if (promptWordHits.length) {
prompt += Math.min(4, promptWordHits.length * 2);
reasons.push("prompt-words:" + promptWordHits.join(","));
}
// Articles and prepositions common in English but rare in SQL
const englishWords = ["the", "a", "an", "all", "some", "any", "every", "each", "this", "that", "these", "those"];
const englishHits = englishWords.filter(w => wordSet.has(w));
if (englishHits.length >= 2) {
prompt += 2;
reasons.push("english-articles");
}
// Low symbol density with no SQL operators = likely English
// But don't penalize short SQL commands like "SHOW TABLES" or "DESCRIBE events"
const symbolChars = (unfenced.match(/[^a-z0-9_\s]/gi) ?? []).length;
const symbolRatio = symbolChars / Math.max(1, unfenced.length);
const hasSqlOperators = hasEquals || hasComparison || hasSqlStar || dotNotationCount > 0;
const isShortCommand = words.length <= 3;
if (symbolRatio < 0.04 && !hasSqlOperators && !startsWithSql) {
prompt += 3;
reasons.push("low-symbol-no-operators");
} else if (symbolRatio < 0.08 && !hasSqlOperators && !startsWithSql && !isShortCommand) {
prompt += 2;
reasons.push("low-symbol-density");
}
// "select" followed by English words without SQL structure
if (/^select\b/i.test(unfenced) && !selectFromMatch) {
prompt += 4;
reasons.push("select-without-from");
} else if (/^select\b/i.test(unfenced) && selectFromMatch && !hasSqlStar && dotNotationCount === 0 && !hasEquals) {
// "select all events from the table from the last 24 hours" - very English-like
const hasEnglishPattern = /\bfrom the\b|\bin the\b|\bof the\b|\bfor the\b/i.test(unfenced);
if (hasEnglishPattern) {
prompt += 4;
reasons.push("english-phrase-pattern");
}
}
// Long sentences without SQL structure are likely prompts
const wordCount = words.length;
if (wordCount > 10 && !hasSqlOperators && clauseHits.length <= 1) {
prompt += 2;
reasons.push("long-sentence-no-sql-structure");
}
// ===== FINAL SCORING =====
const margin = sql - prompt;
// Require a higher margin to classify as SQL (when in doubt, prefer prompt)
const kind = margin >= 3 ? "sql" : "prompt";
// Confidence: squish margin into [0.5..0.99]
const confidence = Math.max(0.5, Math.min(0.99, 0.5 + Math.abs(margin) * 0.08));
return { kind, confidence, reasons, score: { sql, prompt, margin } };
}