From 5d453910167d35206385b65e8144a39b56860f29 Mon Sep 17 00:00:00 2001 From: jmorganca Date: Sun, 19 Apr 2026 16:42:06 -0700 Subject: [PATCH] llama/compat: rewrite gemma4 tokenizer model to BPE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ollama's converter writes `tokenizer.ggml.model = 'llama'` (SPM) on gemma4 GGUFs, but the gemma4 model family actually uses BPE — upstream-converted reference GGUFs use `'gemma4'` which selects LLAMA_VOCAB_TYPE_BPE in src/llama-vocab.cpp. With the wrong tokenizer type, gemma4's special tokens (e.g. `<|thought|>`, `<|turn>`, `<|channel>`) get split into multiple SPM subword pieces, so when the model emits them they come out as raw text instead of being recognized as control tokens — this surfaces as, e.g., the literal word "thought" appearing in the chat output instead of being routed to the model's reasoning channel. Ollama already supplies `tokenizer.ggml.merges` (needed for BPE) and `tokenizer.ggml.pre = 'gemma4'`, so flipping just the model name is enough; upstream's gemma4 init reads merges and sets pre-type correctly. The `add_bos = False` we ship is also auto-overridden to True by upstream's gemma4 workaround (PR 21500). Verified: with the fix, `Hey` -> "Hello! How can I help you today?" (coherent greeting); without the fix the model emitted raw "thought" text into the user-visible response. 
--- llama/compat/llama-ollama-compat.cpp | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/llama/compat/llama-ollama-compat.cpp b/llama/compat/llama-ollama-compat.cpp index 84866b24f..0fb4ae1d5 100644 --- a/llama/compat/llama-ollama-compat.cpp +++ b/llama/compat/llama-ollama-compat.cpp @@ -221,11 +221,30 @@ bool detect_ollama_gemma4(const gguf_context * meta, const ggml_context * ctx) { void handle_gemma4(const llama_model_loader * ml, gguf_context * meta, ggml_context * ctx) { if (!detect_ollama_gemma4(meta, ctx)) return; - (void) meta; (void) ctx; LLAMA_LOG_INFO("%s: detected Ollama-format gemma4 GGUF; applying compatibility fixes\n", __func__); + // Tokenizer fix: Ollama writes `tokenizer.ggml.model = 'llama'` (SPM) on + // gemma4 GGUFs, but gemma4 actually uses BPE — upstream-converted GGUFs + // use `'gemma4'` which selects LLAMA_VOCAB_TYPE_BPE in src/llama-vocab.cpp. + // With the wrong tokenizer type, gemma4's special tokens (e.g. + // `<|thought|>`, `<|turn>`, `<|channel>`) get split into multiple SPM + // subword pieces, so when the model emits them they come out as raw + // text instead of being recognized as control tokens. + // + // Ollama already supplies `tokenizer.ggml.merges` (needed for BPE) and + // `tokenizer.ggml.pre = 'gemma4'`, so flipping the model name is enough. + { + const int64_t kid = gguf_find_key(meta, "tokenizer.ggml.model"); + if (kid >= 0) { + const char * cur = gguf_get_val_str(meta, kid); + if (cur && std::strcmp(cur, "llama") == 0) { + gguf_set_val_str(meta, "tokenizer.ggml.model", "gemma4"); + } + } + } + // Hide embedded audio + vision + projector tensors from the text loader. add_skip_prefix(ml, "a."); add_skip_prefix(ml, "v.");