From 5d453910167d35206385b65e8144a39b56860f29 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sun, 19 Apr 2026 16:42:06 -0700
Subject: [PATCH] llama/compat: rewrite gemma4 tokenizer model to BPE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ollama's converter writes `tokenizer.ggml.model = 'llama'` (SPM) on
gemma4 GGUFs, but the gemma4 model family actually uses BPE —
upstream-converted reference GGUFs use `'gemma4'`, which selects
LLAMA_VOCAB_TYPE_BPE in src/llama-vocab.cpp.

With the wrong tokenizer type, gemma4's special tokens (e.g.
`<|thought|>`, `<|turn>`, `<|channel>`) get split into multiple SPM
subword pieces, so when the model emits them they come out as raw
text instead of being recognized as control tokens. This surfaces as,
for example, the literal word "thought" appearing in the chat output
instead of being routed to the model's reasoning channel.

Ollama already supplies `tokenizer.ggml.merges` (needed for BPE) and
`tokenizer.ggml.pre = 'gemma4'`, so flipping just the model name is
enough; upstream's gemma4 init reads merges and sets pre-type
correctly. The `add_bos = False` we ship is also auto-overridden to
True by upstream's gemma4 workaround (PR 21500).

Verified: with the fix, the prompt `Hey` yields "Hello! How can I help
you today?" (a coherent greeting); without the fix, the model emitted
raw "thought" text into the user-visible response.
---
 llama/compat/llama-ollama-compat.cpp | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/llama/compat/llama-ollama-compat.cpp b/llama/compat/llama-ollama-compat.cpp
index 84866b24f..0fb4ae1d5 100644
--- a/llama/compat/llama-ollama-compat.cpp
+++ b/llama/compat/llama-ollama-compat.cpp
@@ -221,11 +221,30 @@ bool detect_ollama_gemma4(const gguf_context * meta, const ggml_context * ctx) {
 
 void handle_gemma4(const llama_model_loader * ml, gguf_context * meta, ggml_context * ctx) {
     if (!detect_ollama_gemma4(meta, ctx)) return;
-    (void) meta;
     (void) ctx;
 
     LLAMA_LOG_INFO("%s: detected Ollama-format gemma4 GGUF; applying compatibility fixes\n", __func__);
 
+    // Tokenizer fix: Ollama writes `tokenizer.ggml.model = 'llama'` (SPM) on
+    // gemma4 GGUFs, but gemma4 actually uses BPE — upstream-converted GGUFs
+    // use `'gemma4'` which selects LLAMA_VOCAB_TYPE_BPE in src/llama-vocab.cpp.
+    // With the wrong tokenizer type, gemma4's special tokens (e.g.
+    // `<|thought|>`, `<|turn>`, `<|channel>`) get split into multiple SPM
+    // subword pieces, so when the model emits them they come out as raw
+    // text instead of being recognized as control tokens.
+    //
+    // Ollama already supplies `tokenizer.ggml.merges` (needed for BPE) and
+    // `tokenizer.ggml.pre = 'gemma4'`, so flipping the model name is enough.
+    {
+        const int64_t kid = gguf_find_key(meta, "tokenizer.ggml.model");
+        if (kid >= 0) {
+            const char * cur = gguf_get_val_str(meta, kid);
+            if (cur && std::strcmp(cur, "llama") == 0) {
+                gguf_set_val_str(meta, "tokenizer.ggml.model", "gemma4");
+            }
+        }
+    }
+
     // Hide embedded audio + vision + projector tensors from the text loader.
     add_skip_prefix(ml, "a.");
     add_skip_prefix(ml, "v.");