From 8fa6648650bf99511fdf99d69671ce4a43d564d2 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sun, 19 Apr 2026 12:24:16 -0700
Subject: [PATCH] llama/compat: add qwen35moe text handler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Text-only support for Ollama's qwen3.5 (qwen35moe) blobs. Vision is more
involved (QKV merge, patch_embed reshape/split) and follows in a later
commit.

Detection markers: qwen35moe.vision.block_count, qwen35moe.image_token_id,
qwen35moe.ssm.v_head_reordered, qwen35moe.feed_forward_length,
qwen35moe.rope.mrope_interleaved, mtp.* tensors, embedded v.* tensors.

Fixes applied:
  - attention.head_count_kv ARRAY -> UINT32 scalar (Ollama wrote per-layer
    [0,0,0,2,...] over 40 layers; upstream wants the max non-zero scalar).
  - rope.dimension_sections padded from [11,11,10] to [11,11,10,0]
    (4-element M-RoPE convention).
  - blk.N.ssm_dt -> blk.N.ssm_dt.bias rename for all 40 layers.
  - skip prefixes v.*, mm.*, mtp.* from the text loader.

Tightens detect_ollama_gemma3 so it only matches files whose
general.architecture is actually "gemma3" — without this, qwen3.5 was
triggering gemma3 clip translation on the auto-mmproj pass and crashing
on tensor-shape mismatches.

Go side: gates the auto-mmproj behavior on a small arch allowlist
(currently just gemma3). When a clip handler exists for an arch, add it
to the map; until then the model runs text-only.

Verified end-to-end via `ollama run qwen3.5:35b-a3b-q4_K_M` on the
existing library/qwen3.5 blob — answers "2 + 2 equals **4**" with the
text loader's compat fixups firing and no --mmproj flag.
---
 llama/compat/llama-ollama-compat.cpp | 102 +++++++++++++++++++++++++--
 llm/llama_server.go                  |  21 ++++--
 2 files changed, 113 insertions(+), 10 deletions(-)

diff --git a/llama/compat/llama-ollama-compat.cpp b/llama/compat/llama-ollama-compat.cpp
index 30f2eba3c..bda0c9b64 100644
--- a/llama/compat/llama-ollama-compat.cpp
+++ b/llama/compat/llama-ollama-compat.cpp
@@ -145,11 +145,21 @@ void promote_tensor_to_f32(ggml_context * ctx, const char * name) {
 // gemma3 (text side)
 // -------------------------------------------------------------------------
 
-// Returns true if this looks like an Ollama-format gemma3 blob. Different
-// Ollama converter versions produced different quirks (4B/12B/27B have
-// embedded vision + mm KVs; 1B uses non-standard rope key names; all of
-// them omit layer_norm_rms_epsilon). Any single marker trips detection.
+// Returns true if this looks like an Ollama-format gemma3 blob. Requires
+// the file to declare itself gemma3 (general.architecture must be exactly
+// "gemma3"), AND to exhibit at least one Ollama quirk. Different Ollama
+// converter versions produced different quirks (4B/12B/27B have embedded
+// vision + mm KVs; 1B uses non-standard rope key names; all of them omit
+// layer_norm_rms_epsilon).
 bool detect_ollama_gemma3(const gguf_context * meta, const ggml_context * ctx) {
+    // Claim #1: the file is gemma3.
+    const int64_t arch_kid = gguf_find_key(meta, "general.architecture");
+    if (arch_kid < 0) return false;
+    if (std::strcmp(gguf_get_val_str(meta, arch_kid), "gemma3") != 0) return false;
+
+    // Claim #2: at least one Ollama-ism. An upstream-converted gemma3 would
+    // have none of these (except possibly the v./mm. prefixes, which upstream
+    // never ships in the text file — they live in a separate mmproj).
     return has_key(meta, "gemma3.mm.tokens_per_image")
         || any_tensor_with_prefix(ctx, "v.")
         || any_tensor_with_prefix(ctx, "mm.")
@@ -213,6 +223,87 @@ void handle_gemma3(const llama_model_loader * ml, gguf_context * meta, ggml_cont
     // already have the +1 shift baked in, same as upstream's convert_hf.
 }
 
+// -------------------------------------------------------------------------
+// qwen35moe (text side)
+// -------------------------------------------------------------------------
+
+bool detect_ollama_qwen35moe(const gguf_context * meta, const ggml_context * ctx) {
+    // Metadata markers: KVs treated as Ollama-converter fingerprints, i.e.
+    // in-file vision config (upstream splits vision to mmproj) plus a few
+    // converter-specific hparams. Any single key is enough.
+    const bool ollama_kv =
+           has_key(meta, "qwen35moe.vision.block_count")
+        || has_key(meta, "qwen35moe.image_token_id")
+        || has_key(meta, "qwen35moe.ssm.v_head_reordered")
+        || has_key(meta, "qwen35moe.feed_forward_length") // upstream omits (=0 stored)
+        || has_key(meta, "qwen35moe.rope.mrope_interleaved");
+    if (ollama_kv) return true;
+
+    // Tensor markers: MTP tensors (upstream strips them) or embedded vision
+    // tensors. The array-typed head_count_kv is deliberately not a marker
+    // here; handle_qwen35moe normalizes it once any marker above fires.
+    return any_tensor_with_prefix(ctx, "mtp.") || any_tensor_with_prefix(ctx, "v.");
+}
+
+// Normalizes the metadata of an Ollama-format qwen35moe GGUF in place so the
+// upstream text loader accepts it. No-op unless detect_ollama_qwen35moe() fires.
+void handle_qwen35moe(const llama_model_loader * ml, gguf_context * meta, ggml_context * ctx) {
+    if (!detect_ollama_qwen35moe(meta, ctx)) return;
+
+    LLAMA_LOG_INFO("%s: detected Ollama-format qwen35moe GGUF; applying compatibility fixes\n", __func__);
+
+    // Fixes 1 and 2 reinterpret raw KV array data as 32-bit integers, so they
+    // only run on INT32/UINT32 arrays; any other layout is left untouched.
+    const auto is_i32_array = [meta](int64_t kid) {
+        if (kid < 0 || gguf_get_kv_type(meta, kid) != GGUF_TYPE_ARRAY) return false;
+        const auto elem = gguf_get_arr_type(meta, kid);
+        return elem == GGUF_TYPE_UINT32 || elem == GGUF_TYPE_INT32;
+    };
+
+    // 1. attention.head_count_kv — upstream expects UINT32; Ollama wrote
+    //    an array (one entry per layer, 0 for SSM layers, 2 for attention
+    //    layers). Collapse to the max non-zero value.
+    const int64_t kv_kid = gguf_find_key(meta, "qwen35moe.attention.head_count_kv");
+    if (is_i32_array(kv_kid)) {
+        const size_t n = gguf_get_arr_n(meta, kv_kid);
+        const auto * arr = static_cast<const uint32_t *>(gguf_get_arr_data(meta, kv_kid));
+        uint32_t max_kv = 0;
+        for (size_t i = 0; i < n; ++i) if (arr[i] > max_kv) max_kv = arr[i];
+        if (max_kv == 0) max_kv = 2; // safety fallback
+        gguf_remove_key(meta, "qwen35moe.attention.head_count_kv");
+        gguf_set_val_u32(meta, "qwen35moe.attention.head_count_kv", max_kv);
+    }
+
+    // 2. rope.dimension_sections — upstream expects a 4-element array
+    //    (M-RoPE convention); Ollama wrote 3 elements. Pad with a trailing 0.
+    const int64_t rope_kid = gguf_find_key(meta, "qwen35moe.rope.dimension_sections");
+    if (is_i32_array(rope_kid) && gguf_get_arr_n(meta, rope_kid) == 3) {
+        const auto * src = static_cast<const int32_t *>(gguf_get_arr_data(meta, rope_kid));
+        const int32_t padded[4] = { src[0], src[1], src[2], 0 };
+        gguf_set_arr_data(meta, "qwen35moe.rope.dimension_sections", GGUF_TYPE_INT32, padded, 4);
+    }
+
+    // 3. Tensor rename: Ollama's `blk.N.ssm_dt` corresponds to upstream's
+    //    `blk.N.ssm_dt.bias` (same shape, F32 [32]). Collect names, then rename.
+    std::vector<std::string> targets;
+    const int64_t n_tensors = gguf_get_n_tensors(meta);
+    static const char suffix[] = ".ssm_dt";
+    const size_t slen = sizeof(suffix) - 1;
+    for (int64_t i = 0; i < n_tensors; ++i) {
+        std::string name(gguf_get_tensor_name(meta, i));
+        if (name.size() >= slen && name.compare(name.size() - slen, slen, suffix) == 0) {
+            targets.push_back(std::move(name));
+        }
+    }
+    for (const auto & from : targets) rename_tensor(meta, ctx, from.c_str(), (from + ".bias").c_str());
+
+    // 4. Drop embedded vision + MTP + projector tensors from the text loader.
+    //    (vision goes to clip via --mmproj; MTP isn't used by upstream.)
+    add_skip_prefix(ml, "v.");
+    add_skip_prefix(ml, "mm.");
+    add_skip_prefix(ml, "mtp.");
+}
+
 // -------------------------------------------------------------------------
 // gemma3 (clip side)
 // -------------------------------------------------------------------------
@@ -281,7 +372,8 @@ void translate_metadata(const llama_model_loader * ml,
                         ggml_context * ctx,
                         std::string & arch_name) {
     if (!meta) return;
-    if (arch_name == "gemma3") handle_gemma3(ml, meta, ctx);
+    if (arch_name == "gemma3")    handle_gemma3(ml, meta, ctx);
+    if (arch_name == "qwen35moe") handle_qwen35moe(ml, meta, ctx);
     // Dispatch. Add more arches as they are wired up.
 }
 
diff --git a/llm/llama_server.go b/llm/llama_server.go
index 2f843ef1a..c9620d613 100644
--- a/llm/llama_server.go
+++ b/llm/llama_server.go
@@ -424,11 +424,22 @@ func NewLlamaServerRunner(
 	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
 
 	// Older Ollama-format GGUFs store vision tensors (v.*, mm.*) inline in
-	// the main model file rather than in a separate projector layer. Detect
-	// this case and point --mmproj at the model itself — the in-process
-	// llama.cpp compat shim translates the same file into both a text-only
-	// view and a clip-mmproj view. See llama/compat/ for details.
-	if len(projectors) == 0 && len(f.Tensors().Items("v.")) > 0 {
+	// the main model file rather than in a separate projector layer. When
+	// the arch has a llama/compat clip handler, we can point --mmproj at
+	// the same file and the in-process shim translates the two views.
+	//
+	// If we auto-enable --mmproj for an arch whose clip handler doesn't
+	// exist yet, upstream's clip loader sees un-translated Ollama tensors
+	// and aborts model load. So gate on an explicit allowlist that mirrors
+	// the compat layer's clip-side coverage in llama/compat/.
+	compatClipArches := map[string]bool{
+		"gemma3": true,
+		// Add entries as llama/compat grows clip handlers.
+		// "qwen35moe": true,
+	}
+	if len(projectors) == 0 &&
+		len(f.Tensors().Items("v.")) > 0 &&
+		compatClipArches[f.KV().Architecture()] {
 		projectors = []string{modelPath}
 	}