From 8fa6648650bf99511fdf99d69671ce4a43d564d2 Mon Sep 17 00:00:00 2001
From: jmorganca
Date: Sun, 19 Apr 2026 12:24:16 -0700
Subject: [PATCH] llama/compat: add qwen35moe text handler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Text-only support for Ollama's qwen3.5 (qwen35moe) blobs. Vision is more
involved (QKV merge, patch_embed reshape/split) and follows in a later
commit.

Detection markers: qwen35moe.vision.*, qwen35moe.image_token_id,
qwen35moe.ssm.v_head_reordered, qwen35moe.feed_forward_length,
qwen35moe.rope.mrope_interleaved, mtp.* tensors, embedded v.* tensors.

Fixes applied:
- attention.head_count_kv ARRAY -> UINT32 scalar (Ollama wrote per-layer
  [0,0,0,2,...] over 40 layers; upstream wants the max non-zero scalar).
- rope.dimension_sections padded from [11,11,10] to [11,11,10,0]
  (4-element M-RoPE convention).
- blk.N.ssm_dt -> blk.N.ssm_dt.bias rename for all 40 layers.
- skip prefixes v.*, mm.*, mtp.* from the text loader.

Tightens detect_ollama_gemma3 so it only matches files whose
general.architecture is actually "gemma3" — without this, qwen3.5 was
triggering gemma3 clip translation on the auto-mmproj pass and crashing
on tensor-shape mismatches.

Go side: gates the auto-mmproj behavior on a small arch allowlist
(currently just gemma3). When a clip handler exists for an arch, add it
to the map; until then the model runs text-only.

Verified end-to-end via `ollama run qwen3.5:35b-a3b-q4_K_M` on the
existing library/qwen3.5 blob — answers "2 + 2 equals **4**" with the
text loader's compat fixups firing and no --mmproj flag.
---
 llama/compat/llama-ollama-compat.cpp | 102 +++++++++++++++++++++++++--
 llm/llama_server.go                  |  21 ++++--
 2 files changed, 113 insertions(+), 10 deletions(-)

diff --git a/llama/compat/llama-ollama-compat.cpp b/llama/compat/llama-ollama-compat.cpp
index 30f2eba3c..bda0c9b64 100644
--- a/llama/compat/llama-ollama-compat.cpp
+++ b/llama/compat/llama-ollama-compat.cpp
@@ -145,11 +145,21 @@ void promote_tensor_to_f32(ggml_context * ctx, const char * name) {
 // gemma3 (text side)
 // -------------------------------------------------------------------------
 
-// Returns true if this looks like an Ollama-format gemma3 blob. Different
-// Ollama converter versions produced different quirks (4B/12B/27B have
-// embedded vision + mm KVs; 1B uses non-standard rope key names; all of
-// them omit layer_norm_rms_epsilon). Any single marker trips detection.
+// Returns true if this looks like an Ollama-format gemma3 blob. Requires
+// the file to declare itself gemma3 via general.architecture AND to
+// exhibit at least one Ollama quirk. Different Ollama converter versions
+// produced different quirks (4B/12B/27B have embedded vision + mm KVs;
+// 1B uses non-standard rope key names; all of them omit
+// layer_norm_rms_epsilon).
 bool detect_ollama_gemma3(const gguf_context * meta, const ggml_context * ctx) {
+    // Claim #1: the file is gemma3.
+    const int64_t arch_kid = gguf_find_key(meta, "general.architecture");
+    if (arch_kid < 0) return false;
+    if (std::strcmp(gguf_get_val_str(meta, arch_kid), "gemma3") != 0) return false;
+
+    // Claim #2: at least one Ollama-ism. An upstream-converted gemma3 would
+    // have none of these (except possibly the v./mm. prefixes, which upstream
+    // never ships in the text file — they live in a separate mmproj).
     return has_key(meta, "gemma3.mm.tokens_per_image")
         || any_tensor_with_prefix(ctx, "v.")
         || any_tensor_with_prefix(ctx, "mm.")
@@ -213,6 +223,87 @@ void handle_gemma3(const llama_model_loader * ml, gguf_context * meta, ggml_cont
     // already have the +1 shift baked in, same as upstream's convert_hf.
 }
 
+// -------------------------------------------------------------------------
+// qwen35moe (text side)
+// -------------------------------------------------------------------------
+
+bool detect_ollama_qwen35moe(const gguf_context * meta, const ggml_context * ctx) {
+    // Strongest markers: vision KVs live in-file (upstream splits to mmproj)
+    // or MTP tensors are present (upstream strips them).
+    if (has_key(meta, "qwen35moe.vision.block_count"))     return true;
+    if (has_key(meta, "qwen35moe.image_token_id"))         return true;
+    if (has_key(meta, "qwen35moe.ssm.v_head_reordered"))   return true;
+    if (has_key(meta, "qwen35moe.feed_forward_length"))    return true; // upstream omits (=0 stored)
+    if (has_key(meta, "qwen35moe.rope.mrope_interleaved")) return true;
+    if (any_tensor_with_prefix(ctx, "mtp."))               return true;
+    if (any_tensor_with_prefix(ctx, "v."))                 return true;
+
+    // Scalar-vs-array: upstream writes head_count_kv as UINT32; Ollama wrote
+    // it as a per-layer array. That difference is not used as a detection
+    // marker here; handle_qwen35moe normalizes it only after one of the
+    // markers above has fired.
+    return false;
+}
+
+void handle_qwen35moe(const llama_model_loader * ml, gguf_context * meta, ggml_context * ctx) {
+    if (!detect_ollama_qwen35moe(meta, ctx)) return;
+
+    LLAMA_LOG_INFO("%s: detected Ollama-format qwen35moe GGUF; applying compatibility fixes\n", __func__);
+
+    // 1. attention.head_count_kv — upstream expects UINT32; Ollama wrote
+    //    an array (one entry per layer, 0 for SSM layers, 2 for attention
+    //    layers). Collapse to the max non-zero value.
+    {
+        const int64_t kid = gguf_find_key(meta, "qwen35moe.attention.head_count_kv");
+        if (kid >= 0 && gguf_get_kv_type(meta, kid) == GGUF_TYPE_ARRAY) {
+            const size_t n = gguf_get_arr_n(meta, kid);
+            const auto * arr = static_cast<const uint32_t *>(gguf_get_arr_data(meta, kid));
+            uint32_t max_kv = 0;
+            for (size_t i = 0; i < n; ++i) if (arr[i] > max_kv) max_kv = arr[i];
+            if (max_kv == 0) max_kv = 2; // safety fallback
+            gguf_remove_key(meta, "qwen35moe.attention.head_count_kv");
+            gguf_set_val_u32(meta, "qwen35moe.attention.head_count_kv", max_kv);
+        }
+    }
+
+    // 2. rope.dimension_sections — upstream expects a 4-element array
+    //    (M-RoPE convention); Ollama wrote 3 elements. Pad with a trailing 0.
+    {
+        const int64_t kid = gguf_find_key(meta, "qwen35moe.rope.dimension_sections");
+        if (kid >= 0 && gguf_get_arr_n(meta, kid) == 3) {
+            const auto * src = static_cast<const int32_t *>(gguf_get_arr_data(meta, kid));
+            const int32_t padded[4] = { src[0], src[1], src[2], 0 };
+            gguf_set_arr_data(meta, "qwen35moe.rope.dimension_sections",
+                              GGUF_TYPE_INT32, padded, 4);
+        }
+    }
+
+    // 3. Tensor rename: Ollama's `blk.N.ssm_dt` corresponds to upstream's
+    //    `blk.N.ssm_dt.bias` (same shape, F32 [32]). 40 layers.
+    {
+        std::vector<std::string> targets;
+        const int64_t n = gguf_get_n_tensors(meta);
+        static const char suffix[] = ".ssm_dt";
+        const size_t slen = sizeof(suffix) - 1;
+        for (int64_t i = 0; i < n; ++i) {
+            std::string name(gguf_get_tensor_name(meta, i));
+            if (name.size() >= slen
+                && name.compare(name.size() - slen, slen, suffix) == 0) {
+                targets.push_back(std::move(name));
+            }
+        }
+        for (const auto & from : targets) {
+            rename_tensor(meta, ctx, from.c_str(), (from + ".bias").c_str());
+        }
+    }
+
+    // 4. Drop embedded vision + MTP + projector tensors from the text loader.
+    //    (vision goes to clip via --mmproj; MTP isn't used by upstream.)
+    add_skip_prefix(ml, "v.");
+    add_skip_prefix(ml, "mm.");
+    add_skip_prefix(ml, "mtp.");
+}
+
 // -------------------------------------------------------------------------
 // gemma3 (clip side)
 // -------------------------------------------------------------------------
@@ -281,7 +372,8 @@ void translate_metadata(const llama_model_loader * ml,
                         ggml_context * ctx,
                         std::string & arch_name) {
     if (!meta) return;
-    if (arch_name == "gemma3") handle_gemma3(ml, meta, ctx);
+    if (arch_name == "gemma3")    handle_gemma3(ml, meta, ctx);
+    if (arch_name == "qwen35moe") handle_qwen35moe(ml, meta, ctx);
     // Dispatch. Add more arches as they are wired up.
 }
 
diff --git a/llm/llama_server.go b/llm/llama_server.go
index 2f843ef1a..c9620d613 100644
--- a/llm/llama_server.go
+++ b/llm/llama_server.go
@@ -424,11 +424,22 @@ func NewLlamaServerRunner(
 	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
 
 	// Older Ollama-format GGUFs store vision tensors (v.*, mm.*) inline in
-	// the main model file rather than in a separate projector layer. Detect
-	// this case and point --mmproj at the model itself — the in-process
-	// llama.cpp compat shim translates the same file into both a text-only
-	// view and a clip-mmproj view. See llama/compat/ for details.
-	if len(projectors) == 0 && len(f.Tensors().Items("v.")) > 0 {
+	// the main model file rather than in a separate projector layer. When
+	// the arch has a llama/compat clip handler, we can point --mmproj at
+	// the same file and the in-process shim translates the two views.
+	//
+	// If we auto-enable --mmproj for an arch whose clip handler doesn't
+	// exist yet, upstream's clip loader sees un-translated Ollama tensors
+	// and aborts model load. So gate on an explicit allowlist that mirrors
+	// the compat layer's clip-side coverage in llama/compat/.
+	compatClipArches := map[string]bool{
+		"gemma3": true,
+		// Add entries as llama/compat grows clip handlers.
+		// "qwen35moe": true,
+	}
+	if len(projectors) == 0 &&
+		len(f.Tensors().Items("v.")) > 0 &&
+		compatClipArches[f.KV().Architecture()] {
 		projectors = []string{modelPath}
 	}
 