Fix inaccurate Ollama context length for qwen2.5 models (#20933)
Since Ollama/llama.cpp do not currently support YaRN for context length extension, the context length is limited to `32768`. This can be confirmed on the Ollama model card. See the corresponding issue in the Ollama repo: https://github.com/ollama/ollama/issues/6865

Co-authored-by: Patrick Samson <1416027+patricksamson@users.noreply.github.com>
parent d5f2bca382
commit b4659bb44e
1 changed file with 3 additions and 2 deletions
@@ -81,9 +81,10 @@ fn get_max_tokens(name: &str) -> usize {
         "llama2" | "yi" | "vicuna" | "stablelm2" => 4096,
         "llama3" | "gemma2" | "gemma" | "codegemma" | "starcoder" | "aya" => 8192,
         "codellama" | "starcoder2" => 16384,
-        "mistral" | "codestral" | "mixstral" | "llava" | "qwen2" | "dolphin-mixtral" => 32768,
+        "mistral" | "codestral" | "mixstral" | "llava" | "qwen2" | "qwen2.5-coder"
+            | "dolphin-mixtral" => 32768,
         "llama3.1" | "phi3" | "phi3.5" | "command-r" | "deepseek-coder-v2" | "yi-coder"
-            | "llama3.2" | "qwen2.5-coder" => 128000,
+            | "llama3.2" => 128000,
         _ => DEFAULT_TOKENS,
     }
     .clamp(1, MAXIMUM_TOKENS)
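For readability, here is a minimal, self-contained sketch of what `get_max_tokens` looks like after this change. The values of `DEFAULT_TOKENS` and `MAXIMUM_TOKENS`, and the exact match scrutinee, are not visible in this hunk, so placeholders are assumed; treat this as an illustration of the new arm layout, not the exact source.

// Sketch of the post-commit function. DEFAULT_TOKENS and MAXIMUM_TOKENS
// are placeholder values; the real constants are defined elsewhere in the
// crate and are not shown in this diff.
const DEFAULT_TOKENS: usize = 2048; // assumed placeholder
const MAXIMUM_TOKENS: usize = 64000; // assumed placeholder

fn get_max_tokens(name: &str) -> usize {
    match name {
        "llama2" | "yi" | "vicuna" | "stablelm2" => 4096,
        "llama3" | "gemma2" | "gemma" | "codegemma" | "starcoder" | "aya" => 8192,
        "codellama" | "starcoder2" => 16384,
        // qwen2.5-coder moves here: without YaRN support in Ollama/llama.cpp,
        // its usable context length is capped at 32768.
        "mistral" | "codestral" | "mixstral" | "llava" | "qwen2" | "qwen2.5-coder"
        | "dolphin-mixtral" => 32768,
        "llama3.1" | "phi3" | "phi3.5" | "command-r" | "deepseek-coder-v2" | "yi-coder"
        | "llama3.2" => 128000,
        _ => DEFAULT_TOKENS,
    }
    .clamp(1, MAXIMUM_TOKENS)
}

fn main() {
    // After this commit, qwen2.5-coder reports 32768 rather than 128000.
    assert_eq!(get_max_tokens("qwen2.5-coder"), 32768);
    assert_eq!(get_max_tokens("llama3.2"), 128000);
}

Keeping `qwen2.5-coder` on the same line as `qwen2` makes the 32768 grouping explicit, so any future YaRN support only needs to move the pattern back to the 128000 arm.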