From b49eaea3f5dd26507605dcab6253bd91d0ec2cc6 Mon Sep 17 00:00:00 2001 From: Randy420Marsh <46230578+Randy420Marsh@users.noreply.github.com> Date: Sat, 25 Apr 2026 09:43:32 +0300 Subject: [PATCH 1/4] Initial commit --- .github/workflows/build-llama-cpp-wheel.yml | 99 ++++++ README.md | 2 + build.txt | 1 + llama_cpp/llama.py | 5 + llama_cpp/llama_chat_format.py | 370 +++++++++++++++++++- 5 files changed, 464 insertions(+), 13 deletions(-) create mode 100644 .github/workflows/build-llama-cpp-wheel.yml create mode 100644 build.txt diff --git a/.github/workflows/build-llama-cpp-wheel.yml b/.github/workflows/build-llama-cpp-wheel.yml new file mode 100644 index 000000000..148f9a29b --- /dev/null +++ b/.github/workflows/build-llama-cpp-wheel.yml @@ -0,0 +1,99 @@ +name: Build llama-cpp-python Wheel (CUDA 12.8) + +on: + push: + tags: + - 'v*' + workflow_dispatch: + +jobs: + build: + runs-on: ubuntu-latest + env: + FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true + container: + image: nvidia/cuda:12.8.0-devel-ubuntu22.04 + + steps: + - name: Install Git (required for submodules) + run: | + apt-get update + apt-get install -y git + + - name: Checkout repository + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Install Python 3.11 + build tools + env: + DEBIAN_FRONTEND: noninteractive + TZ: Etc/UTC + run: | + apt-get update + apt-get install -y software-properties-common tzdata + add-apt-repository -y ppa:deadsnakes/ppa + apt-get update + apt-get install -y \ + python3.11 \ + python3.11-venv \ + python3.11-dev \ + git \ + cmake \ + build-essential \ + pkg-config \ + libopenblas-dev + python3.11 -m ensurepip --upgrade + python3.11 -m pip install --upgrade pip wheel setuptools + + - name: Use Python 3.11 + run: | + update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 + update-alternatives --set python /usr/bin/python3.11 + python --version + + - name: Set CUDA environment + run: | + echo "CUDA_HOME=/usr/local/cuda" >> $GITHUB_ENV + echo "PATH=/usr/local/cuda/bin:$PATH" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> $GITHUB_ENV + + - name: Verify CUDA installation + run: | + set -e + ls -la /usr/local/cuda* || (echo "ERROR: CUDA not found!" && exit 1) + which nvcc || (echo "ERROR: nvcc not found!" && exit 1) + nvcc --version + echo "CUDA verified successfully at: $(which nvcc)" + + - name: Build wheel with all architectures + env: + CMAKE_ARGS: > + -DGGML_CUDA=ON + -DCMAKE_CUDA_ARCHITECTURES="75;80;86;89;90" + -DGGML_NATIVE=OFF + -DLLAMA_BUILD_EXAMPLES=OFF + -DLLAMA_BUILD_TESTS=OFF + -DLLAMA_BUILD_SERVER=OFF + FORCE_CMAKE: 1 + run: | + python -m pip wheel . --no-deps -w dist -v + + - name: Upload wheel as artifact + if: success() + uses: actions/upload-artifact@v4 + with: + name: llama-cpp-python-cuda12.8-all-arch + path: dist/*.whl + retention-days: 30 + + - name: Create Release (on tag) + if: startsWith(github.ref, 'refs/tags/') + uses: softprops/action-gh-release@v2 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + files: dist/*.whl + draft: false + prerelease: false + generate_release_notes: true diff --git a/README.md b/README.md index 69a0f8234..48ad27750 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@

+## Unofficial custom fork meant for https://github.com/Randy420Marsh/VisionLLMCaptioner and gemma 4 + # Python Bindings for [`llama.cpp`](https://github.com/ggerganov/llama.cpp) [![Documentation Status](https://readthedocs.org/projects/llama-cpp-python/badge/?version=latest)](https://llama-cpp-python.readthedocs.io/en/latest/?badge=latest) diff --git a/build.txt b/build.txt new file mode 100644 index 000000000..615ab0708 --- /dev/null +++ b/build.txt @@ -0,0 +1 @@ +export FORCE_CMAKE=1 && CMAKE_ARGS="-DGGML_CUDA=on" pip install -e . diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 11fe169cf..373d115bf 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1973,6 +1973,8 @@ def create_chat_completion( logit_bias: Optional[Dict[int, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + reasoning_budget: Optional[int] = None, + enable_thinking: bool = False, # <--- FIXED SYNTAX ERROR HERE ) -> Union[ CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] ]: @@ -2005,6 +2007,7 @@ def create_chat_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use. logit_bias: A logit bias to use. + reasoning_budget: The reasoning budget for thinking mode (Gemma 4). Controls the maximum number of tokens for thinking/reasoning. Returns: Generated chat completion or a stream of chat completion chunks. @@ -2044,6 +2047,8 @@ def create_chat_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + reasoning_budget=reasoning_budget, + enable_thinking=enable_thinking, # <--- PASSED TO HANDLER HERE ) def create_chat_completion_openai_v1( diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 1024fb85b..4fdaac909 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1,3 +1,4 @@ +# Drop-in replacement / standalone Gemma 4 chat handler for llama-cpp-python (April 2026) from __future__ import annotations import os @@ -607,6 +608,7 @@ def chat_completion_handler( logit_bias: Optional[Dict[str, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + reasoning_budget: Optional[int] = None, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -618,6 +620,7 @@ def chat_completion_handler( function_call=function_call, tools=tools, tool_choice=tool_choice, + reasoning_budget=reasoning_budget, **kwargs, ) prompt = llama.tokenize( @@ -1398,11 +1401,12 @@ def format_saiga( return ChatFormatterResponse(prompt=_prompt.strip()) -# Chat format for Google's Gemma models, see more details and available models: +# Chat format for Google's Gemma models (Gemma 2 and Gemma 3), see more details and available models: # https://huggingface.co/collections/google/gemma-release-65d5efbccdbb8c4202ec078b @register_chat_format("gemma") def format_gemma( messages: List[llama_types.ChatCompletionRequestMessage], + reasoning_budget: Optional[int] = None, **kwargs: Any, ) -> ChatFormatterResponse: system_message = _get_system_message(messages) @@ -1418,6 +1422,80 @@ def format_gemma( return ChatFormatterResponse(prompt=_prompt, stop=_sep) +# Chat format for Google's Gemma 4 models, see more details: +# https://huggingface.co/google/gemma-4-E2B-it +# https://ai.google.dev/gemma/docs/core/prompt-structure +# Gemma 4 introduces new special tokens and native system role support +@register_chat_format("gemma4") +def format_gemma4( + messages: List[llama_types.ChatCompletionRequestMessage], + reasoning_budget: Optional[int] = None, + **kwargs: Any, +) -> ChatFormatterResponse: + """Format messages for Gemma 4 models using the new <|turn> and tokens. + + Gemma 4 introduces: + - Native system role support via <|channel>thought\n ... \n + - New turn-based tokens: <|turn>, , <|channel>, + - Thinking mode support via <|think|> token + - Tool calling support via <|tool_call>, , etc. + + This is a simplified formatter that handles basic text-only conversations. + For full multimodal and tool calling support, use the Gemma4ChatHandler class. + + Special tokens: + - : Beginning of sequence + - <|turn>: Start of turn + - : End of turn + - <|channel>: Start of channel + - : End of channel + - <|think|>: Thinking mode indicator + - <|tool_call>: Start of tool call + - : End of tool call + + Args: + messages: List of chat completion messages + reasoning_budget: Maximum number of tokens for thinking/reasoning (Gemma 4 feature) + **kwargs: Additional keyword arguments + """ + _bos_token = "" + _turn_start = "<|turn>" + _turn_end = "\n" + _channel_start = "<|channel>" + _channel_end = "\n" + + _prompt = _bos_token + + # Check for system message - in Gemma 4, system messages go in a thought channel + system_message = _get_system_message(messages) + if system_message: + _prompt += f"{_channel_start}thought\n{system_message}{_channel_end}" + + # Format conversation turns + for message in messages: + role = message["role"] + content = message.get("content", "") + + # Skip system messages as they're handled separately + if role == "system": + continue + + # Map role to Gemma 4 role names + if role == "assistant": + gemma_role = "model" + else: + gemma_role = role + + _prompt += f"{_turn_start}{gemma_role}\n{content}{_turn_end}" + + # Add generation prompt + _prompt += f"{_turn_start}model\n" + + return ChatFormatterResponse( + prompt=_prompt, stop=[_turn_end, "", ""] + ) + + # Tricky chat formats that require custom chat handlers @@ -1575,9 +1653,9 @@ def prepare_messages_for_inference( message["name"] = f"functions.{message['name']}" # Function call requests by assistant if "function_call" in message: - message["function_call"]["name"] = ( - f"functions.{message['function_call']['name']}" - ) + message["function_call"][ + "name" + ] = f"functions.{message['function_call']['name']}" all_messages.append(message) all_messages.append( @@ -1816,9 +1894,9 @@ def functionary_v1_v2_chat_handler( SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary""" tokenizer = llama.tokenizer_ - assert hasattr(tokenizer, "hf_tokenizer"), ( - "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class" - ) + assert hasattr( + tokenizer, "hf_tokenizer" + ), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class" from transformers import AutoTokenizer if "<|START_OF_FUNCTION_CALL|>" in tokenizer.hf_tokenizer.additional_special_tokens: @@ -1968,9 +2046,9 @@ def prepare_messages_for_inference( message["name"] = f"functions.{message['name']}" # Function call requests by assistant if "function_call" in message: - message["function_call"]["name"] = ( - f"functions.{message['function_call']['name']}" - ) + message["function_call"][ + "name" + ] = f"functions.{message['function_call']['name']}" all_messages.append(message) if version == "v1": @@ -3229,6 +3307,64 @@ def from_pretrained( ) +class GemmaChatHandler(Llava15ChatHandler): + """Chat handler for Gemma-based multimodal models (e.g., PaliGemma, MedGemma). + + Gemma models use / control tokens instead of + the LLaVA-style USER:/ASSISTANT: format. The text-only 'gemma' chat format + is already registered (see format_gemma), but multimodal Gemma models that + require a Llava-style vision pipeline need a dedicated handler so the + correct chat template is applied when chat_handler takes precedence over + chat_format in the resolution order. + + See: https://ai.google.dev/gemma/docs/formatting + """ + + DEFAULT_SYSTEM_MESSAGE = None # Gemma models do not natively support a system role + + CHAT_FORMAT = ( + "{% for message in messages %}" + # System messages are folded into a user turn (Gemma has no system role) + "{% if message.role == 'system' %}" + "user\n{{ message.content }}\n" + "{% endif %}" + # User message (handles both plain string and multimodal content list) + "{% if message.role == 'user' %}" + "user\n" + "{% if message.content is string %}" + "{{ message.content }}" + "{% endif %}" + "{% if message.content is iterable and message.content is not string %}" + # Emit image tokens first + "{% for content in message.content %}" + "{% if content.type == 'image_url' and content.image_url is string %}" + "{{ content.image_url }}" + "{% endif %}" + "{% if content.type == 'image_url' and content.image_url is mapping %}" + "{{ content.image_url.url }}" + "{% endif %}" + "{% endfor %}" + # Then emit text tokens + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + "\n" + "{% endif %}" + # Assistant message + "{% if message.role == 'assistant' and message.content is not none %}" + "model\n{{ message.content }}\n" + "{% endif %}" + "{% endfor %}" + # Generation prompt + "{% if add_generation_prompt %}" + "model\n" + "{% endif %}" + ) + + class ObsidianChatHandler(Llava15ChatHandler): # Prompt Format # The model followed ChatML format. However, with ### as the seperator @@ -3581,6 +3717,216 @@ def __call__(self, **kwargs): return super().__call__(**kwargs) +class MultimodalGemmaChatHandler(Llava15ChatHandler): + DEFAULT_SYSTEM_MESSAGE: Optional[str] = None + + CHAT_FORMAT = ( + "{% for message in messages %}" + "{% if message.role == 'user' %}" + "user\n" + "{% if message.content is string %}" + "{{ message.content }}" + "{% endif %}" + "{% if message.content is iterable %}" + "{% for content in message.content %}" + "{% if content.type == 'image_url' and content.image_url is string %}" + "{{ content.image_url }}" + "{% endif %}" + "{% if content.type == 'image_url' and content.image_url is mapping %}" + "{{ content.image_url.url }}" + "{% endif %}" + "{% endfor %}" + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + "\n" + "{% endif %}" + "{% if message.role == 'assistant' and message.content is not none %}" + "model\n" + "{{ message.content }}\n" + "{% endif %}" + "{% endfor %}" + "{% if add_generation_prompt %}" + "model\n" + "{% endif %}" + ) + + +# ============================================================ +# GEMMA 4 CHAT HANDLER - FULLY CORRECTED & POLISHED +# ============================================================ + +class Gemma4ChatHandler(Llava15ChatHandler): + """Chat handler for Gemma 4 models with full multimodal and tool calling support. + + Gemma 4 introduces new special tokens and native system role support: + - <|turn>: Start of turn + - : End of turn + - <|channel>: Start of channel (for system/thought messages) + - : End of channel + - <|think|>: Thinking mode indicator + - <|tool_call>: Start of tool call + - : End of tool call + - <|tool_response|>: Tool response marker + """ + + DEFAULT_SYSTEM_MESSAGE: Optional[str] = None + + CHAT_FORMAT = ( + "{% for message in messages %}" + # 1. System messages go in a thought channel + "{% if message.role == 'system' %}" + "<|channel>thought\n{{ message.content }}\n" + "{% endif %}" + + # 2. User message (handles both plain string and multimodal media) + "{% if message.role == 'user' %}" + "<|turn>user\n" + "{% if message.content is string %}" + "{{ message.content }}" + "{% endif %}" + "{% if message.content is iterable and message.content is not string %}" + # Emit Media Embeddings (Images AND Audio) + "{% for content in message.content %}" + "{% if content.type == 'image_url' %}" + "{% if content.image_url is string %}{{ content.image_url }}{% else %}{{ content.image_url.url }}{% endif %}" + "{% elif content.type == 'input_audio' %}" + "data:audio/{{ content.input_audio.format }};base64,{{ content.input_audio.data }}" + "{% elif content.type == 'audio' %}" + "data:audio/{{ content.audio.format }};base64,{{ content.audio.data }}" + "{% endif %}" + "{% endfor %}" + # Then emit text tokens + "{% for content in message.content %}" + "{% if content.type == 'text' %}{{ content.text }}{% endif %}" + "{% endfor %}" + "{% endif %}" + "\n" + "{% endif %}" + + # 3. Assistant message + "{% if message.role == 'assistant' and message.content is not none %}" + "<|turn>model\n{{ message.content }}\n" + "{% endif %}" + + # 4. Tool Calls (Agentic Workflow Handshakes) + "{% if message.role == 'assistant' and message.tool_calls %}" + "<|turn>model\n" + "{% for tool_call in message.tool_calls %}" + "<|tool_call>call:{{ tool_call.function.name }}{{ tool_call.function.arguments }}\n" + "{% endfor %}" + "\n" + "{% endif %}" + + # 5. Tool Responses + "{% if message.role == 'tool' %}" + "<|tool_response>response:{{ message.name }}{{ message.content }}\n" + "{% endif %}" + "{% endfor %}" + + # 6. Generation prompt + "{% if add_generation_prompt %}" + "<|turn>model\n" + "{% endif %}" + ) + + @staticmethod + def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]) -> List[str]: + """ + Overrides the base Llava15ChatHandler method. + Extracts both image URLs and audio base64 data strings so they can be processed + and replaced by the mtmd C++ media marker embeddings in the backend. + """ + media_urls: List[str] = [] + for message in messages: + if message["role"] == "user" and message.get("content"): + for content in message["content"]: + if isinstance(content, dict) and "type" in content: + + # Extract Vision + if content["type"] == "image_url": + if isinstance(content["image_url"], dict) and "url" in content["image_url"]: + media_urls.append(content["image_url"]["url"]) + else: + media_urls.append(content["image_url"]) + + # Extract Audio (Supports OpenAI's 'input_audio' AND custom 'audio' schemas) + elif content["type"] in ["input_audio", "audio"]: + audio_data = content.get("input_audio") or content.get("audio") + if audio_data: + fmt = audio_data.get("format", "wav") + data = audio_data.get("data", "") + # Standardize the output so `load_image` successfully base64-decodes the bytes + media_urls.append(f"data:audio/{fmt};base64,{data}") + + return media_urls + + def __call__(self, **kwargs): + """ + Overrides the __call__ pipeline to dynamically intercept and enable Thinking Mode + by injecting the required control token seamlessly into the Jinja template. + Also performs state clearing for reliable multimodal (vision + audio) support + across multiple chat turns, matching other vision handlers like Qwen25VL. + """ + enable_thinking = kwargs.get("enable_thinking", False) + original_format = self.CHAT_FORMAT + + if enable_thinking: + # Inject <|think|> into BOTH the initial system thought channel AND + # the assistant generation prompt so thinking starts the response turn. + # This follows Gemma 4 docs for triggering native thinking mode. + modified_format = original_format.replace( + "<|channel>thought\n", + "<|channel>thought\n<|think|>\n" + ).replace( + "{% if add_generation_prompt %}\n<|turn>model\n{% endif %}", + "{% if add_generation_prompt %}\n<|turn>model\n<|think|>\n{% endif %}" + ) + self.CHAT_FORMAT = modified_format + + # Gemma requires a system block for the thought channel to exist. + # If the user hasn't provided one, we dynamically append a blank one. + messages = kwargs.get("messages", []) + if not any(m.get("role") == "system" for m in messages): + kwargs["messages"] = [{"role": "system", "content": ""}] + messages + + # Clear state for multiple runs (critical for vision/audio + thinking in chat) + llama = kwargs.get("llama") + if llama is not None: + llama.reset() + if hasattr(llama, "_ctx") and llama._ctx is not None: + llama._ctx.kv_cache_clear() + llama.n_tokens = 0 + if hasattr(llama, "input_ids"): + llama.input_ids.fill(0) + + # Clear any handler state (e.g. cached embeds from previous multimodal turn) + if hasattr(self, "_last_image_embed"): + self._last_image_embed = None + self._last_image_hash = None + + try: + result = super().__call__(**kwargs) + # Post-process non-streaming responses when thinking mode is enabled + # to provide clear structure: 'thinking' field (contains reasoning) + 'content' (final answer). + # Note: Since Gemma 4 outputs thinking + final answer in a single generation, + # 'thinking' currently holds the full generated text (including reasoning). + # Future: parse on model-specific end-of-thinking markers (e.g. <|end_think|>) if emitted. + if enable_thinking and not kwargs.get("stream", False) and isinstance(result, dict): + for choice in result.get("choices", []): + if "message" in choice: + content = choice["message"].get("content", "") or "" + choice["message"]["thinking"] = content # structured access for test app + # content remains the complete response (thinking + final answer) for compatibility + return result + finally: + # Restore the original class format so future non-thinking calls don't leak state + self.CHAT_FORMAT = original_format + + @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( llama: llama.Llama, @@ -3698,9 +4044,7 @@ def chatml_function_calling( stop = ( [stop, "<|im_end|>"] if isinstance(stop, str) - else stop + ["<|im_end|>"] - if stop - else ["<|im_end|>"] + else stop + ["<|im_end|>"] if stop else ["<|im_end|>"] ) # Case 1: No tool choice by user From a44f4c7b705ba1b487d2ed1902c6632f9c3721ef Mon Sep 17 00:00:00 2001 From: Randy420Marsh <46230578+Randy420Marsh@users.noreply.github.com> Date: Mon, 27 Apr 2026 06:44:56 +0300 Subject: [PATCH 2/4] fix formatting --- llama_cpp/llama.py | 4 +- llama_cpp/llama_chat_format.py | 73 +++++++++++++++++++--------------- 2 files changed, 43 insertions(+), 34 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 373d115bf..377643209 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1974,7 +1974,7 @@ def create_chat_completion( logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, reasoning_budget: Optional[int] = None, - enable_thinking: bool = False, # <--- FIXED SYNTAX ERROR HERE + enable_thinking: bool = False, # <--- FIXED SYNTAX ERROR HERE ) -> Union[ CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] ]: @@ -2048,7 +2048,7 @@ def create_chat_completion( grammar=grammar, logit_bias=logit_bias, reasoning_budget=reasoning_budget, - enable_thinking=enable_thinking, # <--- PASSED TO HANDLER HERE + enable_thinking=enable_thinking, # <--- PASSED TO HANDLER HERE ) def create_chat_completion_openai_v1( diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 4fdaac909..6708fcae7 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1653,9 +1653,9 @@ def prepare_messages_for_inference( message["name"] = f"functions.{message['name']}" # Function call requests by assistant if "function_call" in message: - message["function_call"][ - "name" - ] = f"functions.{message['function_call']['name']}" + message["function_call"]["name"] = ( + f"functions.{message['function_call']['name']}" + ) all_messages.append(message) all_messages.append( @@ -1894,9 +1894,9 @@ def functionary_v1_v2_chat_handler( SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary""" tokenizer = llama.tokenizer_ - assert hasattr( - tokenizer, "hf_tokenizer" - ), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class" + assert hasattr(tokenizer, "hf_tokenizer"), ( + "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class" + ) from transformers import AutoTokenizer if "<|START_OF_FUNCTION_CALL|>" in tokenizer.hf_tokenizer.additional_special_tokens: @@ -2046,9 +2046,9 @@ def prepare_messages_for_inference( message["name"] = f"functions.{message['name']}" # Function call requests by assistant if "function_call" in message: - message["function_call"][ - "name" - ] = f"functions.{message['function_call']['name']}" + message["function_call"]["name"] = ( + f"functions.{message['function_call']['name']}" + ) all_messages.append(message) if version == "v1": @@ -3759,6 +3759,7 @@ class MultimodalGemmaChatHandler(Llava15ChatHandler): # GEMMA 4 CHAT HANDLER - FULLY CORRECTED & POLISHED # ============================================================ + class Gemma4ChatHandler(Llava15ChatHandler): """Chat handler for Gemma 4 models with full multimodal and tool calling support. @@ -3781,7 +3782,6 @@ class Gemma4ChatHandler(Llava15ChatHandler): "{% if message.role == 'system' %}" "<|channel>thought\n{{ message.content }}\n" "{% endif %}" - # 2. User message (handles both plain string and multimodal media) "{% if message.role == 'user' %}" "<|turn>user\n" @@ -3806,12 +3806,10 @@ class Gemma4ChatHandler(Llava15ChatHandler): "{% endif %}" "\n" "{% endif %}" - # 3. Assistant message "{% if message.role == 'assistant' and message.content is not none %}" "<|turn>model\n{{ message.content }}\n" "{% endif %}" - # 4. Tool Calls (Agentic Workflow Handshakes) "{% if message.role == 'assistant' and message.tool_calls %}" "<|turn>model\n" @@ -3820,13 +3818,11 @@ class Gemma4ChatHandler(Llava15ChatHandler): "{% endfor %}" "\n" "{% endif %}" - # 5. Tool Responses "{% if message.role == 'tool' %}" "<|tool_response>response:{{ message.name }}{{ message.content }}\n" "{% endif %}" "{% endfor %}" - # 6. Generation prompt "{% if add_generation_prompt %}" "<|turn>model\n" @@ -3834,10 +3830,12 @@ class Gemma4ChatHandler(Llava15ChatHandler): ) @staticmethod - def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]) -> List[str]: + def get_image_urls( + messages: List[llama_types.ChatCompletionRequestMessage], + ) -> List[str]: """ Overrides the base Llava15ChatHandler method. - Extracts both image URLs and audio base64 data strings so they can be processed + Extracts both image URLs and audio base64 data strings so they can be processed and replaced by the mtmd C++ media marker embeddings in the backend. """ media_urls: List[str] = [] @@ -3845,49 +3843,52 @@ def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]) -> if message["role"] == "user" and message.get("content"): for content in message["content"]: if isinstance(content, dict) and "type" in content: - # Extract Vision if content["type"] == "image_url": - if isinstance(content["image_url"], dict) and "url" in content["image_url"]: + if ( + isinstance(content["image_url"], dict) + and "url" in content["image_url"] + ): media_urls.append(content["image_url"]["url"]) else: media_urls.append(content["image_url"]) - + # Extract Audio (Supports OpenAI's 'input_audio' AND custom 'audio' schemas) elif content["type"] in ["input_audio", "audio"]: - audio_data = content.get("input_audio") or content.get("audio") + audio_data = content.get("input_audio") or content.get( + "audio" + ) if audio_data: fmt = audio_data.get("format", "wav") data = audio_data.get("data", "") # Standardize the output so `load_image` successfully base64-decodes the bytes media_urls.append(f"data:audio/{fmt};base64,{data}") - + return media_urls def __call__(self, **kwargs): """ - Overrides the __call__ pipeline to dynamically intercept and enable Thinking Mode + Overrides the __call__ pipeline to dynamically intercept and enable Thinking Mode by injecting the required control token seamlessly into the Jinja template. Also performs state clearing for reliable multimodal (vision + audio) support across multiple chat turns, matching other vision handlers like Qwen25VL. """ enable_thinking = kwargs.get("enable_thinking", False) original_format = self.CHAT_FORMAT - + if enable_thinking: - # Inject <|think|> into BOTH the initial system thought channel AND + # Inject <|think|> into BOTH the initial system thought channel AND # the assistant generation prompt so thinking starts the response turn. # This follows Gemma 4 docs for triggering native thinking mode. modified_format = original_format.replace( - "<|channel>thought\n", - "<|channel>thought\n<|think|>\n" + "<|channel>thought\n", "<|channel>thought\n<|think|>\n" ).replace( "{% if add_generation_prompt %}\n<|turn>model\n{% endif %}", - "{% if add_generation_prompt %}\n<|turn>model\n<|think|>\n{% endif %}" + "{% if add_generation_prompt %}\n<|turn>model\n<|think|>\n{% endif %}", ) self.CHAT_FORMAT = modified_format - - # Gemma requires a system block for the thought channel to exist. + + # Gemma requires a system block for the thought channel to exist. # If the user hasn't provided one, we dynamically append a blank one. messages = kwargs.get("messages", []) if not any(m.get("role") == "system" for m in messages): @@ -3915,11 +3916,17 @@ def __call__(self, **kwargs): # Note: Since Gemma 4 outputs thinking + final answer in a single generation, # 'thinking' currently holds the full generated text (including reasoning). # Future: parse on model-specific end-of-thinking markers (e.g. <|end_think|>) if emitted. - if enable_thinking and not kwargs.get("stream", False) and isinstance(result, dict): + if ( + enable_thinking + and not kwargs.get("stream", False) + and isinstance(result, dict) + ): for choice in result.get("choices", []): if "message" in choice: content = choice["message"].get("content", "") or "" - choice["message"]["thinking"] = content # structured access for test app + choice["message"]["thinking"] = ( + content # structured access for test app + ) # content remains the complete response (thinking + final answer) for compatibility return result finally: @@ -4044,7 +4051,9 @@ def chatml_function_calling( stop = ( [stop, "<|im_end|>"] if isinstance(stop, str) - else stop + ["<|im_end|>"] if stop else ["<|im_end|>"] + else stop + ["<|im_end|>"] + if stop + else ["<|im_end|>"] ) # Case 1: No tool choice by user From fd754ff8bfae7a8eaedfc2266274c57bedb23bbf Mon Sep 17 00:00:00 2001 From: Randy Marsh <46230578+Randy420Marsh@users.noreply.github.com> Date: Sat, 2 May 2026 12:30:32 +0300 Subject: [PATCH 3/4] Update build.txt --- build.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/build.txt b/build.txt index 615ab0708..bdb664dd1 100644 --- a/build.txt +++ b/build.txt @@ -1 +1,10 @@ +git submodule update --init --recursive export FORCE_CMAKE=1 && CMAKE_ARGS="-DGGML_CUDA=on" pip install -e . + +or build wheel: + +python -m build --wheel + +auditwheel show dist/llama_cpp_python-*.whl + +auditwheel repair dist/llama_cpp_python-*.whl From fd5c510011541e261d98e7cf7c94cf4df482da2f Mon Sep 17 00:00:00 2001 From: Randy420Marsh <46230578+Randy420Marsh@users.noreply.github.com> Date: Thu, 14 May 2026 22:54:45 +0300 Subject: [PATCH 4/4] update build instructions --- build.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/build.txt b/build.txt index bdb664dd1..fcd141377 100644 --- a/build.txt +++ b/build.txt @@ -5,6 +5,12 @@ or build wheel: python -m build --wheel +export LD_LIBRARY_PATH=$PWD/llama_cpp/lib:$LD_LIBRARY_PATH + auditwheel show dist/llama_cpp_python-*.whl auditwheel repair dist/llama_cpp_python-*.whl + + +docker run --rm -v $PWD:/io quay.io/pypa/manylinux_2_28_x86_64 \ + bash -c "cd /io && pip wheel . -w dist && auditwheel repair dist/*.whl"