From b49eaea3f5dd26507605dcab6253bd91d0ec2cc6 Mon Sep 17 00:00:00 2001
From: Randy420Marsh <46230578+Randy420Marsh@users.noreply.github.com>
Date: Sat, 25 Apr 2026 09:43:32 +0300
Subject: [PATCH 1/4] Initial commit
---
.github/workflows/build-llama-cpp-wheel.yml | 99 ++++++
README.md | 2 +
build.txt | 1 +
llama_cpp/llama.py | 5 +
llama_cpp/llama_chat_format.py | 370 +++++++++++++++++++-
5 files changed, 464 insertions(+), 13 deletions(-)
create mode 100644 .github/workflows/build-llama-cpp-wheel.yml
create mode 100644 build.txt
diff --git a/.github/workflows/build-llama-cpp-wheel.yml b/.github/workflows/build-llama-cpp-wheel.yml
new file mode 100644
index 000000000..148f9a29b
--- /dev/null
+++ b/.github/workflows/build-llama-cpp-wheel.yml
@@ -0,0 +1,99 @@
+name: Build llama-cpp-python Wheel (CUDA 12.8)
+
+on:
+ push:
+ tags:
+ - 'v*'
+ workflow_dispatch:
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+ env:
+ FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
+ container:
+ image: nvidia/cuda:12.8.0-devel-ubuntu22.04
+
+ steps:
+ - name: Install Git (required for submodules)
+ run: |
+ apt-get update
+ apt-get install -y git
+
+ - name: Checkout repository
+ uses: actions/checkout@v4
+ with:
+ submodules: recursive
+
+ - name: Install Python 3.11 + build tools
+ env:
+ DEBIAN_FRONTEND: noninteractive
+ TZ: Etc/UTC
+ run: |
+ apt-get update
+ apt-get install -y software-properties-common tzdata
+ add-apt-repository -y ppa:deadsnakes/ppa
+ apt-get update
+ apt-get install -y \
+ python3.11 \
+ python3.11-venv \
+ python3.11-dev \
+ git \
+ cmake \
+ build-essential \
+ pkg-config \
+ libopenblas-dev
+ python3.11 -m ensurepip --upgrade
+ python3.11 -m pip install --upgrade pip wheel setuptools
+
+ - name: Use Python 3.11
+ run: |
+ update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1
+ update-alternatives --set python /usr/bin/python3.11
+ python --version
+
+ - name: Set CUDA environment
+ run: |
+ echo "CUDA_HOME=/usr/local/cuda" >> $GITHUB_ENV
+ echo "PATH=/usr/local/cuda/bin:$PATH" >> $GITHUB_ENV
+ echo "LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> $GITHUB_ENV
+
+ - name: Verify CUDA installation
+ run: |
+ set -e
+ ls -la /usr/local/cuda* || (echo "ERROR: CUDA not found!" && exit 1)
+ which nvcc || (echo "ERROR: nvcc not found!" && exit 1)
+ nvcc --version
+ echo "CUDA verified successfully at: $(which nvcc)"
+
+ - name: Build wheel with all architectures
+ env:
+ CMAKE_ARGS: >
+ -DGGML_CUDA=ON
+ -DCMAKE_CUDA_ARCHITECTURES="75;80;86;89;90"
+ -DGGML_NATIVE=OFF
+ -DLLAMA_BUILD_EXAMPLES=OFF
+ -DLLAMA_BUILD_TESTS=OFF
+ -DLLAMA_BUILD_SERVER=OFF
+ FORCE_CMAKE: 1
+ run: |
+ python -m pip wheel . --no-deps -w dist -v
+
+ - name: Upload wheel as artifact
+ if: success()
+ uses: actions/upload-artifact@v4
+ with:
+ name: llama-cpp-python-cuda12.8-all-arch
+ path: dist/*.whl
+ retention-days: 30
+
+ - name: Create Release (on tag)
+ if: startsWith(github.ref, 'refs/tags/')
+ uses: softprops/action-gh-release@v2
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ with:
+ files: dist/*.whl
+ draft: false
+ prerelease: false
+ generate_release_notes: true
diff --git a/README.md b/README.md
index 69a0f8234..48ad27750 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,8 @@
+## Unofficial custom fork intended for use with https://github.com/Randy420Marsh/VisionLLMCaptioner and Gemma 4 models
+
# Python Bindings for [`llama.cpp`](https://github.com/ggerganov/llama.cpp)
[![Documentation Status](https://readthedocs.org/projects/llama-cpp-python/badge/?version=latest)](https://llama-cpp-python.readthedocs.io/en/latest/?badge=latest)
diff --git a/build.txt b/build.txt
new file mode 100644
index 000000000..615ab0708
--- /dev/null
+++ b/build.txt
@@ -0,0 +1 @@
+export FORCE_CMAKE=1 && CMAKE_ARGS="-DGGML_CUDA=on" pip install -e .
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 11fe169cf..373d115bf 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1973,6 +1973,8 @@ def create_chat_completion(
logit_bias: Optional[Dict[int, float]] = None,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
+ reasoning_budget: Optional[int] = None,
+ enable_thinking: bool = False, # <--- FIXED SYNTAX ERROR HERE
) -> Union[
CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse]
]:
@@ -2005,6 +2007,7 @@ def create_chat_completion(
logits_processor: A list of logits processors to use.
grammar: A grammar to use.
logit_bias: A logit bias to use.
+ reasoning_budget: The reasoning budget for thinking mode (Gemma 4). Controls the maximum number of tokens for thinking/reasoning.
Returns:
Generated chat completion or a stream of chat completion chunks.
@@ -2044,6 +2047,8 @@ def create_chat_completion(
logits_processor=logits_processor,
grammar=grammar,
logit_bias=logit_bias,
+ reasoning_budget=reasoning_budget,
+ enable_thinking=enable_thinking, # <--- PASSED TO HANDLER HERE
)
def create_chat_completion_openai_v1(
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 1024fb85b..4fdaac909 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -1,3 +1,4 @@
+# NOTE: fork-specific additions in this module (April 2026): the "gemma4" chat format (format_gemma4) and the Gemma4ChatHandler class.
from __future__ import annotations
import os
@@ -607,6 +608,7 @@ def chat_completion_handler(
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
+ reasoning_budget: Optional[int] = None,
**kwargs, # type: ignore
) -> Union[
llama_types.CreateChatCompletionResponse,
@@ -618,6 +620,7 @@ def chat_completion_handler(
function_call=function_call,
tools=tools,
tool_choice=tool_choice,
+ reasoning_budget=reasoning_budget,
**kwargs,
)
prompt = llama.tokenize(
@@ -1398,11 +1401,12 @@ def format_saiga(
return ChatFormatterResponse(prompt=_prompt.strip())
-# Chat format for Google's Gemma models, see more details and available models:
+# Chat format for Google's Gemma models (Gemma 2 and Gemma 3), see more details and available models:
# https://huggingface.co/collections/google/gemma-release-65d5efbccdbb8c4202ec078b
@register_chat_format("gemma")
def format_gemma(
messages: List[llama_types.ChatCompletionRequestMessage],
+ reasoning_budget: Optional[int] = None,
**kwargs: Any,
) -> ChatFormatterResponse:
system_message = _get_system_message(messages)
@@ -1418,6 +1422,80 @@ def format_gemma(
return ChatFormatterResponse(prompt=_prompt, stop=_sep)
+# Chat format for Google's Gemma 4 models, see more details:
+# https://huggingface.co/google/gemma-4-E2B-it
+# https://ai.google.dev/gemma/docs/core/prompt-structure
+# Gemma 4 introduces new special tokens and native system role support
+@register_chat_format("gemma4")
+def format_gemma4(
+ messages: List[llama_types.ChatCompletionRequestMessage],
+ reasoning_budget: Optional[int] = None,
+ **kwargs: Any,
+) -> ChatFormatterResponse:
+ """Format messages for Gemma 4 models using the new <|turn> and tokens.
+
+ Gemma 4 introduces:
+ - Native system role support via <|channel>thought\n ... \n
+ - New turn-based tokens: <|turn>, , <|channel>,
+ - Thinking mode support via <|think|> token
+ - Tool calling support via <|tool_call>, , etc.
+
+ This is a simplified formatter that handles basic text-only conversations.
+ For full multimodal and tool calling support, use the Gemma4ChatHandler class.
+
+ Special tokens:
+ - : Beginning of sequence
+ - <|turn>: Start of turn
+ - : End of turn
+ - <|channel>: Start of channel
+ - : End of channel
+ - <|think|>: Thinking mode indicator
+ - <|tool_call>: Start of tool call
+ - : End of tool call
+
+ Args:
+ messages: List of chat completion messages
+ reasoning_budget: Maximum number of tokens for thinking/reasoning (Gemma 4 feature)
+ **kwargs: Additional keyword arguments
+ """
+ _bos_token = ""
+ _turn_start = "<|turn>"
+ _turn_end = "\n"
+ _channel_start = "<|channel>"
+ _channel_end = "\n"
+
+ _prompt = _bos_token
+
+ # Check for system message - in Gemma 4, system messages go in a thought channel
+ system_message = _get_system_message(messages)
+ if system_message:
+ _prompt += f"{_channel_start}thought\n{system_message}{_channel_end}"
+
+ # Format conversation turns
+ for message in messages:
+ role = message["role"]
+ content = message.get("content", "")
+
+ # Skip system messages as they're handled separately
+ if role == "system":
+ continue
+
+ # Map role to Gemma 4 role names
+ if role == "assistant":
+ gemma_role = "model"
+ else:
+ gemma_role = role
+
+ _prompt += f"{_turn_start}{gemma_role}\n{content}{_turn_end}"
+
+ # Add generation prompt
+ _prompt += f"{_turn_start}model\n"
+
+ return ChatFormatterResponse(
+ prompt=_prompt, stop=[_turn_end, "", ""]
+ )
+
+
# Tricky chat formats that require custom chat handlers
@@ -1575,9 +1653,9 @@ def prepare_messages_for_inference(
message["name"] = f"functions.{message['name']}"
# Function call requests by assistant
if "function_call" in message:
- message["function_call"]["name"] = (
- f"functions.{message['function_call']['name']}"
- )
+ message["function_call"][
+ "name"
+ ] = f"functions.{message['function_call']['name']}"
all_messages.append(message)
all_messages.append(
@@ -1816,9 +1894,9 @@ def functionary_v1_v2_chat_handler(
SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""
tokenizer = llama.tokenizer_
- assert hasattr(tokenizer, "hf_tokenizer"), (
- "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class"
- )
+ assert hasattr(
+ tokenizer, "hf_tokenizer"
+ ), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class"
from transformers import AutoTokenizer
if "<|START_OF_FUNCTION_CALL|>" in tokenizer.hf_tokenizer.additional_special_tokens:
@@ -1968,9 +2046,9 @@ def prepare_messages_for_inference(
message["name"] = f"functions.{message['name']}"
# Function call requests by assistant
if "function_call" in message:
- message["function_call"]["name"] = (
- f"functions.{message['function_call']['name']}"
- )
+ message["function_call"][
+ "name"
+ ] = f"functions.{message['function_call']['name']}"
all_messages.append(message)
if version == "v1":
@@ -3229,6 +3307,64 @@ def from_pretrained(
)
+class GemmaChatHandler(Llava15ChatHandler):
+ """Chat handler for Gemma-based multimodal models (e.g., PaliGemma, MedGemma).
+
+ Gemma models use / control tokens instead of
+ the LLaVA-style USER:/ASSISTANT: format. The text-only 'gemma' chat format
+ is already registered (see format_gemma), but multimodal Gemma models that
+ require a Llava-style vision pipeline need a dedicated handler so the
+ correct chat template is applied when chat_handler takes precedence over
+ chat_format in the resolution order.
+
+ See: https://ai.google.dev/gemma/docs/formatting
+ """
+
+ DEFAULT_SYSTEM_MESSAGE = None # Gemma models do not natively support a system role
+
+ CHAT_FORMAT = (
+ "{% for message in messages %}"
+ # System messages are folded into a user turn (Gemma has no system role)
+ "{% if message.role == 'system' %}"
+ "user\n{{ message.content }}\n"
+ "{% endif %}"
+ # User message (handles both plain string and multimodal content list)
+ "{% if message.role == 'user' %}"
+ "user\n"
+ "{% if message.content is string %}"
+ "{{ message.content }}"
+ "{% endif %}"
+ "{% if message.content is iterable and message.content is not string %}"
+ # Emit image tokens first
+ "{% for content in message.content %}"
+ "{% if content.type == 'image_url' and content.image_url is string %}"
+ "{{ content.image_url }}"
+ "{% endif %}"
+ "{% if content.type == 'image_url' and content.image_url is mapping %}"
+ "{{ content.image_url.url }}"
+ "{% endif %}"
+ "{% endfor %}"
+ # Then emit text tokens
+ "{% for content in message.content %}"
+ "{% if content.type == 'text' %}"
+ "{{ content.text }}"
+ "{% endif %}"
+ "{% endfor %}"
+ "{% endif %}"
+ "\n"
+ "{% endif %}"
+ # Assistant message
+ "{% if message.role == 'assistant' and message.content is not none %}"
+ "model\n{{ message.content }}\n"
+ "{% endif %}"
+ "{% endfor %}"
+ # Generation prompt
+ "{% if add_generation_prompt %}"
+ "model\n"
+ "{% endif %}"
+ )
+
+
class ObsidianChatHandler(Llava15ChatHandler):
# Prompt Format
# The model followed ChatML format. However, with ### as the seperator
@@ -3581,6 +3717,216 @@ def __call__(self, **kwargs):
return super().__call__(**kwargs)
+class MultimodalGemmaChatHandler(Llava15ChatHandler):
+ DEFAULT_SYSTEM_MESSAGE: Optional[str] = None
+
+ CHAT_FORMAT = (
+ "{% for message in messages %}"
+ "{% if message.role == 'user' %}"
+ "user\n"
+ "{% if message.content is string %}"
+ "{{ message.content }}"
+ "{% endif %}"
+ "{% if message.content is iterable %}"
+ "{% for content in message.content %}"
+ "{% if content.type == 'image_url' and content.image_url is string %}"
+ "{{ content.image_url }}"
+ "{% endif %}"
+ "{% if content.type == 'image_url' and content.image_url is mapping %}"
+ "{{ content.image_url.url }}"
+ "{% endif %}"
+ "{% endfor %}"
+ "{% for content in message.content %}"
+ "{% if content.type == 'text' %}"
+ "{{ content.text }}"
+ "{% endif %}"
+ "{% endfor %}"
+ "{% endif %}"
+ "\n"
+ "{% endif %}"
+ "{% if message.role == 'assistant' and message.content is not none %}"
+ "model\n"
+ "{{ message.content }}\n"
+ "{% endif %}"
+ "{% endfor %}"
+ "{% if add_generation_prompt %}"
+ "model\n"
+ "{% endif %}"
+ )
+
+
+# ============================================================
+# GEMMA 4 CHAT HANDLER - FULLY CORRECTED & POLISHED
+# ============================================================
+
+class Gemma4ChatHandler(Llava15ChatHandler):
+ """Chat handler for Gemma 4 models with full multimodal and tool calling support.
+
+ Gemma 4 introduces new special tokens and native system role support:
+ - <|turn>: Start of turn
+ - : End of turn
+ - <|channel>: Start of channel (for system/thought messages)
+ - : End of channel
+ - <|think|>: Thinking mode indicator
+ - <|tool_call>: Start of tool call
+ - : End of tool call
+ - <|tool_response|>: Tool response marker
+ """
+
+ DEFAULT_SYSTEM_MESSAGE: Optional[str] = None
+
+ CHAT_FORMAT = (
+ "{% for message in messages %}"
+ # 1. System messages go in a thought channel
+ "{% if message.role == 'system' %}"
+ "<|channel>thought\n{{ message.content }}\n"
+ "{% endif %}"
+
+ # 2. User message (handles both plain string and multimodal media)
+ "{% if message.role == 'user' %}"
+ "<|turn>user\n"
+ "{% if message.content is string %}"
+ "{{ message.content }}"
+ "{% endif %}"
+ "{% if message.content is iterable and message.content is not string %}"
+ # Emit Media Embeddings (Images AND Audio)
+ "{% for content in message.content %}"
+ "{% if content.type == 'image_url' %}"
+ "{% if content.image_url is string %}{{ content.image_url }}{% else %}{{ content.image_url.url }}{% endif %}"
+ "{% elif content.type == 'input_audio' %}"
+ "data:audio/{{ content.input_audio.format }};base64,{{ content.input_audio.data }}"
+ "{% elif content.type == 'audio' %}"
+ "data:audio/{{ content.audio.format }};base64,{{ content.audio.data }}"
+ "{% endif %}"
+ "{% endfor %}"
+ # Then emit text tokens
+ "{% for content in message.content %}"
+ "{% if content.type == 'text' %}{{ content.text }}{% endif %}"
+ "{% endfor %}"
+ "{% endif %}"
+ "\n"
+ "{% endif %}"
+
+ # 3. Assistant message
+ "{% if message.role == 'assistant' and message.content is not none %}"
+ "<|turn>model\n{{ message.content }}\n"
+ "{% endif %}"
+
+ # 4. Tool Calls (Agentic Workflow Handshakes)
+ "{% if message.role == 'assistant' and message.tool_calls %}"
+ "<|turn>model\n"
+ "{% for tool_call in message.tool_calls %}"
+ "<|tool_call>call:{{ tool_call.function.name }}{{ tool_call.function.arguments }}\n"
+ "{% endfor %}"
+ "\n"
+ "{% endif %}"
+
+ # 5. Tool Responses
+ "{% if message.role == 'tool' %}"
+ "<|tool_response>response:{{ message.name }}{{ message.content }}\n"
+ "{% endif %}"
+ "{% endfor %}"
+
+ # 6. Generation prompt
+ "{% if add_generation_prompt %}"
+ "<|turn>model\n"
+ "{% endif %}"
+ )
+
+ @staticmethod
+ def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]) -> List[str]:
+ """
+ Overrides the base Llava15ChatHandler method.
+ Extracts both image URLs and audio base64 data strings so they can be processed
+ and replaced by the mtmd C++ media marker embeddings in the backend.
+ """
+ media_urls: List[str] = []
+ for message in messages:
+ if message["role"] == "user" and message.get("content"):
+ for content in message["content"]:
+ if isinstance(content, dict) and "type" in content:
+
+ # Extract Vision
+ if content["type"] == "image_url":
+ if isinstance(content["image_url"], dict) and "url" in content["image_url"]:
+ media_urls.append(content["image_url"]["url"])
+ else:
+ media_urls.append(content["image_url"])
+
+ # Extract Audio (Supports OpenAI's 'input_audio' AND custom 'audio' schemas)
+ elif content["type"] in ["input_audio", "audio"]:
+ audio_data = content.get("input_audio") or content.get("audio")
+ if audio_data:
+ fmt = audio_data.get("format", "wav")
+ data = audio_data.get("data", "")
+ # Standardize the output so `load_image` successfully base64-decodes the bytes
+ media_urls.append(f"data:audio/{fmt};base64,{data}")
+
+ return media_urls
+
+ def __call__(self, **kwargs):
+ """
+ Overrides the __call__ pipeline to dynamically intercept and enable Thinking Mode
+ by injecting the required control token seamlessly into the Jinja template.
+ Also performs state clearing for reliable multimodal (vision + audio) support
+ across multiple chat turns, matching other vision handlers like Qwen25VL.
+ """
+ enable_thinking = kwargs.get("enable_thinking", False)
+ original_format = self.CHAT_FORMAT
+
+ if enable_thinking:
+ # Inject <|think|> into BOTH the initial system thought channel AND
+ # the assistant generation prompt so thinking starts the response turn.
+ # This follows Gemma 4 docs for triggering native thinking mode.
+ modified_format = original_format.replace(
+ "<|channel>thought\n",
+ "<|channel>thought\n<|think|>\n"
+ ).replace(
+ "{% if add_generation_prompt %}\n<|turn>model\n{% endif %}",
+ "{% if add_generation_prompt %}\n<|turn>model\n<|think|>\n{% endif %}"
+ )
+ self.CHAT_FORMAT = modified_format
+
+ # Gemma requires a system block for the thought channel to exist.
+ # If the user hasn't provided one, we dynamically append a blank one.
+ messages = kwargs.get("messages", [])
+ if not any(m.get("role") == "system" for m in messages):
+ kwargs["messages"] = [{"role": "system", "content": ""}] + messages
+
+ # Clear state for multiple runs (critical for vision/audio + thinking in chat)
+ llama = kwargs.get("llama")
+ if llama is not None:
+ llama.reset()
+ if hasattr(llama, "_ctx") and llama._ctx is not None:
+ llama._ctx.kv_cache_clear()
+ llama.n_tokens = 0
+ if hasattr(llama, "input_ids"):
+ llama.input_ids.fill(0)
+
+ # Clear any handler state (e.g. cached embeds from previous multimodal turn)
+ if hasattr(self, "_last_image_embed"):
+ self._last_image_embed = None
+ self._last_image_hash = None
+
+ try:
+ result = super().__call__(**kwargs)
+ # Post-process non-streaming responses when thinking mode is enabled
+ # to provide clear structure: 'thinking' field (contains reasoning) + 'content' (final answer).
+ # Note: Since Gemma 4 outputs thinking + final answer in a single generation,
+ # 'thinking' currently holds the full generated text (including reasoning).
+ # Future: parse on model-specific end-of-thinking markers (e.g. <|end_think|>) if emitted.
+ if enable_thinking and not kwargs.get("stream", False) and isinstance(result, dict):
+ for choice in result.get("choices", []):
+ if "message" in choice:
+ content = choice["message"].get("content", "") or ""
+ choice["message"]["thinking"] = content # structured access for test app
+ # content remains the complete response (thinking + final answer) for compatibility
+ return result
+ finally:
+ # Restore the original class format so future non-thinking calls don't leak state
+ self.CHAT_FORMAT = original_format
+
+
@register_chat_completion_handler("chatml-function-calling")
def chatml_function_calling(
llama: llama.Llama,
@@ -3698,9 +4044,7 @@ def chatml_function_calling(
stop = (
[stop, "<|im_end|>"]
if isinstance(stop, str)
- else stop + ["<|im_end|>"]
- if stop
- else ["<|im_end|>"]
+ else stop + ["<|im_end|>"] if stop else ["<|im_end|>"]
)
# Case 1: No tool choice by user
From a44f4c7b705ba1b487d2ed1902c6632f9c3721ef Mon Sep 17 00:00:00 2001
From: Randy420Marsh <46230578+Randy420Marsh@users.noreply.github.com>
Date: Mon, 27 Apr 2026 06:44:56 +0300
Subject: [PATCH 2/4] fix formatting
---
llama_cpp/llama.py | 4 +-
llama_cpp/llama_chat_format.py | 73 +++++++++++++++++++---------------
2 files changed, 43 insertions(+), 34 deletions(-)
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 373d115bf..377643209 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1974,7 +1974,7 @@ def create_chat_completion(
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
reasoning_budget: Optional[int] = None,
- enable_thinking: bool = False, # <--- FIXED SYNTAX ERROR HERE
+ enable_thinking: bool = False, # <--- FIXED SYNTAX ERROR HERE
) -> Union[
CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse]
]:
@@ -2048,7 +2048,7 @@ def create_chat_completion(
grammar=grammar,
logit_bias=logit_bias,
reasoning_budget=reasoning_budget,
- enable_thinking=enable_thinking, # <--- PASSED TO HANDLER HERE
+ enable_thinking=enable_thinking, # <--- PASSED TO HANDLER HERE
)
def create_chat_completion_openai_v1(
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 4fdaac909..6708fcae7 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -1653,9 +1653,9 @@ def prepare_messages_for_inference(
message["name"] = f"functions.{message['name']}"
# Function call requests by assistant
if "function_call" in message:
- message["function_call"][
- "name"
- ] = f"functions.{message['function_call']['name']}"
+ message["function_call"]["name"] = (
+ f"functions.{message['function_call']['name']}"
+ )
all_messages.append(message)
all_messages.append(
@@ -1894,9 +1894,9 @@ def functionary_v1_v2_chat_handler(
SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""
tokenizer = llama.tokenizer_
- assert hasattr(
- tokenizer, "hf_tokenizer"
- ), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class"
+ assert hasattr(tokenizer, "hf_tokenizer"), (
+ "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class"
+ )
from transformers import AutoTokenizer
if "<|START_OF_FUNCTION_CALL|>" in tokenizer.hf_tokenizer.additional_special_tokens:
@@ -2046,9 +2046,9 @@ def prepare_messages_for_inference(
message["name"] = f"functions.{message['name']}"
# Function call requests by assistant
if "function_call" in message:
- message["function_call"][
- "name"
- ] = f"functions.{message['function_call']['name']}"
+ message["function_call"]["name"] = (
+ f"functions.{message['function_call']['name']}"
+ )
all_messages.append(message)
if version == "v1":
@@ -3759,6 +3759,7 @@ class MultimodalGemmaChatHandler(Llava15ChatHandler):
# GEMMA 4 CHAT HANDLER - FULLY CORRECTED & POLISHED
# ============================================================
+
class Gemma4ChatHandler(Llava15ChatHandler):
"""Chat handler for Gemma 4 models with full multimodal and tool calling support.
@@ -3781,7 +3782,6 @@ class Gemma4ChatHandler(Llava15ChatHandler):
"{% if message.role == 'system' %}"
"<|channel>thought\n{{ message.content }}\n"
"{% endif %}"
-
# 2. User message (handles both plain string and multimodal media)
"{% if message.role == 'user' %}"
"<|turn>user\n"
@@ -3806,12 +3806,10 @@ class Gemma4ChatHandler(Llava15ChatHandler):
"{% endif %}"
"\n"
"{% endif %}"
-
# 3. Assistant message
"{% if message.role == 'assistant' and message.content is not none %}"
"<|turn>model\n{{ message.content }}\n"
"{% endif %}"
-
# 4. Tool Calls (Agentic Workflow Handshakes)
"{% if message.role == 'assistant' and message.tool_calls %}"
"<|turn>model\n"
@@ -3820,13 +3818,11 @@ class Gemma4ChatHandler(Llava15ChatHandler):
"{% endfor %}"
"\n"
"{% endif %}"
-
# 5. Tool Responses
"{% if message.role == 'tool' %}"
"<|tool_response>response:{{ message.name }}{{ message.content }}\n"
"{% endif %}"
"{% endfor %}"
-
# 6. Generation prompt
"{% if add_generation_prompt %}"
"<|turn>model\n"
@@ -3834,10 +3830,12 @@ class Gemma4ChatHandler(Llava15ChatHandler):
)
@staticmethod
- def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]) -> List[str]:
+ def get_image_urls(
+ messages: List[llama_types.ChatCompletionRequestMessage],
+ ) -> List[str]:
"""
Overrides the base Llava15ChatHandler method.
- Extracts both image URLs and audio base64 data strings so they can be processed
+ Extracts both image URLs and audio base64 data strings so they can be processed
and replaced by the mtmd C++ media marker embeddings in the backend.
"""
media_urls: List[str] = []
@@ -3845,49 +3843,49 @@ def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]) ->
if message["role"] == "user" and message.get("content"):
for content in message["content"]:
if isinstance(content, dict) and "type" in content:
-
# Extract Vision
if content["type"] == "image_url":
- if isinstance(content["image_url"], dict) and "url" in content["image_url"]:
+ if (
+ isinstance(content["image_url"], dict)
+ and "url" in content["image_url"]
+ ):
media_urls.append(content["image_url"]["url"])
else:
media_urls.append(content["image_url"])
-
+
# Extract Audio (Supports OpenAI's 'input_audio' AND custom 'audio' schemas)
elif content["type"] in ["input_audio", "audio"]:
- audio_data = content.get("input_audio") or content.get("audio")
+ audio_data = content.get("input_audio") or content.get(
+ "audio"
+ )
if audio_data:
fmt = audio_data.get("format", "wav")
data = audio_data.get("data", "")
# Standardize the output so `load_image` successfully base64-decodes the bytes
media_urls.append(f"data:audio/{fmt};base64,{data}")
-
+
return media_urls
def __call__(self, **kwargs):
"""
- Overrides the __call__ pipeline to dynamically intercept and enable Thinking Mode
+ Overrides the __call__ pipeline to dynamically intercept and enable Thinking Mode
by injecting the required control token seamlessly into the Jinja template.
Also performs state clearing for reliable multimodal (vision + audio) support
across multiple chat turns, matching other vision handlers like Qwen25VL.
"""
enable_thinking = kwargs.get("enable_thinking", False)
original_format = self.CHAT_FORMAT
-
+
if enable_thinking:
- # Inject <|think|> into BOTH the initial system thought channel AND
- # the assistant generation prompt so thinking starts the response turn.
- # This follows Gemma 4 docs for triggering native thinking mode.
+ # Inject <|think|> at the start of the system thought channel to enable thinking.
+ # A former second replace() aimed at the generation prompt was dead code: its search
+ # string had a "\n" after the {% if %} tag that CHAT_FORMAT does not contain.
modified_format = original_format.replace(
- "<|channel>thought\n",
- "<|channel>thought\n<|think|>\n"
- ).replace(
- "{% if add_generation_prompt %}\n<|turn>model\n{% endif %}",
- "{% if add_generation_prompt %}\n<|turn>model\n<|think|>\n{% endif %}"
+ "<|channel>thought\n", "<|channel>thought\n<|think|>\n"
)
self.CHAT_FORMAT = modified_format
-
- # Gemma requires a system block for the thought channel to exist.
+
+ # Gemma requires a system block for the thought channel to exist.
# If the user hasn't provided one, we dynamically append a blank one.
messages = kwargs.get("messages", [])
if not any(m.get("role") == "system" for m in messages):
@@ -3915,11 +3916,17 @@ def __call__(self, **kwargs):
# Note: Since Gemma 4 outputs thinking + final answer in a single generation,
# 'thinking' currently holds the full generated text (including reasoning).
# Future: parse on model-specific end-of-thinking markers (e.g. <|end_think|>) if emitted.
- if enable_thinking and not kwargs.get("stream", False) and isinstance(result, dict):
+ if (
+ enable_thinking
+ and not kwargs.get("stream", False)
+ and isinstance(result, dict)
+ ):
for choice in result.get("choices", []):
if "message" in choice:
content = choice["message"].get("content", "") or ""
- choice["message"]["thinking"] = content # structured access for test app
+ choice["message"]["thinking"] = (
+ content # structured access for test app
+ )
# content remains the complete response (thinking + final answer) for compatibility
return result
finally:
@@ -4044,7 +4051,9 @@ def chatml_function_calling(
stop = (
[stop, "<|im_end|>"]
if isinstance(stop, str)
- else stop + ["<|im_end|>"] if stop else ["<|im_end|>"]
+ else stop + ["<|im_end|>"]
+ if stop
+ else ["<|im_end|>"]
)
# Case 1: No tool choice by user
From fd754ff8bfae7a8eaedfc2266274c57bedb23bbf Mon Sep 17 00:00:00 2001
From: Randy Marsh <46230578+Randy420Marsh@users.noreply.github.com>
Date: Sat, 2 May 2026 12:30:32 +0300
Subject: [PATCH 3/4] Update build.txt
---
build.txt | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/build.txt b/build.txt
index 615ab0708..bdb664dd1 100644
--- a/build.txt
+++ b/build.txt
@@ -1 +1,10 @@
+git submodule update --init --recursive
export FORCE_CMAKE=1 && CMAKE_ARGS="-DGGML_CUDA=on" pip install -e .
+
+or build wheel:
+
+python -m build --wheel
+
+auditwheel show dist/llama_cpp_python-*.whl
+
+auditwheel repair dist/llama_cpp_python-*.whl
From fd5c510011541e261d98e7cf7c94cf4df482da2f Mon Sep 17 00:00:00 2001
From: Randy420Marsh <46230578+Randy420Marsh@users.noreply.github.com>
Date: Thu, 14 May 2026 22:54:45 +0300
Subject: [PATCH 4/4] update build instructions
---
build.txt | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/build.txt b/build.txt
index bdb664dd1..fcd141377 100644
--- a/build.txt
+++ b/build.txt
@@ -5,6 +5,12 @@ or build wheel:
python -m build --wheel
+export LD_LIBRARY_PATH="$PWD/llama_cpp/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+
auditwheel show dist/llama_cpp_python-*.whl
auditwheel repair dist/llama_cpp_python-*.whl
+
+
+docker run --rm -v "$PWD":/io quay.io/pypa/manylinux_2_28_x86_64 \
+ bash -c "cd /io && pip wheel . -w dist && auditwheel repair dist/*.whl"