diff --git a/.github/workflows/build-llama-cpp-wheel.yml b/.github/workflows/build-llama-cpp-wheel.yml
new file mode 100644
index 000000000..148f9a29b
--- /dev/null
+++ b/.github/workflows/build-llama-cpp-wheel.yml
@@ -0,0 +1,107 @@
+# Builds a CUDA 12.8 llama-cpp-python wheel inside NVIDIA's devel image,
+# uploads it as an artifact, and attaches it to a GitHub Release on tag builds.
+name: Build llama-cpp-python Wheel (CUDA 12.8)
+
+on:
+  push:
+    tags:
+      - 'v*'
+  workflow_dispatch:
+
+# The release step uploads assets, which needs write access to repo contents.
+permissions:
+  contents: write
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    env:
+      FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
+    container:
+      image: nvidia/cuda:12.8.0-devel-ubuntu22.04
+
+    steps:
+      - name: Install Git (required for submodules)
+        run: |
+          apt-get update
+          apt-get install -y git
+
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Install Python 3.11 + build tools
+        env:
+          DEBIAN_FRONTEND: noninteractive
+          TZ: Etc/UTC
+        run: |
+          apt-get update
+          apt-get install -y software-properties-common tzdata
+          add-apt-repository -y ppa:deadsnakes/ppa
+          apt-get update
+          apt-get install -y \
+            python3.11 \
+            python3.11-venv \
+            python3.11-dev \
+            git \
+            cmake \
+            build-essential \
+            pkg-config \
+            libopenblas-dev
+          python3.11 -m ensurepip --upgrade
+          python3.11 -m pip install --upgrade pip wheel setuptools
+
+      - name: Use Python 3.11
+        run: |
+          update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1
+          update-alternatives --set python /usr/bin/python3.11
+          python --version
+
+      - name: Set CUDA environment
+        run: |
+          echo "CUDA_HOME=/usr/local/cuda" >> "$GITHUB_ENV"
+          # PATH additions go through GITHUB_PATH instead of freezing PATH in GITHUB_ENV.
+          echo "/usr/local/cuda/bin" >> "$GITHUB_PATH"
+          echo "LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> "$GITHUB_ENV"
+
+      - name: Verify CUDA installation
+        run: |
+          set -e
+          ls -la /usr/local/cuda* || (echo "ERROR: CUDA not found!" && exit 1)
+          which nvcc || (echo "ERROR: nvcc not found!" && exit 1)
+          nvcc --version
+          echo "CUDA verified successfully at: $(which nvcc)"
+
+      - name: Build wheel with all architectures
+        env:
+          # Folded with strip chomping: one line of flags, no trailing newline.
+          CMAKE_ARGS: >-
+            -DGGML_CUDA=ON
+            -DCMAKE_CUDA_ARCHITECTURES="75;80;86;89;90"
+            -DGGML_NATIVE=OFF
+            -DLLAMA_BUILD_EXAMPLES=OFF
+            -DLLAMA_BUILD_TESTS=OFF
+            -DLLAMA_BUILD_SERVER=OFF
+          FORCE_CMAKE: "1"
+        run: |
+          python -m pip wheel . --no-deps -w dist -v
+
+      - name: Upload wheel as artifact
+        if: success()
+        uses: actions/upload-artifact@v4
+        with:
+          name: llama-cpp-python-cuda12.8-all-arch
+          path: dist/*.whl
+          retention-days: 30
+
+      - name: Create Release (on tag)
+        if: startsWith(github.ref, 'refs/tags/')
+        uses: softprops/action-gh-release@v2
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          files: dist/*.whl
+          draft: false
+          prerelease: false
+          generate_release_notes: true
diff --git a/README.md b/README.md
index 69a0f8234..48ad27750 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,8 @@
+## Unofficial custom fork meant for https://github.com/Randy420Marsh/VisionLLMCaptioner and Gemma 4
+
# Python Bindings for [`llama.cpp`](https://github.com/ggerganov/llama.cpp)
[](https://llama-cpp-python.readthedocs.io/en/latest/?badge=latest)
diff --git a/build.txt b/build.txt
new file mode 100644
index 000000000..fcd141377
--- /dev/null
+++ b/build.txt
@@ -0,0 +1,16 @@
+git submodule update --init --recursive
+export FORCE_CMAKE=1 && CMAKE_ARGS="-DGGML_CUDA=on" pip install -e .
+
+Or build a wheel:
+
+python -m build --wheel
+
+export LD_LIBRARY_PATH=$PWD/llama_cpp/lib:$LD_LIBRARY_PATH
+
+auditwheel show dist/llama_cpp_python-*.whl
+
+auditwheel repair dist/llama_cpp_python-*.whl
+
+
+docker run --rm -v $PWD:/io quay.io/pypa/manylinux_2_28_x86_64 \
+ bash -c "cd /io && pip wheel . -w dist && auditwheel repair dist/*.whl"
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 75c74b41f..13bb19bf6 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1986,6 +1986,8 @@ def create_chat_completion(
logit_bias: Optional[Dict[int, float]] = None,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
+ reasoning_budget: Optional[int] = None,
+ enable_thinking: bool = False, # <--- FIXED SYNTAX ERROR HERE
) -> Union[
CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse]
]:
@@ -2018,6 +2020,7 @@ def create_chat_completion(
logits_processor: A list of logits processors to use.
grammar: A grammar to use.
logit_bias: A logit bias to use.
+ reasoning_budget: The reasoning budget for thinking mode (Gemma 4). Controls the maximum number of tokens for thinking/reasoning.
Returns:
Generated chat completion or a stream of chat completion chunks.
@@ -2057,6 +2060,8 @@ def create_chat_completion(
logits_processor=logits_processor,
grammar=grammar,
logit_bias=logit_bias,
+ reasoning_budget=reasoning_budget,
+ enable_thinking=enable_thinking, # <--- PASSED TO HANDLER HERE
)
def create_chat_completion_openai_v1(
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 1024fb85b..6708fcae7 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -1,3 +1,4 @@
+# Chat format handlers for llama-cpp-python; this fork adds Gemma 4 formats and handlers (April 2026)
from __future__ import annotations
import os
@@ -607,6 +608,7 @@ def chat_completion_handler(
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
+ reasoning_budget: Optional[int] = None,
**kwargs, # type: ignore
) -> Union[
llama_types.CreateChatCompletionResponse,
@@ -618,6 +620,7 @@ def chat_completion_handler(
function_call=function_call,
tools=tools,
tool_choice=tool_choice,
+ reasoning_budget=reasoning_budget,
**kwargs,
)
prompt = llama.tokenize(
@@ -1398,11 +1401,12 @@ def format_saiga(
return ChatFormatterResponse(prompt=_prompt.strip())
-# Chat format for Google's Gemma models, see more details and available models:
+# Chat format for Google's Gemma models (Gemma 2 and Gemma 3), see more details and available models:
# https://huggingface.co/collections/google/gemma-release-65d5efbccdbb8c4202ec078b
@register_chat_format("gemma")
def format_gemma(
messages: List[llama_types.ChatCompletionRequestMessage],
+ reasoning_budget: Optional[int] = None,
**kwargs: Any,
) -> ChatFormatterResponse:
system_message = _get_system_message(messages)
@@ -1418,6 +1422,80 @@ def format_gemma(
return ChatFormatterResponse(prompt=_prompt, stop=_sep)
+# Chat format for Google's Gemma 4 models, see more details:
+# https://huggingface.co/google/gemma-4-E2B-it
+# https://ai.google.dev/gemma/docs/core/prompt-structure
+# Gemma 4 introduces new special tokens and native system role support
+@register_chat_format("gemma4")
+def format_gemma4(
+ messages: List[llama_types.ChatCompletionRequestMessage],
+ reasoning_budget: Optional[int] = None,
+ **kwargs: Any,
+) -> ChatFormatterResponse:
+ """Format messages for Gemma 4 models using the new <|turn> and tokens.
+
+ Gemma 4 introduces:
+ - Native system role support via <|channel>thought\n ... \n
+ - New turn-based tokens: <|turn>, , <|channel>,
+ - Thinking mode support via <|think|> token
+ - Tool calling support via <|tool_call>, , etc.
+
+ This is a simplified formatter that handles basic text-only conversations.
+ For full multimodal and tool calling support, use the Gemma4ChatHandler class.
+
+ Special tokens:
+ - : Beginning of sequence
+ - <|turn>: Start of turn
+ - : End of turn
+ - <|channel>: Start of channel
+ - : End of channel
+ - <|think|>: Thinking mode indicator
+ - <|tool_call>: Start of tool call
+ - : End of tool call
+
+ Args:
+ messages: List of chat completion messages
+ reasoning_budget: Maximum number of tokens for thinking/reasoning (Gemma 4 feature)
+ **kwargs: Additional keyword arguments
+ """
+ _bos_token = ""
+ _turn_start = "<|turn>"
+ _turn_end = "\n"
+ _channel_start = "<|channel>"
+ _channel_end = "\n"
+
+ _prompt = _bos_token
+
+ # Check for system message - in Gemma 4, system messages go in a thought channel
+ system_message = _get_system_message(messages)
+ if system_message:
+ _prompt += f"{_channel_start}thought\n{system_message}{_channel_end}"
+
+ # Format conversation turns
+ for message in messages:
+ role = message["role"]
+ content = message.get("content", "")
+
+ # Skip system messages as they're handled separately
+ if role == "system":
+ continue
+
+ # Map role to Gemma 4 role names
+ if role == "assistant":
+ gemma_role = "model"
+ else:
+ gemma_role = role
+
+ _prompt += f"{_turn_start}{gemma_role}\n{content}{_turn_end}"
+
+ # Add generation prompt
+ _prompt += f"{_turn_start}model\n"
+
+ return ChatFormatterResponse(
+ prompt=_prompt, stop=[_turn_end, "", ""]
+ )
+
+
# Tricky chat formats that require custom chat handlers
@@ -3229,6 +3307,64 @@ def from_pretrained(
)
+class GemmaChatHandler(Llava15ChatHandler):
+ """Chat handler for Gemma-based multimodal models (e.g., PaliGemma, MedGemma).
+
+ Gemma models use their own turn-delimiting control tokens instead of
+ the LLaVA-style USER:/ASSISTANT: format. The text-only 'gemma' chat format
+ is already registered (see format_gemma), but multimodal Gemma models that
+ require a Llava-style vision pipeline need a dedicated handler so the
+ correct chat template is applied when chat_handler takes precedence over
+ chat_format in the resolution order.
+
+ The template below renders system messages as a user turn, emits the image
+ URLs of a list-valued user message before its text parts, skips assistant
+ messages whose content is none, and ends with a model turn header when
+ add_generation_prompt is set.
+
+ NOTE(review): the turn headers in CHAT_FORMAT are bare "user\n" / "model\n"
+ strings with no delimiter tokens around them -- confirm against the Gemma
+ formatting guide linked below that nothing was lost.
+
+ See: https://ai.google.dev/gemma/docs/formatting
+ """
+
+ DEFAULT_SYSTEM_MESSAGE = None # Gemma models do not natively support a system role
+
+ # Jinja template assembled from adjacent string literals; the pieces are
+ # concatenated verbatim, so no whitespace exists between them in the template.
+ CHAT_FORMAT = (
+ "{% for message in messages %}"
+ # System messages are folded into a user turn (Gemma has no system role)
+ "{% if message.role == 'system' %}"
+ "user\n{{ message.content }}\n"
+ "{% endif %}"
+ # User message (handles both plain string and multimodal content list)
+ "{% if message.role == 'user' %}"
+ "user\n"
+ "{% if message.content is string %}"
+ "{{ message.content }}"
+ "{% endif %}"
+ # Strings are iterable too, hence the explicit "is not string" guard
+ "{% if message.content is iterable and message.content is not string %}"
+ # Emit image tokens first
+ "{% for content in message.content %}"
+ "{% if content.type == 'image_url' and content.image_url is string %}"
+ "{{ content.image_url }}"
+ "{% endif %}"
+ "{% if content.type == 'image_url' and content.image_url is mapping %}"
+ "{{ content.image_url.url }}"
+ "{% endif %}"
+ "{% endfor %}"
+ # Then emit text tokens
+ "{% for content in message.content %}"
+ "{% if content.type == 'text' %}"
+ "{{ content.text }}"
+ "{% endif %}"
+ "{% endfor %}"
+ "{% endif %}"
+ "\n"
+ "{% endif %}"
+ # Assistant message
+ "{% if message.role == 'assistant' and message.content is not none %}"
+ "model\n{{ message.content }}\n"
+ "{% endif %}"
+ "{% endfor %}"
+ # Generation prompt
+ "{% if add_generation_prompt %}"
+ "model\n"
+ "{% endif %}"
+ )
+
+
class ObsidianChatHandler(Llava15ChatHandler):
# Prompt Format
# The model followed ChatML format. However, with ### as the seperator
@@ -3581,6 +3717,223 @@ def __call__(self, **kwargs):
return super().__call__(**kwargs)
+class MultimodalGemmaChatHandler(Llava15ChatHandler):
+ DEFAULT_SYSTEM_MESSAGE: Optional[str] = None
+
+ CHAT_FORMAT = (
+ "{% for message in messages %}"
+ "{% if message.role == 'user' %}"
+ "user\n"
+ "{% if message.content is string %}"
+ "{{ message.content }}"
+ "{% endif %}"
+ "{% if message.content is iterable %}"
+ "{% for content in message.content %}"
+ "{% if content.type == 'image_url' and content.image_url is string %}"
+ "{{ content.image_url }}"
+ "{% endif %}"
+ "{% if content.type == 'image_url' and content.image_url is mapping %}"
+ "{{ content.image_url.url }}"
+ "{% endif %}"
+ "{% endfor %}"
+ "{% for content in message.content %}"
+ "{% if content.type == 'text' %}"
+ "{{ content.text }}"
+ "{% endif %}"
+ "{% endfor %}"
+ "{% endif %}"
+ "\n"
+ "{% endif %}"
+ "{% if message.role == 'assistant' and message.content is not none %}"
+ "model\n"
+ "{{ message.content }}\n"
+ "{% endif %}"
+ "{% endfor %}"
+ "{% if add_generation_prompt %}"
+ "model\n"
+ "{% endif %}"
+ )
+
+
+# ============================================================
+# GEMMA 4 CHAT HANDLER
+# ============================================================
+
+
+class Gemma4ChatHandler(Llava15ChatHandler):
+ """Chat handler for Gemma 4 models with full multimodal and tool calling support.
+
+ Gemma 4 introduces new special tokens and native system role support:
+ - <|turn>: Start of turn
+ - : End of turn
+ - <|channel>: Start of channel (for system/thought messages)
+ - : End of channel
+ - <|think|>: Thinking mode indicator
+ - <|tool_call>: Start of tool call
+ - : End of tool call
+ - <|tool_response|>: Tool response marker
+ """
+
+ DEFAULT_SYSTEM_MESSAGE: Optional[str] = None
+
+ CHAT_FORMAT = (
+ "{% for message in messages %}"
+ # 1. System messages go in a thought channel
+ "{% if message.role == 'system' %}"
+ "<|channel>thought\n{{ message.content }}\n"
+ "{% endif %}"
+ # 2. User message (handles both plain string and multimodal media)
+ "{% if message.role == 'user' %}"
+ "<|turn>user\n"
+ "{% if message.content is string %}"
+ "{{ message.content }}"
+ "{% endif %}"
+ "{% if message.content is iterable and message.content is not string %}"
+ # Emit Media Embeddings (Images AND Audio)
+ "{% for content in message.content %}"
+ "{% if content.type == 'image_url' %}"
+ "{% if content.image_url is string %}{{ content.image_url }}{% else %}{{ content.image_url.url }}{% endif %}"
+ "{% elif content.type == 'input_audio' %}"
+ "data:audio/{{ content.input_audio.format }};base64,{{ content.input_audio.data }}"
+ "{% elif content.type == 'audio' %}"
+ "data:audio/{{ content.audio.format }};base64,{{ content.audio.data }}"
+ "{% endif %}"
+ "{% endfor %}"
+ # Then emit text tokens
+ "{% for content in message.content %}"
+ "{% if content.type == 'text' %}{{ content.text }}{% endif %}"
+ "{% endfor %}"
+ "{% endif %}"
+ "\n"
+ "{% endif %}"
+ # 3. Assistant message
+ "{% if message.role == 'assistant' and message.content is not none %}"
+ "<|turn>model\n{{ message.content }}\n"
+ "{% endif %}"
+ # 4. Tool Calls (Agentic Workflow Handshakes)
+ "{% if message.role == 'assistant' and message.tool_calls %}"
+ "<|turn>model\n"
+ "{% for tool_call in message.tool_calls %}"
+ "<|tool_call>call:{{ tool_call.function.name }}{{ tool_call.function.arguments }}\n"
+ "{% endfor %}"
+ "\n"
+ "{% endif %}"
+ # 5. Tool Responses
+ "{% if message.role == 'tool' %}"
+ "<|tool_response>response:{{ message.name }}{{ message.content }}\n"
+ "{% endif %}"
+ "{% endfor %}"
+ # 6. Generation prompt
+ "{% if add_generation_prompt %}"
+ "<|turn>model\n"
+ "{% endif %}"
+ )
+
+ @staticmethod
+ def get_image_urls(
+ messages: List[llama_types.ChatCompletionRequestMessage],
+ ) -> List[str]:
+ """
+ Overrides the base Llava15ChatHandler method.
+ Extracts both image URLs and audio base64 data strings so they can be processed
+ and replaced by the mtmd C++ media marker embeddings in the backend.
+ """
+ media_urls: List[str] = []
+ for message in messages:
+ if message["role"] == "user" and message.get("content"):
+ for content in message["content"]:
+ if isinstance(content, dict) and "type" in content:
+ # Extract Vision
+ if content["type"] == "image_url":
+ if (
+ isinstance(content["image_url"], dict)
+ and "url" in content["image_url"]
+ ):
+ media_urls.append(content["image_url"]["url"])
+ else:
+ media_urls.append(content["image_url"])
+
+ # Extract Audio (Supports OpenAI's 'input_audio' AND custom 'audio' schemas)
+ elif content["type"] in ["input_audio", "audio"]:
+ audio_data = content.get("input_audio") or content.get(
+ "audio"
+ )
+ if audio_data:
+ fmt = audio_data.get("format", "wav")
+ data = audio_data.get("data", "")
+ # Standardize the output so `load_image` successfully base64-decodes the bytes
+ media_urls.append(f"data:audio/{fmt};base64,{data}")
+
+ return media_urls
+
+ def __call__(self, **kwargs):
+ """
+ Overrides the __call__ pipeline to dynamically intercept and enable Thinking Mode
+ by injecting the required control token seamlessly into the Jinja template.
+ Also performs state clearing for reliable multimodal (vision + audio) support
+ across multiple chat turns, matching other vision handlers like Qwen25VL.
+ """
+ enable_thinking = kwargs.get("enable_thinking", False)
+ original_format = self.CHAT_FORMAT
+
+ if enable_thinking:
+ # Inject <|think|> into BOTH the initial system thought channel AND
+ # the assistant generation prompt so thinking starts the response turn.
+ # This follows Gemma 4 docs for triggering native thinking mode.
+ modified_format = original_format.replace(
+ "<|channel>thought\n", "<|channel>thought\n<|think|>\n"
+ ).replace(
+ "{% if add_generation_prompt %}\n<|turn>model\n{% endif %}",
+ "{% if add_generation_prompt %}\n<|turn>model\n<|think|>\n{% endif %}",
+ )
+ self.CHAT_FORMAT = modified_format
+
+ # Gemma requires a system block for the thought channel to exist.
+ # If the user hasn't provided one, we dynamically append a blank one.
+ messages = kwargs.get("messages", [])
+ if not any(m.get("role") == "system" for m in messages):
+ kwargs["messages"] = [{"role": "system", "content": ""}] + messages
+
+ # Clear state for multiple runs (critical for vision/audio + thinking in chat)
+ llama = kwargs.get("llama")
+ if llama is not None:
+ llama.reset()
+ if hasattr(llama, "_ctx") and llama._ctx is not None:
+ llama._ctx.kv_cache_clear()
+ llama.n_tokens = 0
+ if hasattr(llama, "input_ids"):
+ llama.input_ids.fill(0)
+
+ # Clear any handler state (e.g. cached embeds from previous multimodal turn)
+ if hasattr(self, "_last_image_embed"):
+ self._last_image_embed = None
+ self._last_image_hash = None
+
+ try:
+ result = super().__call__(**kwargs)
+ # Post-process non-streaming responses when thinking mode is enabled
+ # to provide clear structure: 'thinking' field (contains reasoning) + 'content' (final answer).
+ # Note: Since Gemma 4 outputs thinking + final answer in a single generation,
+ # 'thinking' currently holds the full generated text (including reasoning).
+ # Future: parse on model-specific end-of-thinking markers (e.g. <|end_think|>) if emitted.
+ if (
+ enable_thinking
+ and not kwargs.get("stream", False)
+ and isinstance(result, dict)
+ ):
+ for choice in result.get("choices", []):
+ if "message" in choice:
+ content = choice["message"].get("content", "") or ""
+ choice["message"]["thinking"] = (
+ content # structured access for test app
+ )
+ # content remains the complete response (thinking + final answer) for compatibility
+ return result
+ finally:
+ # Restore the original class format so future non-thinking calls don't leak state
+ self.CHAT_FORMAT = original_format
+
+
@register_chat_completion_handler("chatml-function-calling")
def chatml_function_calling(
llama: llama.Llama,