# Builds a CUDA 12.8 wheel of llama-cpp-python inside the official NVIDIA
# "devel" container, uploads it as a workflow artifact and, for tag pushes,
# attaches it to a GitHub release.
name: Build llama-cpp-python Wheel (CUDA 12.8)

on:
  push:
    tags:
      - 'v*'
  workflow_dispatch:

# The release step creates a release and uploads assets with GITHUB_TOKEN,
# which requires write access to repository contents. Without this block the
# step fails in repositories whose default token permission is read-only.
permissions:
  contents: write

jobs:
  build:
    runs-on: ubuntu-latest
    env:
      FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
    container:
      image: nvidia/cuda:12.8.0-devel-ubuntu22.04

    steps:
      # The CUDA image ships without git; checkout needs it to fetch submodules.
      - name: Install Git (required for submodules)
        run: |
          apt-get update
          apt-get install -y git

      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Install Python 3.11 + build tools
        env:
          # Keep apt (tzdata in particular) from prompting interactively.
          DEBIAN_FRONTEND: noninteractive
          TZ: Etc/UTC
        run: |
          apt-get update
          apt-get install -y software-properties-common tzdata
          add-apt-repository -y ppa:deadsnakes/ppa
          apt-get update
          # git is already installed by the first step, so it is not repeated here.
          apt-get install -y \
            python3.11 \
            python3.11-venv \
            python3.11-dev \
            cmake \
            build-essential \
            pkg-config \
            libopenblas-dev
          python3.11 -m ensurepip --upgrade
          python3.11 -m pip install --upgrade pip wheel setuptools

      - name: Use Python 3.11
        run: |
          update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1
          update-alternatives --set python /usr/bin/python3.11
          python --version

      - name: Set CUDA environment
        run: |
          echo "CUDA_HOME=/usr/local/cuda" >> "$GITHUB_ENV"
          # GITHUB_PATH prepends to PATH for later steps instead of freezing
          # this step's expanded PATH into GITHUB_ENV.
          echo "/usr/local/cuda/bin" >> "$GITHUB_PATH"
          echo "LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> "$GITHUB_ENV"

      - name: Verify CUDA installation
        run: |
          set -e
          ls -la /usr/local/cuda* || (echo "ERROR: CUDA not found!" && exit 1)
          which nvcc || (echo "ERROR: nvcc not found!" && exit 1)
          nvcc --version
          echo "CUDA verified successfully at: $(which nvcc)"

      - name: Build wheel with all architectures
        env:
          # Folded scalar: the flags below are joined into one space-separated string.
          CMAKE_ARGS: >
            -DGGML_CUDA=ON
            -DCMAKE_CUDA_ARCHITECTURES="75;80;86;89;90"
            -DGGML_NATIVE=OFF
            -DLLAMA_BUILD_EXAMPLES=OFF
            -DLLAMA_BUILD_TESTS=OFF
            -DLLAMA_BUILD_SERVER=OFF
          FORCE_CMAKE: 1
        run: |
          python -m pip wheel . --no-deps -w dist -v

      - name: Upload wheel as artifact
        if: success()
        uses: actions/upload-artifact@v4
        with:
          name: llama-cpp-python-cuda12.8-all-arch
          path: dist/*.whl
          retention-days: 30

      # Only tag pushes publish a release; manual runs stop at the artifact upload.
      - name: Create Release (on tag)
        if: startsWith(github.ref, 'refs/tags/')
        uses: softprops/action-gh-release@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          files: dist/*.whl
          draft: false
          prerelease: false
          generate_release_notes: true

+## Unofficial custom fork meant for https://github.com/Randy420Marsh/VisionLLMCaptioner and gemma 4 + # Python Bindings for [`llama.cpp`](https://github.com/ggerganov/llama.cpp) [![Documentation Status](https://readthedocs.org/projects/llama-cpp-python/badge/?version=latest)](https://llama-cpp-python.readthedocs.io/en/latest/?badge=latest) diff --git a/build.txt b/build.txt new file mode 100644 index 000000000..fcd141377 --- /dev/null +++ b/build.txt @@ -0,0 +1,16 @@ +git submodule update --init --recursive +export FORCE_CMAKE=1 && CMAKE_ARGS="-DGGML_CUDA=on" pip install -e . + +or build wheel: + +python -m build --wheel + +export LD_LIBRARY_PATH=$PWD/llama_cpp/lib:$LD_LIBRARY_PATH + +auditwheel show dist/llama_cpp_python-*.whl + +auditwheel repair dist/llama_cpp_python-*.whl + + +docker run --rm -v $PWD:/io quay.io/pypa/manylinux_2_28_x86_64 \ + bash -c "cd /io && pip wheel . -w dist && auditwheel repair dist/*.whl" diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 75c74b41f..13bb19bf6 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1986,6 +1986,8 @@ def create_chat_completion( logit_bias: Optional[Dict[int, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + reasoning_budget: Optional[int] = None, + enable_thinking: bool = False, # <--- FIXED SYNTAX ERROR HERE ) -> Union[ CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] ]: @@ -2018,6 +2020,7 @@ def create_chat_completion( logits_processor: A list of logits processors to use. grammar: A grammar to use. logit_bias: A logit bias to use. + reasoning_budget: The reasoning budget for thinking mode (Gemma 4). Controls the maximum number of tokens for thinking/reasoning. Returns: Generated chat completion or a stream of chat completion chunks. 
@@ -2057,6 +2060,8 @@ def create_chat_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + reasoning_budget=reasoning_budget, + enable_thinking=enable_thinking, # <--- PASSED TO HANDLER HERE ) def create_chat_completion_openai_v1( diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 1024fb85b..6708fcae7 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1,3 +1,4 @@ +# Drop-in replacement / standalone Gemma 4 chat handler for llama-cpp-python (April 2026) from __future__ import annotations import os @@ -607,6 +608,7 @@ def chat_completion_handler( logit_bias: Optional[Dict[str, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + reasoning_budget: Optional[int] = None, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -618,6 +620,7 @@ def chat_completion_handler( function_call=function_call, tools=tools, tool_choice=tool_choice, + reasoning_budget=reasoning_budget, **kwargs, ) prompt = llama.tokenize( @@ -1398,11 +1401,12 @@ def format_saiga( return ChatFormatterResponse(prompt=_prompt.strip()) -# Chat format for Google's Gemma models, see more details and available models: +# Chat format for Google's Gemma models (Gemma 2 and Gemma 3), see more details and available models: # https://huggingface.co/collections/google/gemma-release-65d5efbccdbb8c4202ec078b @register_chat_format("gemma") def format_gemma( messages: List[llama_types.ChatCompletionRequestMessage], + reasoning_budget: Optional[int] = None, **kwargs: Any, ) -> ChatFormatterResponse: system_message = _get_system_message(messages) @@ -1418,6 +1422,80 @@ def format_gemma( return ChatFormatterResponse(prompt=_prompt, stop=_sep) +# Chat format for Google's Gemma 4 models, see more details: +# https://huggingface.co/google/gemma-4-E2B-it +# https://ai.google.dev/gemma/docs/core/prompt-structure +# Gemma 4 introduces new special tokens and native 
# NOTE(review): the closing control tokens used below ("<bos>", "<turn|>",
# "<channel|>", "<tool_call|>", "<tool_response|>", "<start_of_turn>",
# "<end_of_turn>", "<eos>") were empty strings in the reviewed text, which
# produced empty turn delimiters and empty stop sequences. They are restored
# here from the token list in the docstrings and the public Gemma formatting
# docs; verify each one against the model's tokenizer before release.
@register_chat_format("gemma4")
def format_gemma4(
    messages: List[llama_types.ChatCompletionRequestMessage],
    reasoning_budget: Optional[int] = None,
    **kwargs: Any,
) -> ChatFormatterResponse:
    """Build a text-only Gemma 4 prompt from chat messages.

    Layout produced by this function::

        <bos>
        <|channel>thought\\n{system}<channel|>\\n      (only if a system message exists)
        <|turn>{role}\\n{content}<turn|>\\n            (one per non-system message)
        <|turn>model\\n                                (generation prompt)

    The ``assistant`` role is renamed to ``model``; every other role name is
    emitted unchanged. Multimodal content and tool calls are not handled
    here; use ``Gemma4ChatHandler`` for those.

    Args:
        messages: Chat completion messages in request order.
        reasoning_budget: Accepted for interface compatibility with the other
            formatters; this function does not read it.
        **kwargs: Ignored.

    Returns:
        A ``ChatFormatterResponse`` holding the prompt and the stop sequences.
    """
    _bos_token = "<bos>"
    _turn_start = "<|turn>"
    _turn_end = "<turn|>\n"
    _channel_start = "<|channel>"
    _channel_end = "<channel|>\n"

    def _as_text(content: Any) -> str:
        # ``content`` may be None (e.g. an assistant message that only carries
        # tool calls) or a list of typed parts. Formatting those with str()
        # would leak "None" or a Python list repr into the prompt.
        if content is None:
            return ""
        if isinstance(content, str):
            return content
        return "".join(
            part.get("text") or ""
            for part in content
            if isinstance(part, dict) and part.get("type") == "text"
        )

    # NOTE(review): "<bos>" is written literally into the prompt; confirm the
    # caller does not also add a BOS token when tokenizing.
    pieces = [_bos_token]

    # The system message is emitted once, up front, inside a "thought" channel.
    system_message = _get_system_message(messages)
    if system_message:
        pieces.append(f"{_channel_start}thought\n{system_message}{_channel_end}")

    for message in messages:
        role = message["role"]
        if role == "system":
            # Already emitted above.
            continue
        gemma_role = "model" if role == "assistant" else role
        text = _as_text(message.get("content"))
        pieces.append(f"{_turn_start}{gemma_role}\n{text}{_turn_end}")

    # Open the model turn so generation continues from here.
    pieces.append(f"{_turn_start}model\n")

    return ChatFormatterResponse(
        prompt="".join(pieces),
        # An empty stop string matches any output, so every entry must be a
        # real token. NOTE(review): confirm the exact end-of-sequence token.
        stop=[_turn_end, "<turn|>", "<eos>"],
    )


class GemmaChatHandler(Llava15ChatHandler):
    """Chat handler for Gemma-based multimodal models (e.g., PaliGemma, MedGemma).

    Gemma models use <start_of_turn>/<end_of_turn> control tokens instead of
    the LLaVA-style USER:/ASSISTANT: format. The text-only 'gemma' chat format
    is already registered (see format_gemma), but multimodal Gemma models that
    require a Llava-style vision pipeline need a dedicated handler so the
    correct chat template is applied when chat_handler takes precedence over
    chat_format in the resolution order.

    See: https://ai.google.dev/gemma/docs/formatting
    """

    DEFAULT_SYSTEM_MESSAGE = None  # Gemma models do not natively support a system role

    CHAT_FORMAT = (
        "{% for message in messages %}"
        # System messages are folded into a user turn (Gemma has no system role)
        "{% if message.role == 'system' %}"
        "<start_of_turn>user\n{{ message.content }}<end_of_turn>\n"
        "{% endif %}"
        # User message (handles both plain string and multimodal content list)
        "{% if message.role == 'user' %}"
        "<start_of_turn>user\n"
        "{% if message.content is string %}"
        "{{ message.content }}"
        "{% endif %}"
        "{% if message.content is iterable and message.content is not string %}"
        # Emit image URLs first
        "{% for content in message.content %}"
        "{% if content.type == 'image_url' and content.image_url is string %}"
        "{{ content.image_url }}"
        "{% endif %}"
        "{% if content.type == 'image_url' and content.image_url is mapping %}"
        "{{ content.image_url.url }}"
        "{% endif %}"
        "{% endfor %}"
        # Then emit the text parts
        "{% for content in message.content %}"
        "{% if content.type == 'text' %}"
        "{{ content.text }}"
        "{% endif %}"
        "{% endfor %}"
        "{% endif %}"
        "<end_of_turn>\n"
        "{% endif %}"
        # Assistant message
        "{% if message.role == 'assistant' and message.content is not none %}"
        "<start_of_turn>model\n{{ message.content }}<end_of_turn>\n"
        "{% endif %}"
        "{% endfor %}"
        # Generation prompt
        "{% if add_generation_prompt %}"
        "<start_of_turn>model\n"
        "{% endif %}"
    )


class MultimodalGemmaChatHandler(Llava15ChatHandler):
    """Gemma multimodal handler whose template emits only user and assistant turns.

    Messages with any other role (including ``system``) produce no output in
    this template. Use ``GemmaChatHandler`` when system messages must be kept.
    """

    DEFAULT_SYSTEM_MESSAGE: Optional[str] = None

    CHAT_FORMAT = (
        "{% for message in messages %}"
        "{% if message.role == 'user' %}"
        "<start_of_turn>user\n"
        "{% if message.content is string %}"
        "{{ message.content }}"
        "{% endif %}"
        # Strings are iterable in Jinja; exclude them so a plain-string message
        # is not also walked character by character (matches GemmaChatHandler).
        "{% if message.content is iterable and message.content is not string %}"
        "{% for content in message.content %}"
        "{% if content.type == 'image_url' and content.image_url is string %}"
        "{{ content.image_url }}"
        "{% endif %}"
        "{% if content.type == 'image_url' and content.image_url is mapping %}"
        "{{ content.image_url.url }}"
        "{% endif %}"
        "{% endfor %}"
        "{% for content in message.content %}"
        "{% if content.type == 'text' %}"
        "{{ content.text }}"
        "{% endif %}"
        "{% endfor %}"
        "{% endif %}"
        "<end_of_turn>\n"
        "{% endif %}"
        "{% if message.role == 'assistant' and message.content is not none %}"
        "<start_of_turn>model\n"
        "{{ message.content }}<end_of_turn>\n"
        "{% endif %}"
        "{% endfor %}"
        "{% if add_generation_prompt %}"
        "<start_of_turn>model\n"
        "{% endif %}"
    )


class Gemma4ChatHandler(Llava15ChatHandler):
    """Chat handler for Gemma 4 models with image, audio and tool-call templating.

    Control tokens used by the template:
    - <|turn> / <turn|>: start / end of a turn
    - <|channel> / <channel|>: start / end of a channel (used for the system message)
    - <|think|>: thinking-mode indicator (injected by ``__call__`` on request)
    - <|tool_call> / <tool_call|>: start / end of a tool call
    - <|tool_response> / <tool_response|>: start / end of a tool response
    """

    DEFAULT_SYSTEM_MESSAGE: Optional[str] = None

    CHAT_FORMAT = (
        "{% for message in messages %}"
        # 1. System messages go in a thought channel
        "{% if message.role == 'system' %}"
        "<|channel>thought\n{{ message.content }}<channel|>\n"
        "{% endif %}"
        # 2. User message (plain string or a list of typed parts)
        "{% if message.role == 'user' %}"
        "<|turn>user\n"
        "{% if message.content is string %}"
        "{{ message.content }}"
        "{% endif %}"
        "{% if message.content is iterable and message.content is not string %}"
        # Media first (images and audio). The audio strings must be identical
        # to the ones get_image_urls() builds, including the 'wav' fallback.
        "{% for content in message.content %}"
        "{% if content.type == 'image_url' %}"
        "{% if content.image_url is string %}{{ content.image_url }}{% else %}{{ content.image_url.url }}{% endif %}"
        "{% elif content.type == 'input_audio' %}"
        "data:audio/{{ content.input_audio['format'] | default('wav') }};base64,{{ content.input_audio['data'] }}"
        "{% elif content.type == 'audio' %}"
        "data:audio/{{ content.audio['format'] | default('wav') }};base64,{{ content.audio['data'] }}"
        "{% endif %}"
        "{% endfor %}"
        # Then the text parts
        "{% for content in message.content %}"
        "{% if content.type == 'text' %}{{ content.text }}{% endif %}"
        "{% endfor %}"
        "{% endif %}"
        "<turn|>\n"
        "{% endif %}"
        # 3. Assistant message
        "{% if message.role == 'assistant' and message.content is not none %}"
        "<|turn>model\n{{ message.content }}<turn|>\n"
        "{% endif %}"
        # 4. Tool calls issued by the assistant
        "{% if message.role == 'assistant' and message.tool_calls %}"
        "<|turn>model\n"
        "{% for tool_call in message.tool_calls %}"
        "<|tool_call>call:{{ tool_call.function.name }}{{ tool_call.function.arguments }}<tool_call|>\n"
        "{% endfor %}"
        "<turn|>\n"
        "{% endif %}"
        # 5. Tool responses
        "{% if message.role == 'tool' %}"
        "<|tool_response>response:{{ message.name }}{{ message.content }}<tool_response|>\n"
        "{% endif %}"
        "{% endfor %}"
        # 6. Generation prompt
        "{% if add_generation_prompt %}"
        "<|turn>model\n"
        "{% endif %}"
    )

    @staticmethod
    def get_image_urls(
        messages: List[llama_types.ChatCompletionRequestMessage],
    ) -> List[str]:
        """Collect image URLs and audio data URLs from user messages.

        Overrides the base-class method so that audio parts (``input_audio``
        or ``audio``) are returned alongside images, each encoded as
        ``data:audio/<format>;base64,<data>``. The strings are built exactly
        as CHAT_FORMAT renders them.

        Args:
            messages: Chat completion messages in request order.

        Returns:
            Media URL strings in the order they appear in the messages.
        """
        media_urls: List[str] = []
        for message in messages:
            if message["role"] != "user":
                continue
            parts = message.get("content")
            # Plain-string (or missing) content carries no media parts.
            if not parts or isinstance(parts, str):
                continue
            for part in parts:
                if not isinstance(part, dict) or "type" not in part:
                    continue
                part_type = part["type"]
                if part_type == "image_url":
                    image_url = part["image_url"]
                    if isinstance(image_url, dict) and "url" in image_url:
                        media_urls.append(image_url["url"])
                    else:
                        media_urls.append(image_url)
                elif part_type in ("input_audio", "audio"):
                    # Supports both the 'input_audio' and the 'audio' key.
                    audio_data = part.get("input_audio") or part.get("audio")
                    if isinstance(audio_data, dict) and audio_data:
                        fmt = audio_data.get("format", "wav")
                        data = audio_data.get("data", "")
                        media_urls.append(f"data:audio/{fmt};base64,{data}")
        return media_urls

    def __call__(self, **kwargs):
        """Run the chat pipeline, optionally in thinking mode.

        When ``enable_thinking`` is truthy, ``<|think|>`` is inserted at the
        start of the system thought channel, and an empty system message is
        prepended if the conversation has none so that the channel is rendered.
        Before delegating to the base handler, the llama context and any
        cached media embedding on this handler are reset.

        For non-streaming dict results in thinking mode, each choice's message
        gets a ``thinking`` key holding the same text as ``content``; the
        reasoning is not separated from the final answer.

        Args:
            **kwargs: Forwarded to ``Llava15ChatHandler.__call__``. Keys read
                here: ``enable_thinking``, ``messages``, ``llama``, ``stream``.

        Returns:
            Whatever the base handler returns.
        """
        enable_thinking = kwargs.get("enable_thinking", False)
        original_format = self.CHAT_FORMAT

        # Everything that runs after CHAT_FORMAT is swapped sits inside the
        # try block, so an exception during the state reset cannot leave the
        # thinking template installed for later calls.
        try:
            if enable_thinking:
                # NOTE(review): the previous code also tried to add <|think|>
                # after the generation prompt, but its search string contained
                # newlines that never occur in CHAT_FORMAT, so that replace was
                # always a no-op. The effective behaviour (system channel only)
                # is kept; confirm against the model's template before adding it.
                self.CHAT_FORMAT = original_format.replace(
                    "<|channel>thought\n", "<|channel>thought\n<|think|>\n"
                )

                # The thought channel is rendered only for a system message,
                # so supply a blank one when the caller gave none.
                messages = list(kwargs.get("messages") or [])
                if not any(m.get("role") == "system" for m in messages):
                    kwargs["messages"] = [
                        {"role": "system", "content": ""},
                        *messages,
                    ]

            # Reset model state so earlier turns do not carry over.
            llama = kwargs.get("llama")
            if llama is not None:
                llama.reset()
                if hasattr(llama, "_ctx") and llama._ctx is not None:
                    llama._ctx.kv_cache_clear()
                llama.n_tokens = 0
                if hasattr(llama, "input_ids"):
                    llama.input_ids.fill(0)

            # Drop any media embedding cached on this handler by a previous call.
            if hasattr(self, "_last_image_embed"):
                self._last_image_embed = None
                self._last_image_hash = None

            result = super().__call__(**kwargs)

            if (
                enable_thinking
                and not kwargs.get("stream", False)
                and isinstance(result, dict)
            ):
                for choice in result.get("choices", []):
                    if "message" in choice:
                        content = choice["message"].get("content", "") or ""
                        # 'content' is left untouched for compatibility;
                        # 'thinking' mirrors it.
                        choice["message"]["thinking"] = content
            return result
        finally:
            # Restore the non-thinking template for subsequent calls.
            self.CHAT_FORMAT = original_format