diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py
index bbb58afc35..40f13b16c1 100644
--- a/llama_cpp/llama_types.py
+++ b/llama_cpp/llama_types.py
@@ -96,6 +96,8 @@ class CreateChatCompletionResponse(TypedDict):
     model: str
     choices: List["ChatCompletionResponseChoice"]
     usage: CompletionUsage
+    nonce: Optional[str]
+    s1: Optional[str]
 
 
 class ChatCompletionMessageToolCallChunkFunction(TypedDict):
@@ -143,6 +145,8 @@ class CreateChatCompletionStreamResponse(TypedDict):
     object: Literal["chat.completion.chunk"]
     created: int
     choices: List[ChatCompletionStreamResponseChoice]
+    nonce: Optional[str]
+    s1: Optional[str]
 
 
 class ChatCompletionFunctions(TypedDict):
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index cd3255176f..5030bca6bf 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -8,6 +8,7 @@
 from threading import Lock
 from functools import partial
 from typing import Iterator, List, Optional, Union, Dict
+from typing_extensions import TypedDict, Literal
 
 import llama_cpp
 
@@ -21,6 +22,7 @@
 from sse_starlette.sse import EventSourceResponse
 from starlette_context.plugins import RequestIdPlugin  # type: ignore
 from starlette_context.middleware import RawContextMiddleware
+from llama_cpp.server.util import get_device_info
 
 from llama_cpp.server.model import (
     LlamaProxy,
@@ -43,6 +45,11 @@
     DetokenizeInputResponse,
 )
 from llama_cpp.server.errors import RouteErrorHandler
+import hashlib
+from pathlib import Path
+
+MODEL_HASHS = []
+PREFIX = "/llmapi"
 
 
 router = APIRouter(route_class=RouteErrorHandler)
@@ -50,6 +57,20 @@
 _server_settings: Optional[ServerSettings] = None
 
 
+def calculate_sha256(model_settings):
+    """Return the SHA-256 hex digest of each configured model file, in order."""
+    digests = []
+    # Read in 1 MiB blocks so large model files are never held in memory whole.
+    blksize = 1024 * 1024
+    for model_setting in model_settings:
+        hash_sha256 = hashlib.sha256()
+        with open(model_setting.model, "rb") as f:
+            for chunk in iter(lambda: f.read(blksize), b""):
+                hash_sha256.update(chunk)
+        digests.append(hash_sha256.hexdigest())
+    return digests
+
+
 def set_server_settings(server_settings: ServerSettings):
     global _server_settings
     _server_settings = server_settings
@@ -148,6 +169,10 @@ def create_app(
 
     assert model_settings is not None
     set_llama_proxy(model_settings=model_settings)
+
+    global MODEL_HASHS
+    if not MODEL_HASHS:
+        MODEL_HASHS = calculate_sha256(model_settings)
 
     if server_settings.disable_ping_events:
         set_ping_message_factory(lambda: bytes())
@@ -225,7 +250,7 @@ async def authenticate(
 
 
 @router.post(
-    "/v1/completions",
+    "{}/v1/completions".format(PREFIX),
     summary="Completion",
     dependencies=[Depends(authenticate)],
     response_model=Union[
@@ -258,7 +283,7 @@ async def authenticate(
     tags=[openai_v1_tag],
 )
 @router.post(
-    "/v1/engines/copilot-codex/completions",
+    "{}/v1/engines/copilot-codex/completions".format(PREFIX),
     include_in_schema=False,
     dependencies=[Depends(authenticate)],
     tags=[openai_v1_tag],
@@ -282,7 +307,7 @@ async def create_completion(
 
     llama = llama_proxy(
         body.model
-        if request.url.path != "/v1/engines/copilot-codex/completions"
+        if request.url.path != "{}/v1/engines/copilot-codex/completions".format(PREFIX)
         else "copilot-codex"
     )
 
@@ -348,7 +373,7 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]:
 
 
 @router.post(
-    "/v1/embeddings",
+    "{}/v1/embeddings".format(PREFIX),
     summary="Embedding",
     dependencies=[Depends(authenticate)],
     tags=[openai_v1_tag],
@@ -364,7 +389,7 @@ async def create_embedding(
 
 
 @router.post(
-    "/v1/chat/completions",
+    "{}/v1/chat/completions".format(PREFIX),
     summary="Chat",
     dependencies=[Depends(authenticate)],
     response_model=Union[llama_cpp.ChatCompletion, str],
@@ -486,6 +511,7 @@ async def create_chat_completion(
         "logit_bias_type",
         "user",
         "min_tokens",
+        "challenge",
     }
     kwargs = body.model_dump(exclude=exclude)
     llama = llama_proxy(body.model)
@@ -508,6 +534,7 @@ async def create_chat_completion(
         else:
             kwargs["logits_processor"].extend(_min_tokens_logits_processor)
 
+    nonce, s1 = get_device_info(body.challenge)
     iterator_or_completion: Union[
         llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk]
     ] = await run_in_threadpool(llama.create_chat_completion, **kwargs)
@@ -520,6 +547,13 @@ async def create_chat_completion(
         # the iterator is valid and we can use it to stream the response.
         def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]:
+            # The stream is a generator and cannot take item assignment,
+            # so nonce/s1 are attached to each chunk before it is sent.
+            first_response["nonce"] = nonce
+            first_response["s1"] = str(s1)
             yield first_response
-            yield from iterator_or_completion
+            for chunk in iterator_or_completion:
+                chunk["nonce"] = nonce
+                chunk["s1"] = str(s1)
+                yield chunk
             exit_stack.close()
 
@@ -538,11 +572,31 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]:
         )
     else:
         exit_stack.close()
+        iterator_or_completion["nonce"] = nonce
+        iterator_or_completion["s1"] = str(s1)
         return iterator_or_completion
 
 
+class ModelData(TypedDict):
+    id: str
+    object: Literal["model"]
+    owned_by: str
+    permissions: List[str]
+    title: str
+    model_name: str
+    hash: str
+    sha256: str
+    filename: str
+    config: dict
+
+
+class ModelList(TypedDict):
+    object: Literal["list"]
+    data: List[ModelData]
+
+
 @router.get(
-    "/v1/models",
+    "{}/v1/models".format(PREFIX),
     summary="Models",
     dependencies=[Depends(authenticate)],
     tags=[openai_v1_tag],
@@ -550,17 +604,28 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]:
 async def get_models(
     llama_proxy: LlamaProxy = Depends(get_llama_proxy),
 ) -> ModelList:
-    return {
-        "object": "list",
-        "data": [
-            {
-                "id": model_alias,
-                "object": "model",
-                "owned_by": "me",
-                "permissions": [],
-            }
-            for model_alias in llama_proxy
-        ],
-    }
+    """List the served models with their file names and SHA-256 hashes."""
+    results = []
+    # NOTE(review): assumes llama_proxy iterates in the order MODEL_HASHS was built.
+    for index, alias in enumerate(llama_proxy):
+        model_setting = llama_proxy._model_settings_dict[alias]
+        file_name = model_setting.model
+        sha256 = MODEL_HASHS[index] if index < len(MODEL_HASHS) else ""
+        shorthash = sha256[:10] if sha256 else ""
+        results.append(
+            {
+                "id": model_setting.model_alias,
+                "object": "model",
+                "owned_by": "me",
+                "permissions": [],
+                "title": "{} [{}]".format(os.path.basename(file_name), shorthash),
+                "model_name": Path(file_name).stem,
+                "hash": shorthash,
+                "sha256": sha256,
+                "filename": file_name,
+                "config": {},
+            }
+        )
+    return {"object": "list", "data": results}
 
 
@@ -568,7 +633,7 @@ async def get_models(
 
 
 @router.post(
-    "/extras/tokenize",
+    "{}/extras/tokenize".format(PREFIX),
     summary="Tokenize",
     dependencies=[Depends(authenticate)],
     tags=[extras_tag],
@@ -583,7 +648,7 @@ async def tokenize(
 
 
 @router.post(
-    "/extras/tokenize/count",
+    "{}/extras/tokenize/count".format(PREFIX),
     summary="Tokenize Count",
     dependencies=[Depends(authenticate)],
     tags=[extras_tag],
@@ -598,7 +663,7 @@ async def count_query_tokens(
 
 
 @router.post(
-    "/extras/detokenize",
+    "{}/extras/detokenize".format(PREFIX),
     summary="Detokenize",
     dependencies=[Depends(authenticate)],
     tags=[extras_tag],
diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py
index fdd1644568..da97e988e3 100644
--- a/llama_cpp/server/types.py
+++ b/llama_cpp/server/types.py
@@ -232,6 +232,7 @@ class CreateChatCompletionRequest(BaseModel):
     frequency_penalty: Optional[float] = frequency_penalty_field
     logit_bias: Optional[Dict[str, float]] = Field(None)
     seed: Optional[int] = Field(None)
+    challenge: Optional[Union[str, List[str]]] = Field(None)
     response_format: Optional[llama_cpp.ChatCompletionRequestResponseFormat] = Field(
         default=None,
     )
diff --git a/llama_cpp/server/util.py b/llama_cpp/server/util.py
new file mode 100644
index 0000000000..9542694040
--- /dev/null
+++ b/llama_cpp/server/util.py
@@ -0,0 +1,61 @@
+"""ctypes bindings for the native ``hash`` shared library."""
+
+import ctypes
+import json
+import os
+
+
+def _find_library_path():
+    """Return the per-platform path of the shared library to load."""
+    if os.name == "nt":
+        return os.path.join(os.path.dirname(__file__), "hash.dll")
+    if os.name == "posix":
+        if os.uname().machine == "x86_64":
+            return "/usr/lib/x86_64-linux-gnu/libhash.so"
+        return "/usr/lib/aarch64-linux-gnu/libhash.so"
+    # Fail with a clear message instead of a NameError on lib_path below.
+    raise OSError("no hash library path known for os.name={!r}".format(os.name))
+
+
+lib_path = _find_library_path()
+rust_lib = ctypes.CDLL(lib_path)
+rust_lib.device_info.argtypes = [ctypes.c_char_p]
+rust_lib.device_info.restype = ctypes.c_char_p
+
+
+def version():
+    """Return the result of the library's ``version`` entry point."""
+    # NOTE(review): no restype is declared, so ctypes reads the result as a C int.
+    return rust_lib.version()
+
+
+def device_info(key):
+    """Pass *key* to the library's ``device_info`` and return its raw reply.
+
+    A falsy key is sent as an empty byte string; ``str`` keys are UTF-8 encoded.
+    """
+    if not key:
+        key_bytes = b""
+    elif isinstance(key, bytes):
+        key_bytes = key
+    elif isinstance(key, str):
+        key_bytes = key.encode("utf-8")
+    else:
+        # e.g. a non-empty list-valued ``challenge``; previously an AttributeError.
+        raise TypeError("key must be str or bytes, not {}".format(type(key).__name__))
+    return rust_lib.device_info(key_bytes)
+
+
+def get_device_info(key="123"):
+    """Return ``(nonce, s1)`` of the first device in the library's JSON reply."""
+    raw = device_info(key)
+    if raw is None:
+        # ctypes maps a NULL ``c_char_p`` result to None.
+        raise RuntimeError("device_info returned no data")
+    first_device = json.loads(raw)["devices"][0]
+    return (first_device["nonce"], first_device["s1"])
+
+
+if __name__ == "__main__":
+    key = "AMSMgPqkDGFPANDuJ1MpUiG3N7fcoVyABakcfQixnLa3"
+    print(device_info(key))