From 034082bb8ef596ede18084ae3174c06f66cc63ec Mon Sep 17 00:00:00 2001 From: Guo Changzhu Date: Sat, 27 Apr 2024 13:42:51 +0800 Subject: [PATCH 1/4] server: add /llmapi route prefix and model SHA-256 metadata --- llama_cpp/server/app.py | 92 ++++++++++++++++++++++++++++++++++------- 1 file changed, 77 insertions(+), 15 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index b6ed9b1b6b..620ae97226 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -6,6 +6,7 @@ from threading import Lock from functools import partial from typing import Iterator, List, Optional, Union, Dict +from typing_extensions import TypedDict, Literal import llama_cpp @@ -41,6 +42,11 @@ DetokenizeInputResponse, ) from llama_cpp.server.errors import RouteErrorHandler +import hashlib +from pathlib import Path + +MODEL_HASHS = [] +PREFIX = "/llmapi" router = APIRouter(route_class=RouteErrorHandler) @@ -48,6 +54,20 @@ _server_settings: Optional[ServerSettings] = None +def calculate_sha256(model_settings): + hashs =[] + for model_setting in model_settings: + filename = model_setting.model + hash_sha256 = hashlib.sha256() + blksize = 1024 * 1024 + with open(filename, "rb") as f: + for chunk in iter(lambda: f.read(blksize), b""): + hash_sha256.update(chunk) + code = hash_sha256.hexdigest() + hashs.append(code) + return hashs + + def set_server_settings(server_settings: ServerSettings): global _server_settings _server_settings = server_settings @@ -144,6 +164,10 @@ def create_app( assert model_settings is not None set_llama_proxy(model_settings=model_settings) + + global MODEL_HASHS + if not MODEL_HASHS: + MODEL_HASHS = calculate_sha256(model_settings) if server_settings.disable_ping_events: set_ping_message_factory(lambda: bytes()) @@ -216,7 +240,7 @@ async def authenticate( @router.post( - "/v1/completions", + "{}/v1/completions".format(PREFIX), summary="Completion", dependencies=[Depends(authenticate)], response_model=Union[ @@ -249,7 +273,7 @@ async def authenticate( tags=[openai_v1_tag], ) 
@router.post( - "/v1/engines/copilot-codex/completions", + "{}/v1/engines/copilot-codex/completions".format(PREFIX), include_in_schema=False, dependencies=[Depends(authenticate)], tags=[openai_v1_tag], @@ -265,7 +289,7 @@ async def create_completion( llama = llama_proxy( body.model - if request.url.path != "/v1/engines/copilot-codex/completions" + if request.url.path != "{}/v1/engines/copilot-codex/completions".format(PREFIX) else "copilot-codex" ) @@ -319,7 +343,7 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: @router.post( - "/v1/embeddings", + "{}/v1/embeddings".format(PREFIX), summary="Embedding", dependencies=[Depends(authenticate)], tags=[openai_v1_tag], @@ -335,7 +359,7 @@ async def create_embedding( @router.post( - "/v1/chat/completions", + "{}/v1/chat/completions".format(PREFIX), summary="Chat", dependencies=[Depends(authenticate)], response_model=Union[llama_cpp.ChatCompletion, str], @@ -487,8 +511,26 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]: return iterator_or_completion +class ModelData(TypedDict): + id: str + object: Literal["model"] + owned_by: str + permissions: List[str] + title: str + model_name: str + hash: str + sha256: str + filename: str + config: dict + + +class ModelList(TypedDict): + object: Literal["list"] + data: List[ModelData] + + @router.get( - "/v1/models", + "{}/v1/models".format(PREFIX), summary="Models", dependencies=[Depends(authenticate)], tags=[openai_v1_tag], @@ -496,17 +538,37 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]: async def get_models( llama_proxy: LlamaProxy = Depends(get_llama_proxy), ) -> ModelList: - return { - "object": "list", - "data": [ - { + results = [] + index = 0 + for proxy in llama_proxy: + model_setting = llama_proxy._model_settings_dict[proxy] + file_name = model_setting.model + model_alias = model_setting.model_alias + sha256 = MODEL_HASHS[index] + index += 1 + if sha256: + shorthash = sha256[:10] + else: + shorthash = "" + title = "{} 
[{}]".format(os.path.basename(file_name), shorthash) + model_name = Path(file_name).stem + config = {} + result = { "id": model_alias, "object": "model", "owned_by": "me", "permissions": [], + "title": title, + "model_name": model_name, + "hash": shorthash, + "sha256": sha256, + "filename": file_name, + "config": config } - for model_alias in llama_proxy - ], + results.append(result) + return { + "object": "list", + "data": results, } @@ -514,7 +576,7 @@ async def get_models( @router.post( - "/extras/tokenize", + "{}/extras/tokenize".format(PREFIX), summary="Tokenize", dependencies=[Depends(authenticate)], tags=[extras_tag], @@ -529,7 +591,7 @@ async def tokenize( @router.post( - "/extras/tokenize/count", + "{}/extras/tokenize/count".format(PREFIX), summary="Tokenize Count", dependencies=[Depends(authenticate)], tags=[extras_tag], @@ -544,7 +606,7 @@ async def count_query_tokens( @router.post( - "/extras/detokenize", + "{}/extras/detokenize".format(PREFIX), summary="Detokenize", dependencies=[Depends(authenticate)], tags=[extras_tag], From ea86797cdf8eb5f6e45d94c99244572d0fd3e58e Mon Sep 17 00:00:00 2001 From: haria Date: Tue, 10 Sep 2024 16:12:09 +0800 Subject: [PATCH 2/4] server: add challenge request field and nonce/s1 response fields --- llama_cpp/llama_types.py | 5 ++++- llama_cpp/server/app.py | 7 +++++++ llama_cpp/server/types.py | 1 + llama_cpp/server/util.py | 31 +++++++++++++++++++++++++++++++ 4 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 llama_cpp/server/util.py diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index bbb58afc35..40f13b16c1 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -96,6 +96,8 @@ class CreateChatCompletionResponse(TypedDict): model: str choices: List["ChatCompletionResponseChoice"] usage: CompletionUsage + nonce: Optional[str] + s1: Optional[str] class ChatCompletionMessageToolCallChunkFunction(TypedDict): @@ -143,7 +145,8 @@ class CreateChatCompletionStreamResponse(TypedDict): object: Literal["chat.completion.chunk"] created: 
int choices: List[ChatCompletionStreamResponseChoice] - + nonce: Optional[str] + s1: Optional[str] class ChatCompletionFunctions(TypedDict): name: str diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ce17e8e196..5030bca6bf 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -22,6 +22,7 @@ from sse_starlette.sse import EventSourceResponse from starlette_context.plugins import RequestIdPlugin # type: ignore from starlette_context.middleware import RawContextMiddleware +from llama_cpp.server.util import get_device_info from llama_cpp.server.model import ( LlamaProxy, @@ -510,6 +511,7 @@ async def create_chat_completion( "logit_bias_type", "user", "min_tokens", + "challenge" } kwargs = body.model_dump(exclude=exclude) llama = llama_proxy(body.model) @@ -532,6 +534,7 @@ async def create_chat_completion( else: kwargs["logits_processor"].extend(_min_tokens_logits_processor) + nonce, s1 = get_device_info(body.challenge) iterator_or_completion: Union[ llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk] ] = await run_in_threadpool(llama.create_chat_completion, **kwargs) @@ -544,6 +547,8 @@ async def create_chat_completion( # the iterator is valid and we can use it to stream the response. 
def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]: yield first_response + iterator_or_completion["nonce"] = nonce + iterator_or_completion["s1"] = str(s1) yield from iterator_or_completion exit_stack.close() @@ -562,6 +567,8 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]: ) else: exit_stack.close() + iterator_or_completion["nonce"] = nonce + iterator_or_completion["s1"] = str(s1) return iterator_or_completion diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index fdd1644568..da97e988e3 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -232,6 +232,7 @@ class CreateChatCompletionRequest(BaseModel): frequency_penalty: Optional[float] = frequency_penalty_field logit_bias: Optional[Dict[str, float]] = Field(None) seed: Optional[int] = Field(None) + challenge: Optional[Union[str, List[str]]] = Field(None) response_format: Optional[llama_cpp.ChatCompletionRequestResponseFormat] = Field( default=None, ) diff --git a/llama_cpp/server/util.py b/llama_cpp/server/util.py new file mode 100644 index 0000000000..0815bc2521 --- /dev/null +++ b/llama_cpp/server/util.py @@ -0,0 +1,31 @@ +import ctypes +import os + +rust_lib = ctypes.CDLL("/usr/lib/x86_64-linux-gnu/libhash.so") +rust_lib.device_info.argtypes = [ctypes.c_char_p] +rust_lib.device_info.restype = ctypes.c_char_p + +def version(): + return rust_lib.version() + +def device_info(key): + key_bytes = key.encode('utf-8') + return rust_lib.device_info(key_bytes) + +def get_device_info(key="123"): + device_infos = device_info(key) + import json + device_infos = json.loads(device_infos) + nonce = device_infos["devices"][0]["nonce"] + seed = device_infos["devices"][0]["s1"] + return (nonce, seed) + + +if __name__ == "__main__": + key = "AMSMgPqkDGFPANDuJ1MpUiG3N7fcoVyABakcfQixnLa3" + device_index = 0 + device_uuid = "30dcf980f95b736939b3da28170dc6f824a8901d456fc60da4c9156b4e4f8c20550081155abdbfeed01546846f0735731f565cddf5f7dc6d7804777fd9a796eb" + device_infos = 
device_info(key) + print(device_infos) + + From 62d87d7cea46bca31479f2d368c3889a126fb55a Mon Sep 17 00:00:00 2001 From: haria Date: Tue, 10 Sep 2024 16:36:35 +0800 Subject: [PATCH 3/4] util: pass empty key to device_info when key is unset --- llama_cpp/server/util.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/util.py b/llama_cpp/server/util.py index 0815bc2521..567c4e1364 100644 --- a/llama_cpp/server/util.py +++ b/llama_cpp/server/util.py @@ -9,7 +9,10 @@ def version(): return rust_lib.version() def device_info(key): - key_bytes = key.encode('utf-8') + if key: + key_bytes = key.encode('utf-8') + else: + key_bytes = b"" return rust_lib.device_info(key_bytes) def get_device_info(key="123"): From 1eb11dd9c42754ad1abf6e52f34872a4d3c41af4 Mon Sep 17 00:00:00 2001 From: haria Date: Thu, 12 Sep 2024 09:32:16 +0800 Subject: [PATCH 4/4] util: select libhash path by OS and architecture (fix arm64) --- llama_cpp/server/util.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/util.py b/llama_cpp/server/util.py index 567c4e1364..9542694040 100644 --- a/llama_cpp/server/util.py +++ b/llama_cpp/server/util.py @@ -1,7 +1,14 @@ import ctypes import os - -rust_lib = ctypes.CDLL("/usr/lib/x86_64-linux-gnu/libhash.so") +if os.name == 'nt': + lib_path = os.path.join(os.path.dirname(__file__), "hash.dll") +elif os.name == 'posix': + if os.uname().machine=='x86_64': + lib_path = "/usr/lib/x86_64-linux-gnu/libhash.so" + else: + lib_path = "/usr/lib/aarch64-linux-gnu/libhash.so" + +rust_lib = ctypes.CDLL(lib_path) rust_lib.device_info.argtypes = [ctypes.c_char_p] rust_lib.device_info.restype = ctypes.c_char_p