From e83c965c745ad1d4c49af7ceb114d412d12905df Mon Sep 17 00:00:00 2001 From: Holden <34213478+hodlen@users.noreply.github.com> Date: Sat, 23 Dec 2023 16:13:03 +0800 Subject: [PATCH] basic integration of PowerInfer --- .gitignore | 2 + .gitmodules | 3 ++ CMakeLists.txt | 50 +++++++++---------- .../high_level_api_inference.py | 2 +- llama_cpp/llama_cpp.py | 3 ++ vendor/PowerInfer | 1 + 6 files changed, 35 insertions(+), 26 deletions(-) create mode 160000 vendor/PowerInfer diff --git a/.gitignore b/.gitignore index 51f357200f..becdcd0839 100644 --- a/.gitignore +++ b/.gitignore @@ -176,3 +176,5 @@ cython_debug/ # downloaded model .bin files docker/open_llama/*.bin + +/.venv/** diff --git a/.gitmodules b/.gitmodules index 7edf0975dc..d1980cbb03 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "vendor/llama.cpp"] path = vendor/llama.cpp url = https://github.com/ggerganov/llama.cpp.git +[submodule "vendor/PowerInfer"] + path = vendor/PowerInfer + url = https://github.com/SJTU-IPADS/PowerInfer.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 658b71def2..4ab25daadd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,7 @@ if (LLAMA_BUILD) set(LLAMA_FMA "Off" CACHE BOOL "llama: enable FMA" FORCE) set(LLAMA_F16C "Off" CACHE BOOL "llama: enable F16C" FORCE) endif() - add_subdirectory(vendor/llama.cpp) + add_subdirectory(vendor/PowerInfer) install( TARGETS llama LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp @@ -44,28 +44,28 @@ if (LLAMA_BUILD) DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp ) - # Building llava - add_subdirectory(vendor/llama.cpp/examples/llava) - set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava") - # Set CUDA_ARCHITECTURES to OFF on windows - if (WIN32) - set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF) - endif() - install( - TARGETS llava_shared - LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp - RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp - ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp - FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp - RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp - ) - # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374 - install( - TARGETS llava_shared - LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp - RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp - ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp - FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp - RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp - ) + # # Building llava + # add_subdirectory(vendor/llama.cpp/examples/llava) + # set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava") + # # Set CUDA_ARCHITECTURES to OFF on windows + # if (WIN32) + # set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF) + # endif() + # install( + # TARGETS llava_shared + # LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp + # RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp + # ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp + # FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp + # RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp + # ) + # # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374 + # install( + # TARGETS llava_shared + # LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp + # RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp + # ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp + # FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp + # RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp + # ) endif() diff --git a/examples/high_level_api/high_level_api_inference.py b/examples/high_level_api/high_level_api_inference.py index e41f375774..96b1263b1c 100644 --- a/examples/high_level_api/high_level_api_inference.py +++ b/examples/high_level_api/high_level_api_inference.py @@ -11,7 +11,7 @@ output = llm( "Question: What are the names of the planets in the solar system? Answer: ", - max_tokens=48, + max_tokens=512, stop=["Q:", "\n"], echo=True, ) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index a4d21004f6..bf537a7cdf 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -305,12 +305,15 @@ class llama_model_params(Structure): _fields_ = [ ("n_gpu_layers", c_int32), ("main_gpu", c_int32), + ("vram_budget_gb", c_float), ("tensor_split", c_float_p), ("progress_callback", llama_progress_callback), ("progress_callback_user_data", c_void_p), ("vocab_only", c_bool), ("use_mmap", c_bool), ("use_mlock", c_bool), + ("reset_gpu_index", c_bool), + ("disable_gpu_index", c_bool), ] diff --git a/vendor/PowerInfer b/vendor/PowerInfer new file mode 160000 index 0000000000..9d726685d2 --- /dev/null +++ b/vendor/PowerInfer @@ -0,0 +1 @@ +Subproject commit 9d726685d299b669ffb6197f6b0e72b21ebf019c