From e83c965c745ad1d4c49af7ceb114d412d12905df Mon Sep 17 00:00:00 2001
From: Holden <34213478+hodlen@users.noreply.github.com>
Date: Sat, 23 Dec 2023 16:13:03 +0800
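Subject: [PATCH] basic integration of PowerInfer

Add PowerInfer as a git submodule under vendor/PowerInfer and build it in
place of vendor/llama.cpp. Stop building the llava targets. Add the
vram_budget_gb, reset_gpu_index and disable_gpu_index fields to the
llama_model_params ctypes structure. Also ignore /.venv/ and raise
max_tokens from 48 to 512 in the high-level API inference example.
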
---
 .gitignore                                    |  2 +
 .gitmodules                                   |  3 ++
 CMakeLists.txt                                | 50 +++++++++----------
 .../high_level_api_inference.py               |  2 +-
 llama_cpp/llama_cpp.py                        |  3 ++
 vendor/PowerInfer                             |  1 +
 6 files changed, 35 insertions(+), 26 deletions(-)
 create mode 160000 vendor/PowerInfer

diff --git a/.gitignore b/.gitignore
index 51f357200f..becdcd0839 100644
--- a/.gitignore
+++ b/.gitignore
@@ -176,3 +176,5 @@ cython_debug/
 
 # downloaded model .bin files
 docker/open_llama/*.bin
+
+/.venv/**
diff --git a/.gitmodules b/.gitmodules
index 7edf0975dc..d1980cbb03 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "vendor/llama.cpp"]
 	path = vendor/llama.cpp
 	url = https://github.com/ggerganov/llama.cpp.git
+[submodule "vendor/PowerInfer"]
+	path = vendor/PowerInfer
+	url = https://github.com/SJTU-IPADS/PowerInfer.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 658b71def2..4ab25daadd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,7 +16,7 @@ if (LLAMA_BUILD)
         set(LLAMA_FMA "Off" CACHE BOOL "llama: enable FMA" FORCE)
         set(LLAMA_F16C "Off" CACHE BOOL "llama: enable F16C" FORCE)
     endif()
-    add_subdirectory(vendor/llama.cpp)
+    add_subdirectory(vendor/PowerInfer)
     install(
         TARGETS llama 
         LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
@@ -44,28 +44,5 @@ if (LLAMA_BUILD)
         DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
     )
 
-    # Building llava
-    add_subdirectory(vendor/llama.cpp/examples/llava)
-    set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
-    # Set CUDA_ARCHITECTURES to OFF on windows
-    if (WIN32)
-        set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF)
-    endif()
-    install(
-        TARGETS llava_shared
-        LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
-        RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
-        ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
-        FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
-        RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
-    )
-    # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374
-    install(
-        TARGETS llava_shared
-        LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
-        RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
-        ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
-        FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
-        RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
-    )
+    # The llava targets (vendor/llama.cpp/examples/llava) are not built here.
 endif()
diff --git a/examples/high_level_api/high_level_api_inference.py b/examples/high_level_api/high_level_api_inference.py
index e41f375774..96b1263b1c 100644
--- a/examples/high_level_api/high_level_api_inference.py
+++ b/examples/high_level_api/high_level_api_inference.py
@@ -11,7 +11,7 @@
 
 output = llm(
     "Question: What are the names of the planets in the solar system? Answer: ",
-    max_tokens=48,
+    max_tokens=512,
     stop=["Q:", "\n"],
     echo=True,
 )
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index a4d21004f6..bf537a7cdf 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -305,12 +305,15 @@ class llama_model_params(Structure):
     _fields_ = [
         ("n_gpu_layers", c_int32),
         ("main_gpu", c_int32),
+        ("vram_budget_gb", c_float),
         ("tensor_split", c_float_p),
         ("progress_callback", llama_progress_callback),
         ("progress_callback_user_data", c_void_p),
         ("vocab_only", c_bool),
         ("use_mmap", c_bool),
         ("use_mlock", c_bool),
+        ("reset_gpu_index", c_bool),
+        ("disable_gpu_index", c_bool),
     ]
 
 
diff --git a/vendor/PowerInfer b/vendor/PowerInfer
new file mode 160000
index 0000000000..9d726685d2
--- /dev/null
+++ b/vendor/PowerInfer
@@ -0,0 +1 @@
+Subproject commit 9d726685d299b669ffb6197f6b0e72b21ebf019c