mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-04 08:04:07 +00:00
Compare commits
18 Commits
lookahead
...
gg/fix-cpu
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
87f4102a70 | ||
|
|
e9b7a5cbd0 | ||
|
|
f815fe43d3 | ||
|
|
0dab8cd7cc | ||
|
|
bb03290c17 | ||
|
|
f3b269813f | ||
|
|
3e73d31d9c | ||
|
|
9656026b53 | ||
|
|
922754a8d6 | ||
|
|
22da05536f | ||
|
|
1ddb52ec38 | ||
|
|
f837c3a992 | ||
|
|
3014b5415d | ||
|
|
04814e718e | ||
|
|
af19d35734 | ||
|
|
e9c13ff781 | ||
|
|
8a052c131e | ||
|
|
189d68446e |
11
.github/workflows/build.yml
vendored
11
.github/workflows/build.yml
vendored
@@ -498,6 +498,17 @@ jobs:
|
||||
path: |
|
||||
cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
ios-xcode-build:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Build Xcode project
|
||||
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
|
||||
|
||||
|
||||
# freeBSD-latest:
|
||||
# runs-on: macos-12
|
||||
# steps:
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -47,6 +47,7 @@ models-mnt
|
||||
/libllama.so
|
||||
/llama-bench
|
||||
/llava-cli
|
||||
/lookahead
|
||||
/main
|
||||
/metal
|
||||
/perplexity
|
||||
|
||||
5
Makefile
5
Makefile
@@ -2,7 +2,7 @@
|
||||
BUILD_TARGETS = \
|
||||
main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
||||
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
|
||||
speculative infill tokenize benchmark-matmult parallel finetune export-lora tests/test-c.o
|
||||
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead tests/test-c.o
|
||||
|
||||
# Binaries only useful for tests
|
||||
TEST_TARGETS = \
|
||||
@@ -657,6 +657,9 @@ speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS)
|
||||
parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
ifdef LLAMA_METAL
|
||||
metal: examples/metal/metal.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
@@ -10,6 +10,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
|
||||
### Hot topics
|
||||
|
||||
- Using `llama.cpp` with AWS instances: https://github.com/ggerganov/llama.cpp/discussions/4225
|
||||
- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
|
||||
- Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167
|
||||
|
||||
----
|
||||
@@ -114,6 +116,7 @@ as the main playground for developing new features for the [ggml](https://github
|
||||
- [nat/openplayground](https://github.com/nat/openplayground)
|
||||
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
|
||||
- [withcatai/catai](https://github.com/withcatai/catai)
|
||||
- [semperai/amica](https://github.com/semperai/amica)
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -59,7 +59,7 @@ class Model:
|
||||
from safetensors import safe_open
|
||||
ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
|
||||
else:
|
||||
ctx = contextlib.nullcontext(torch.load(self.dir_model / part_name, map_location="cpu"))
|
||||
ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
|
||||
|
||||
with ctx as model_part:
|
||||
for name in model_part.keys():
|
||||
@@ -880,20 +880,21 @@ print(f"Loading model: {dir_model.name}")
|
||||
|
||||
hparams = Model.load_hparams(dir_model)
|
||||
|
||||
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
||||
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
|
||||
with torch.inference_mode():
|
||||
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
||||
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
|
||||
|
||||
print("Set model parameters")
|
||||
model_instance.set_gguf_parameters()
|
||||
print("Set model parameters")
|
||||
model_instance.set_gguf_parameters()
|
||||
|
||||
print("Set model tokenizer")
|
||||
model_instance.set_vocab()
|
||||
print("Set model tokenizer")
|
||||
model_instance.set_vocab()
|
||||
|
||||
if args.vocab_only:
|
||||
print(f"Exporting model vocab to '{fname_out}'")
|
||||
model_instance.write_vocab()
|
||||
else:
|
||||
print(f"Exporting model to '{fname_out}'")
|
||||
model_instance.write()
|
||||
if args.vocab_only:
|
||||
print(f"Exporting model vocab to '{fname_out}'")
|
||||
model_instance.write_vocab()
|
||||
else:
|
||||
print(f"Exporting model to '{fname_out}'")
|
||||
model_instance.write()
|
||||
|
||||
print(f"Model successfully exported to '{fname_out}'")
|
||||
print(f"Model successfully exported to '{fname_out}'")
|
||||
|
||||
0
convert.py
Normal file → Executable file
0
convert.py
Normal file → Executable file
@@ -32,6 +32,7 @@ else()
|
||||
add_subdirectory(save-load-state)
|
||||
add_subdirectory(simple)
|
||||
add_subdirectory(speculative)
|
||||
add_subdirectory(lookahead)
|
||||
add_subdirectory(train-text-from-scratch)
|
||||
if (LLAMA_METAL)
|
||||
add_subdirectory(metal)
|
||||
|
||||
1
examples/llama.swiftui/.gitignore
vendored
Normal file
1
examples/llama.swiftui/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
xcuserdata
|
||||
7
examples/llama.swiftui/README.md
Normal file
7
examples/llama.swiftui/README.md
Normal file
@@ -0,0 +1,7 @@
|
||||
# llama.swiftui
|
||||
|
||||
Local inference of llama.cpp on an iPhone.
|
||||
So far I only tested with starcoder 1B model, but it can most likely handle 7B models as well.
|
||||
|
||||
https://github.com/bachittle/llama.cpp/assets/39804642/e290827a-4edb-4093-9642-2a5e399ec545
|
||||
|
||||
176
examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
Normal file
176
examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
Normal file
@@ -0,0 +1,176 @@
|
||||
import Foundation
|
||||
|
||||
// import llama
|
||||
|
||||
enum LlamaError: Error {
|
||||
case couldNotInitializeContext
|
||||
}
|
||||
|
||||
actor LlamaContext {
|
||||
private var model: OpaquePointer
|
||||
private var context: OpaquePointer
|
||||
private var batch: llama_batch
|
||||
private var tokens_list: [llama_token]
|
||||
|
||||
var n_len: Int32 = 512
|
||||
var n_cur: Int32 = 0
|
||||
var n_decode: Int32 = 0
|
||||
|
||||
init(model: OpaquePointer, context: OpaquePointer) {
|
||||
self.model = model
|
||||
self.context = context
|
||||
self.tokens_list = []
|
||||
self.batch = llama_batch_init(512, 0, 1)
|
||||
}
|
||||
|
||||
deinit {
|
||||
llama_free(context)
|
||||
llama_free_model(model)
|
||||
llama_backend_free()
|
||||
}
|
||||
|
||||
static func createContext(path: String) throws -> LlamaContext {
|
||||
llama_backend_init(false)
|
||||
let model_params = llama_model_default_params()
|
||||
|
||||
let model = llama_load_model_from_file(path, model_params)
|
||||
guard let model else {
|
||||
print("Could not load model at \(path)")
|
||||
throw LlamaError.couldNotInitializeContext
|
||||
}
|
||||
var ctx_params = llama_context_default_params()
|
||||
ctx_params.seed = 1234
|
||||
ctx_params.n_ctx = 2048
|
||||
ctx_params.n_threads = 8
|
||||
ctx_params.n_threads_batch = 8
|
||||
|
||||
let context = llama_new_context_with_model(model, ctx_params)
|
||||
guard let context else {
|
||||
print("Could not load context!")
|
||||
throw LlamaError.couldNotInitializeContext
|
||||
}
|
||||
|
||||
return LlamaContext(model: model, context: context)
|
||||
}
|
||||
|
||||
func get_n_tokens() -> Int32 {
|
||||
return batch.n_tokens;
|
||||
}
|
||||
|
||||
func completion_init(text: String) {
|
||||
print("attempting to complete \"\(text)\"")
|
||||
|
||||
tokens_list = tokenize(text: text, add_bos: true)
|
||||
|
||||
let n_ctx = llama_n_ctx(context)
|
||||
let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)
|
||||
|
||||
print("\n n_len = \(n_len), n_ctx = \(n_ctx), n_kv_req = \(n_kv_req)")
|
||||
|
||||
if n_kv_req > n_ctx {
|
||||
print("error: n_kv_req > n_ctx, the required KV cache size is not big enough")
|
||||
}
|
||||
|
||||
for id in tokens_list {
|
||||
print(token_to_piece(token: id))
|
||||
}
|
||||
|
||||
// batch = llama_batch_init(512, 0) // done in init()
|
||||
batch.n_tokens = Int32(tokens_list.count)
|
||||
|
||||
for i1 in 0..<batch.n_tokens {
|
||||
let i = Int(i1)
|
||||
batch.token[i] = tokens_list[i]
|
||||
batch.pos[i] = i1
|
||||
batch.n_seq_id[Int(i)] = 1
|
||||
batch.seq_id[Int(i)]![0] = 0
|
||||
batch.logits[i] = 0
|
||||
}
|
||||
batch.logits[Int(batch.n_tokens) - 1] = 1 // true
|
||||
|
||||
if llama_decode(context, batch) != 0 {
|
||||
print("llama_decode() failed")
|
||||
}
|
||||
|
||||
n_cur = batch.n_tokens
|
||||
}
|
||||
|
||||
func completion_loop() -> String {
|
||||
var new_token_id: llama_token = 0
|
||||
|
||||
let n_vocab = llama_n_vocab(model)
|
||||
let logits = llama_get_logits_ith(context, batch.n_tokens - 1)
|
||||
|
||||
var candidates = Array<llama_token_data>()
|
||||
candidates.reserveCapacity(Int(n_vocab))
|
||||
|
||||
for token_id in 0..<n_vocab {
|
||||
candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
|
||||
}
|
||||
candidates.withUnsafeMutableBufferPointer() { buffer in
|
||||
var candidates_p = llama_token_data_array(data: buffer.baseAddress, size: buffer.count, sorted: false)
|
||||
|
||||
new_token_id = llama_sample_token_greedy(context, &candidates_p)
|
||||
}
|
||||
|
||||
if new_token_id == llama_token_eos(context) || n_cur == n_len {
|
||||
print("\n")
|
||||
return ""
|
||||
}
|
||||
|
||||
let new_token_str = token_to_piece(token: new_token_id)
|
||||
print(new_token_str)
|
||||
// tokens_list.append(new_token_id)
|
||||
|
||||
batch.n_tokens = 0
|
||||
|
||||
batch.token[Int(batch.n_tokens)] = new_token_id
|
||||
batch.pos[Int(batch.n_tokens)] = n_cur
|
||||
batch.n_seq_id[Int(batch.n_tokens)] = 1
|
||||
batch.seq_id[Int(batch.n_tokens)]![0] = 0
|
||||
batch.logits[Int(batch.n_tokens)] = 1 // true
|
||||
batch.n_tokens += 1
|
||||
|
||||
n_decode += 1
|
||||
|
||||
n_cur += 1
|
||||
|
||||
if llama_decode(context, batch) != 0 {
|
||||
print("failed to evaluate llama!")
|
||||
}
|
||||
|
||||
return new_token_str
|
||||
}
|
||||
|
||||
func clear() {
|
||||
tokens_list.removeAll()
|
||||
}
|
||||
|
||||
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
|
||||
let n_tokens = text.count + (add_bos ? 1 : 0)
|
||||
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
|
||||
let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, false)
|
||||
|
||||
var swiftTokens: [llama_token] = []
|
||||
for i in 0..<tokenCount {
|
||||
swiftTokens.append(tokens[Int(i)])
|
||||
}
|
||||
|
||||
tokens.deallocate()
|
||||
|
||||
return swiftTokens
|
||||
}
|
||||
|
||||
private func token_to_piece(token: llama_token) -> String {
|
||||
let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
|
||||
result.initialize(repeating: Int8(0), count: 8)
|
||||
|
||||
let _ = llama_token_to_piece(model, token, result, 8)
|
||||
|
||||
let resultStr = String(cString: result)
|
||||
|
||||
result.deallocate()
|
||||
|
||||
return resultStr
|
||||
}
|
||||
}
|
||||
5
examples/llama.swiftui/llama.cpp.swift/bridging-header.h
Normal file
5
examples/llama.swiftui/llama.cpp.swift/bridging-header.h
Normal file
@@ -0,0 +1,5 @@
|
||||
//
|
||||
// Use this file to import your target's public headers that you would like to expose to Swift.
|
||||
//
|
||||
|
||||
#import "llama.h"
|
||||
481
examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
Normal file
481
examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
Normal file
@@ -0,0 +1,481 @@
|
||||
// !$*UTF8*$!
|
||||
{
|
||||
archiveVersion = 1;
|
||||
classes = {
|
||||
};
|
||||
objectVersion = 56;
|
||||
objects = {
|
||||
|
||||
/* Begin PBXBuildFile section */
|
||||
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; };
|
||||
5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; };
|
||||
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
|
||||
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; };
|
||||
542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; };
|
||||
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; };
|
||||
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
|
||||
549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; };
|
||||
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
|
||||
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
|
||||
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; };
|
||||
8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; };
|
||||
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; };
|
||||
8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
|
||||
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
|
||||
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
|
||||
/* End PBXBuildFile section */
|
||||
|
||||
/* Begin PBXFileReference section */
|
||||
542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = "<group>"; };
|
||||
542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = "<group>"; };
|
||||
542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = "<group>"; };
|
||||
5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = "<group>"; };
|
||||
542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = "<group>"; };
|
||||
542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = "<group>"; };
|
||||
542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = "<group>"; };
|
||||
542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = "<group>"; };
|
||||
542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = "<group>"; };
|
||||
542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = "<group>"; };
|
||||
549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = "<group>"; };
|
||||
549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = "<group>"; };
|
||||
549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = "<group>"; };
|
||||
549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
|
||||
8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = "<group>"; };
|
||||
8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||
8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
|
||||
8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
|
||||
8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
|
||||
8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
|
||||
8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
|
||||
8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */ = {isa = PBXFileReference; lastKnownFileType = file; path = "llama-2-7b-chat.Q2_K.gguf"; sourceTree = "<group>"; };
|
||||
8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
|
||||
8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
|
||||
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
|
||||
/* End PBXFileReference section */
|
||||
|
||||
/* Begin PBXFrameworksBuildPhase section */
|
||||
8A1C83702AC328BD0096AF73 /* Frameworks */ = {
|
||||
isa = PBXFrameworksBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
|
||||
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
/* End PBXFrameworksBuildPhase section */
|
||||
|
||||
/* Begin PBXGroup section */
|
||||
8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
5423760A2B0D9C4B008E6A1C /* ggml-backend.c */,
|
||||
542376092B0D9C40008E6A1C /* ggml-backend.h */,
|
||||
542376062B0D9BEA008E6A1C /* ggml-quants.h */,
|
||||
542376072B0D9BFB008E6A1C /* ggml-quants.c */,
|
||||
549479C82AC9E10B00E0F78B /* ggml-metal.metal */,
|
||||
549479C62AC9E0F200E0F78B /* ggml-metal.h */,
|
||||
549479C52AC9E0F200E0F78B /* ggml-metal.m */,
|
||||
542EA09B2AC8723900A8AEE9 /* ggml.c */,
|
||||
542EA09C2AC8723900A8AEE9 /* ggml.h */,
|
||||
542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */,
|
||||
542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */,
|
||||
542EA0A12AC8729100A8AEE9 /* llama.cpp */,
|
||||
542EA0A22AC8729100A8AEE9 /* llama.h */,
|
||||
);
|
||||
name = llama.cpp;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A1C836A2AC328BD0096AF73 = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A08D1F62AC7383900FE6CD4 /* llama.cpp */,
|
||||
8A907F312AC7134E006146EA /* llama.cpp.swift */,
|
||||
8A3F84232AC4C891005E2EE8 /* models */,
|
||||
8A1C83752AC328BD0096AF73 /* llama.swiftui */,
|
||||
8A1C83742AC328BD0096AF73 /* Products */,
|
||||
8A39BE082AC7601000BFEB40 /* Frameworks */,
|
||||
);
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A1C83742AC328BD0096AF73 /* Products */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A1C83732AC328BD0096AF73 /* llama.swiftui.app */,
|
||||
);
|
||||
name = Products;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A1C83752AC328BD0096AF73 /* llama.swiftui */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A3F84102AC4BD85005E2EE8 /* Resources */,
|
||||
8A9F7C4B2AC332DC008AE1EA /* Models */,
|
||||
8A9F7C4A2AC332BF008AE1EA /* UI */,
|
||||
8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */,
|
||||
8A1C837A2AC328BE0096AF73 /* Assets.xcassets */,
|
||||
8A1C837C2AC328BE0096AF73 /* Preview Content */,
|
||||
);
|
||||
path = llama.swiftui;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A1C837C2AC328BE0096AF73 /* Preview Content */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */,
|
||||
);
|
||||
path = "Preview Content";
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A39BE082AC7601000BFEB40 /* Frameworks */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
549479CA2AC9E16000E0F78B /* Metal.framework */,
|
||||
8A39BE092AC7601000BFEB40 /* Accelerate.framework */,
|
||||
);
|
||||
name = Frameworks;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A3F84102AC4BD85005E2EE8 /* Resources */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A3F84112AC4BD8C005E2EE8 /* models */,
|
||||
);
|
||||
path = Resources;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A3F84112AC4BD8C005E2EE8 /* models */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */,
|
||||
);
|
||||
path = models;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A907F312AC7134E006146EA /* llama.cpp.swift */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */,
|
||||
8A907F322AC7134E006146EA /* LibLlama.swift */,
|
||||
);
|
||||
path = llama.cpp.swift;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A9F7C4A2AC332BF008AE1EA /* UI */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A1C83782AC328BD0096AF73 /* ContentView.swift */,
|
||||
);
|
||||
path = UI;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A9F7C4B2AC332DC008AE1EA /* Models */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */,
|
||||
);
|
||||
path = Models;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
/* End PBXGroup section */
|
||||
|
||||
/* Begin PBXNativeTarget section */
|
||||
8A1C83722AC328BD0096AF73 /* llama.swiftui */ = {
|
||||
isa = PBXNativeTarget;
|
||||
buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */;
|
||||
buildPhases = (
|
||||
8A1C836F2AC328BD0096AF73 /* Sources */,
|
||||
8A1C83702AC328BD0096AF73 /* Frameworks */,
|
||||
8A1C83712AC328BD0096AF73 /* Resources */,
|
||||
);
|
||||
buildRules = (
|
||||
);
|
||||
dependencies = (
|
||||
);
|
||||
name = llama.swiftui;
|
||||
packageProductDependencies = (
|
||||
);
|
||||
productName = llama.swiftui;
|
||||
productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
|
||||
productType = "com.apple.product-type.application";
|
||||
};
|
||||
/* End PBXNativeTarget section */
|
||||
|
||||
/* Begin PBXProject section */
|
||||
8A1C836B2AC328BD0096AF73 /* Project object */ = {
|
||||
isa = PBXProject;
|
||||
attributes = {
|
||||
BuildIndependentTargetsInParallel = 1;
|
||||
LastSwiftUpdateCheck = 1500;
|
||||
LastUpgradeCheck = 1500;
|
||||
TargetAttributes = {
|
||||
8A1C83722AC328BD0096AF73 = {
|
||||
CreatedOnToolsVersion = 15.0;
|
||||
LastSwiftMigration = 1500;
|
||||
};
|
||||
};
|
||||
};
|
||||
buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */;
|
||||
compatibilityVersion = "Xcode 14.0";
|
||||
developmentRegion = en;
|
||||
hasScannedForEncodings = 0;
|
||||
knownRegions = (
|
||||
en,
|
||||
Base,
|
||||
);
|
||||
mainGroup = 8A1C836A2AC328BD0096AF73;
|
||||
packageReferences = (
|
||||
);
|
||||
productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */;
|
||||
projectDirPath = "";
|
||||
projectRoot = "";
|
||||
targets = (
|
||||
8A1C83722AC328BD0096AF73 /* llama.swiftui */,
|
||||
);
|
||||
};
|
||||
/* End PBXProject section */
|
||||
|
||||
/* Begin PBXResourcesBuildPhase section */
|
||||
8A1C83712AC328BD0096AF73 /* Resources */ = {
|
||||
isa = PBXResourcesBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */,
|
||||
8A3F84242AC4C891005E2EE8 /* models in Resources */,
|
||||
8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */,
|
||||
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
/* End PBXResourcesBuildPhase section */
|
||||
|
||||
/* Begin PBXSourcesBuildPhase section */
|
||||
8A1C836F2AC328BD0096AF73 /* Sources */ = {
|
||||
isa = PBXSourcesBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */,
|
||||
549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */,
|
||||
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */,
|
||||
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
|
||||
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */,
|
||||
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */,
|
||||
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
|
||||
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
|
||||
542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */,
|
||||
5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
/* End PBXSourcesBuildPhase section */
|
||||
|
||||
/* Begin XCBuildConfiguration section */
|
||||
8A1C837F2AC328BE0096AF73 /* Debug */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ALWAYS_SEARCH_USER_PATHS = NO;
|
||||
ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
|
||||
CLANG_ANALYZER_NONNULL = YES;
|
||||
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
|
||||
CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CLANG_ENABLE_OBJC_ARC = YES;
|
||||
CLANG_ENABLE_OBJC_WEAK = YES;
|
||||
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
|
||||
CLANG_WARN_BOOL_CONVERSION = YES;
|
||||
CLANG_WARN_COMMA = YES;
|
||||
CLANG_WARN_CONSTANT_CONVERSION = YES;
|
||||
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
|
||||
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
|
||||
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
|
||||
CLANG_WARN_EMPTY_BODY = YES;
|
||||
CLANG_WARN_ENUM_CONVERSION = YES;
|
||||
CLANG_WARN_INFINITE_RECURSION = YES;
|
||||
CLANG_WARN_INT_CONVERSION = YES;
|
||||
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
|
||||
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
|
||||
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
|
||||
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
|
||||
CLANG_WARN_STRICT_PROTOTYPES = YES;
|
||||
CLANG_WARN_SUSPICIOUS_MOVE = YES;
|
||||
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
|
||||
CLANG_WARN_UNREACHABLE_CODE = YES;
|
||||
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
|
||||
COPY_PHASE_STRIP = NO;
|
||||
DEBUG_INFORMATION_FORMAT = dwarf;
|
||||
ENABLE_STRICT_OBJC_MSGSEND = YES;
|
||||
ENABLE_TESTABILITY = YES;
|
||||
ENABLE_USER_SCRIPT_SANDBOXING = YES;
|
||||
GCC_C_LANGUAGE_STANDARD = gnu17;
|
||||
GCC_DYNAMIC_NO_PIC = NO;
|
||||
GCC_NO_COMMON_BLOCKS = YES;
|
||||
GCC_OPTIMIZATION_LEVEL = 0;
|
||||
GCC_PREPROCESSOR_DEFINITIONS = (
|
||||
"DEBUG=1",
|
||||
"$(inherited)",
|
||||
);
|
||||
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
|
||||
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
|
||||
GCC_WARN_UNDECLARED_SELECTOR = YES;
|
||||
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
|
||||
GCC_WARN_UNUSED_FUNCTION = YES;
|
||||
GCC_WARN_UNUSED_VARIABLE = YES;
|
||||
IPHONEOS_DEPLOYMENT_TARGET = 17.0;
|
||||
LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
|
||||
MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
|
||||
MTL_FAST_MATH = YES;
|
||||
ONLY_ACTIVE_ARCH = YES;
|
||||
SDKROOT = iphoneos;
|
||||
SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)";
|
||||
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
|
||||
};
|
||||
name = Debug;
|
||||
};
|
||||
8A1C83802AC328BE0096AF73 /* Release */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ALWAYS_SEARCH_USER_PATHS = NO;
|
||||
ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
|
||||
CLANG_ANALYZER_NONNULL = YES;
|
||||
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
|
||||
CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CLANG_ENABLE_OBJC_ARC = YES;
|
||||
CLANG_ENABLE_OBJC_WEAK = YES;
|
||||
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
|
||||
CLANG_WARN_BOOL_CONVERSION = YES;
|
||||
CLANG_WARN_COMMA = YES;
|
||||
CLANG_WARN_CONSTANT_CONVERSION = YES;
|
||||
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
|
||||
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
|
||||
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
|
||||
CLANG_WARN_EMPTY_BODY = YES;
|
||||
CLANG_WARN_ENUM_CONVERSION = YES;
|
||||
CLANG_WARN_INFINITE_RECURSION = YES;
|
||||
CLANG_WARN_INT_CONVERSION = YES;
|
||||
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
|
||||
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
|
||||
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
|
||||
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
|
||||
CLANG_WARN_STRICT_PROTOTYPES = YES;
|
||||
CLANG_WARN_SUSPICIOUS_MOVE = YES;
|
||||
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
|
||||
CLANG_WARN_UNREACHABLE_CODE = YES;
|
||||
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
|
||||
COPY_PHASE_STRIP = NO;
|
||||
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
|
||||
ENABLE_NS_ASSERTIONS = NO;
|
||||
ENABLE_STRICT_OBJC_MSGSEND = YES;
|
||||
ENABLE_USER_SCRIPT_SANDBOXING = YES;
|
||||
GCC_C_LANGUAGE_STANDARD = gnu17;
|
||||
GCC_NO_COMMON_BLOCKS = YES;
|
||||
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
|
||||
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
|
||||
GCC_WARN_UNDECLARED_SELECTOR = YES;
|
||||
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
|
||||
GCC_WARN_UNUSED_FUNCTION = YES;
|
||||
GCC_WARN_UNUSED_VARIABLE = YES;
|
||||
IPHONEOS_DEPLOYMENT_TARGET = 17.0;
|
||||
LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
|
||||
MTL_ENABLE_DEBUG_INFO = NO;
|
||||
MTL_FAST_MATH = YES;
|
||||
SDKROOT = iphoneos;
|
||||
SWIFT_COMPILATION_MODE = wholemodule;
|
||||
VALIDATE_PRODUCT = YES;
|
||||
};
|
||||
name = Release;
|
||||
};
|
||||
8A1C83822AC328BE0096AF73 /* Debug */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
|
||||
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
CURRENT_PROJECT_VERSION = 1;
|
||||
DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
|
||||
DEVELOPMENT_TEAM = STLSG3FG8Q;
|
||||
ENABLE_PREVIEWS = YES;
|
||||
GENERATE_INFOPLIST_FILE = YES;
|
||||
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
|
||||
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
|
||||
INFOPLIST_KEY_UILaunchScreen_Generation = YES;
|
||||
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
|
||||
LD_RUNPATH_SEARCH_PATHS = (
|
||||
"$(inherited)",
|
||||
"@executable_path/Frameworks",
|
||||
);
|
||||
MARKETING_VERSION = 1.0;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
SWIFT_EMIT_LOC_STRINGS = YES;
|
||||
SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
|
||||
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
|
||||
SWIFT_VERSION = 5.0;
|
||||
TARGETED_DEVICE_FAMILY = "1,2";
|
||||
};
|
||||
name = Debug;
|
||||
};
|
||||
8A1C83832AC328BE0096AF73 /* Release */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
|
||||
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
CURRENT_PROJECT_VERSION = 1;
|
||||
DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
|
||||
DEVELOPMENT_TEAM = STLSG3FG8Q;
|
||||
ENABLE_PREVIEWS = YES;
|
||||
GENERATE_INFOPLIST_FILE = YES;
|
||||
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
|
||||
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
|
||||
INFOPLIST_KEY_UILaunchScreen_Generation = YES;
|
||||
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
|
||||
LD_RUNPATH_SEARCH_PATHS = (
|
||||
"$(inherited)",
|
||||
"@executable_path/Frameworks",
|
||||
);
|
||||
MARKETING_VERSION = 1.0;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
SWIFT_EMIT_LOC_STRINGS = YES;
|
||||
SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
|
||||
SWIFT_VERSION = 5.0;
|
||||
TARGETED_DEVICE_FAMILY = "1,2";
|
||||
};
|
||||
name = Release;
|
||||
};
|
||||
/* End XCBuildConfiguration section */
|
||||
|
||||
/* Begin XCConfigurationList section */
|
||||
8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = {
|
||||
isa = XCConfigurationList;
|
||||
buildConfigurations = (
|
||||
8A1C837F2AC328BE0096AF73 /* Debug */,
|
||||
8A1C83802AC328BE0096AF73 /* Release */,
|
||||
);
|
||||
defaultConfigurationIsVisible = 0;
|
||||
defaultConfigurationName = Release;
|
||||
};
|
||||
8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = {
|
||||
isa = XCConfigurationList;
|
||||
buildConfigurations = (
|
||||
8A1C83822AC328BE0096AF73 /* Debug */,
|
||||
8A1C83832AC328BE0096AF73 /* Release */,
|
||||
);
|
||||
defaultConfigurationIsVisible = 0;
|
||||
defaultConfigurationName = Release;
|
||||
};
|
||||
/* End XCConfigurationList section */
|
||||
};
|
||||
rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
|
||||
}
|
||||
7
examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/contents.xcworkspacedata
generated
Normal file
7
examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/contents.xcworkspacedata
generated
Normal file
@@ -0,0 +1,7 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Workspace
|
||||
version = "1.0">
|
||||
<FileRef
|
||||
location = "self:">
|
||||
</FileRef>
|
||||
</Workspace>
|
||||
@@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>IDEDidComputeMac32BitWarning</key>
|
||||
<true/>
|
||||
</dict>
|
||||
</plist>
|
||||
@@ -0,0 +1,11 @@
|
||||
{
|
||||
"colors" : [
|
||||
{
|
||||
"idiom" : "universal"
|
||||
}
|
||||
],
|
||||
"info" : {
|
||||
"author" : "xcode",
|
||||
"version" : 1
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
{
|
||||
"images" : [
|
||||
{
|
||||
"idiom" : "universal",
|
||||
"platform" : "ios",
|
||||
"size" : "1024x1024"
|
||||
}
|
||||
],
|
||||
"info" : {
|
||||
"author" : "xcode",
|
||||
"version" : 1
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"info" : {
|
||||
"author" : "xcode",
|
||||
"version" : 1
|
||||
}
|
||||
}
|
||||
45
examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
Normal file
45
examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
Normal file
@@ -0,0 +1,45 @@
|
||||
import Foundation
|
||||
|
||||
@MainActor
|
||||
class LlamaState: ObservableObject {
|
||||
@Published var messageLog = ""
|
||||
|
||||
private var llamaContext: LlamaContext?
|
||||
private var modelUrl: URL? {
|
||||
Bundle.main.url(forResource: "q8_0", withExtension: "gguf", subdirectory: "models")
|
||||
// Bundle.main.url(forResource: "llama-2-7b-chat", withExtension: "Q2_K.gguf", subdirectory: "models")
|
||||
}
|
||||
init() {
|
||||
do {
|
||||
try loadModel()
|
||||
} catch {
|
||||
messageLog += "Error!\n"
|
||||
}
|
||||
}
|
||||
|
||||
private func loadModel() throws {
|
||||
messageLog += "Loading model...\n"
|
||||
if let modelUrl {
|
||||
llamaContext = try LlamaContext.createContext(path: modelUrl.path())
|
||||
messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
|
||||
} else {
|
||||
messageLog += "Could not locate model\n"
|
||||
}
|
||||
}
|
||||
|
||||
func complete(text: String) async {
|
||||
guard let llamaContext else {
|
||||
return
|
||||
}
|
||||
messageLog += "Attempting to complete text...\n"
|
||||
await llamaContext.completion_init(text: text)
|
||||
messageLog += "\(text)"
|
||||
|
||||
while await llamaContext.n_cur <= llamaContext.n_len {
|
||||
let result = await llamaContext.completion_loop()
|
||||
messageLog += "\(result)"
|
||||
}
|
||||
await llamaContext.clear()
|
||||
messageLog += "\n\ndone\n"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"info" : {
|
||||
"author" : "xcode",
|
||||
"version" : 1
|
||||
}
|
||||
}
|
||||
0
examples/llama.swiftui/llama.swiftui/Resources/models/.gitignore
vendored
Normal file
0
examples/llama.swiftui/llama.swiftui/Resources/models/.gitignore
vendored
Normal file
42
examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
Normal file
42
examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
Normal file
@@ -0,0 +1,42 @@
|
||||
import SwiftUI
|
||||
|
||||
struct ContentView: View {
|
||||
@StateObject var llamaState = LlamaState()
|
||||
|
||||
@State private var multiLineText = ""
|
||||
|
||||
var body: some View {
|
||||
VStack {
|
||||
ScrollView(.vertical) {
|
||||
Text(llamaState.messageLog)
|
||||
}
|
||||
|
||||
TextEditor(text: $multiLineText)
|
||||
.frame(height: 200)
|
||||
.padding()
|
||||
.border(Color.gray, width: 0.5)
|
||||
Button(action: {
|
||||
sendText()
|
||||
}) {
|
||||
Text("Send")
|
||||
.padding()
|
||||
.background(Color.blue)
|
||||
.foregroundColor(.white)
|
||||
.cornerRadius(8)
|
||||
}
|
||||
}
|
||||
.padding()
|
||||
}
|
||||
|
||||
func sendText() {
|
||||
Task {
|
||||
await llamaState.complete(text: multiLineText)
|
||||
multiLineText = ""
|
||||
}
|
||||
}
|
||||
}
|
||||
/*
|
||||
#Preview {
|
||||
ContentView()
|
||||
}
|
||||
*/
|
||||
10
examples/llama.swiftui/llama.swiftui/llama_swiftuiApp.swift
Normal file
10
examples/llama.swiftui/llama.swiftui/llama_swiftuiApp.swift
Normal file
@@ -0,0 +1,10 @@
|
||||
import SwiftUI
|
||||
|
||||
@main
|
||||
struct llama_swiftuiApp: App {
|
||||
var body: some Scene {
|
||||
WindowGroup {
|
||||
ContentView()
|
||||
}
|
||||
}
|
||||
}
|
||||
5
examples/lookahead/CMakeLists.txt
Normal file
5
examples/lookahead/CMakeLists.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
set(TARGET lookahead)
|
||||
add_executable(${TARGET} lookahead.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
487
examples/lookahead/lookahead.cpp
Normal file
487
examples/lookahead/lookahead.cpp
Normal file
@@ -0,0 +1,487 @@
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
struct ngram_data {
|
||||
bool active = false;
|
||||
|
||||
llama_seq_id seq_id = -1;
|
||||
|
||||
std::vector<int> i_batch;
|
||||
|
||||
std::vector<llama_token> tokens;
|
||||
};
|
||||
|
||||
// n-gram container
|
||||
struct ngram_container {
|
||||
ngram_container(int n_vocab, int N, int G) {
|
||||
cnt.resize(n_vocab);
|
||||
head.resize(n_vocab);
|
||||
tokens.resize(n_vocab * G * (N - 1));
|
||||
}
|
||||
|
||||
int n_total = 0;
|
||||
|
||||
std::vector<int> cnt;
|
||||
std::vector<int> head;
|
||||
|
||||
// [n_vocab][G][N - 1]
|
||||
// for each token of the vocab, keep a ring-buffer of capacity G of n-grams of size N - 1
|
||||
std::vector<llama_token> tokens;
|
||||
};
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
const int W = 15; // lookahead window
|
||||
const int N = 5; // n-gram size
|
||||
const int G = 15; // max verification n-grams
|
||||
|
||||
const bool dump_kv_cache = params.dump_kv_cache;
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("lookahead", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
log_dump_cmdline(argc, argv);
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
// init llama.cpp
|
||||
llama_backend_init(params.numa);
|
||||
|
||||
llama_model * model = NULL;
|
||||
llama_context * ctx = NULL;
|
||||
|
||||
// load the target model
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
|
||||
// Tokenize the prompt
|
||||
const bool add_bos = llama_should_add_bos_token(model);
|
||||
LOG("add_bos tgt: %d\n", add_bos);
|
||||
|
||||
std::vector<llama_token> inp;
|
||||
std::vector<llama_token> all;
|
||||
|
||||
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
||||
all = inp;
|
||||
|
||||
const int max_context_size = llama_n_ctx(ctx);
|
||||
const int max_tokens_list_size = max_context_size - 4;
|
||||
|
||||
if ((int) inp.size() > max_tokens_list_size) {
|
||||
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
for (auto id : inp) {
|
||||
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
|
||||
}
|
||||
|
||||
fflush(stderr);
|
||||
|
||||
const int n_input = inp.size();
|
||||
|
||||
const auto t_enc_start = ggml_time_us();
|
||||
|
||||
// eval the prompt
|
||||
llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0, 0));
|
||||
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0));
|
||||
|
||||
for (int s = 1; s < W + G + 1; ++s) {
|
||||
llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
|
||||
}
|
||||
|
||||
const auto t_enc_end = ggml_time_us();
|
||||
|
||||
int n_predict = 0;
|
||||
int n_accept = 0;
|
||||
|
||||
int n_past = inp.size();
|
||||
|
||||
llama_token id = 0;
|
||||
|
||||
// used to determine end of generation
|
||||
bool has_eos = false;
|
||||
|
||||
// for each decoded batch, we have at most W + G + 1 distinct sequences:
|
||||
// seq_id == 0 : the current input token
|
||||
// seq_id [1, W] : tokens from the past N - 1 Jacobi iterations
|
||||
// seq_id [W + 1, W + G] : verification n-grams
|
||||
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
|
||||
|
||||
// target model sampling context
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
|
||||
|
||||
// verification n-grams
|
||||
std::vector<ngram_data> ngrams_cur(G);
|
||||
|
||||
// tokens for the past N - 1 Jacobi iterations
|
||||
std::vector<llama_token> tokens_j_prev(W);
|
||||
std::vector<std::vector<llama_token>> tokens_j(N - 1);
|
||||
for (int j = 0; j < N - 1; j++) {
|
||||
tokens_j[j].resize(W);
|
||||
|
||||
for (int i = 0; i < W; i++) {
|
||||
// there are different ways to init these tokens
|
||||
if (0) {
|
||||
// initialize randomly from the prompt tokens
|
||||
tokens_j[j][i] = all[1 + rand() % (all.size() - 1)];
|
||||
} else {
|
||||
// initialize with a sequence of increasing numbers
|
||||
tokens_j[j][i] = 100 + i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<llama_seq_id> seq_id_look;
|
||||
|
||||
// the input token belongs both to all sequences
|
||||
std::vector<llama_seq_id> seq_id_all(W + G + 1);
|
||||
for (int i = 0; i < W + G + 1; i++) {
|
||||
seq_id_all[i] = i;
|
||||
}
|
||||
|
||||
// here we keep adding new n-grams as we go
|
||||
ngram_container ngrams_observed(llama_n_vocab(model), N, G);
|
||||
|
||||
// debug
|
||||
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);
|
||||
|
||||
const auto t_dec_start = ggml_time_us();
|
||||
|
||||
// sample first token
|
||||
{
|
||||
id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
||||
|
||||
{
|
||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||
|
||||
printf("%s", token_str.c_str());
|
||||
fflush(stdout);
|
||||
}
|
||||
}
|
||||
|
||||
while (true) {
|
||||
// debug
|
||||
if (dump_kv_cache) {
|
||||
llama_kv_cache_view_update(ctx, &kvc_view);
|
||||
dump_kv_cache_view_seqs(kvc_view, 40);
|
||||
}
|
||||
|
||||
// build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
|
||||
//
|
||||
// Example for W = 5, N = 4, G = 2:
|
||||
// (I = input, L = lookahead, V = verification)
|
||||
//
|
||||
// Batch: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
|
||||
// T: -2 -2 -2 -2 -1 -1 -1 -1 -1 0 0 0 0 0 0
|
||||
// Info: I L L L L L L L L L L L L L L V V V V V V
|
||||
// Pos: 0 1 2 3 4 1 2 3 4 5 2 3 4 5 6 1 2 3 1 2 3 (+ n_past)
|
||||
// Logits: 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
||||
// ---------------------------------------------------------------------
|
||||
// Seq: 0
|
||||
// 1 1 1
|
||||
// 2 2 2 2
|
||||
// 3 3 3 3 3
|
||||
// 4 4 4 4 4 4
|
||||
// 5 5 5 5 5 5 5
|
||||
// 6 6 6 6
|
||||
// 7 7 7 7
|
||||
// ---------------------------------------------------------------------
|
||||
// | | | | | | | | | | |
|
||||
// V V V V V | | | | | |
|
||||
// j_tokens | | | | | |
|
||||
// V V V V V V
|
||||
// id
|
||||
{
|
||||
llama_batch_clear(batch);
|
||||
|
||||
// current token - first token of the first level
|
||||
llama_batch_add(batch, id, n_past, seq_id_all, true);
|
||||
|
||||
// verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation
|
||||
{
|
||||
const int g_cur = ngrams_observed.cnt[id];
|
||||
|
||||
ngrams_cur.resize(g_cur);
|
||||
for (int g = 0; g < g_cur; g++) {
|
||||
ngrams_cur[g].active = true;
|
||||
ngrams_cur[g].tokens.resize(N);
|
||||
ngrams_cur[g].i_batch.resize(N);
|
||||
ngrams_cur[g].seq_id = W + 1 + g;
|
||||
ngrams_cur[g].i_batch[0] = 0;
|
||||
ngrams_cur[g].tokens [0] = id;
|
||||
}
|
||||
|
||||
for (int j = 0; j < N - 1; j++) {
|
||||
for (int g = 0; g < g_cur; g++) {
|
||||
const int idx = id*(N - 1)*G + g*(N - 1);
|
||||
|
||||
const llama_token t = ngrams_observed.tokens[idx + j];
|
||||
|
||||
ngrams_cur[g].tokens [j + 1] = t;
|
||||
ngrams_cur[g].i_batch[j + 1] = batch.n_tokens;
|
||||
|
||||
llama_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// fill the remaining W - 1 tokens for the first level
|
||||
for (int i = 1; i < W; i++) {
|
||||
seq_id_look.resize(W - i);
|
||||
for (int j = 0; j < W - i; j++) {
|
||||
seq_id_look[j] = i + j + 1;
|
||||
}
|
||||
|
||||
llama_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
|
||||
}
|
||||
|
||||
// fill the rest of the levels
|
||||
for (int j = 1; j < N - 1; j++) {
|
||||
for (int i = 0; i < W; i++) {
|
||||
llama_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch) != 0) {
|
||||
fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int seq_id_best = 0;
|
||||
|
||||
for (int v = 0; v < N; ++v) {
|
||||
int i_batch = 0;
|
||||
|
||||
// if no active ngrams are left, it means the sampled token does not pass the verification
|
||||
if (v > 0) {
|
||||
for (int g = 0; g < (int) ngrams_cur.size(); g++) {
|
||||
if (ngrams_cur[g].active) {
|
||||
i_batch = ngrams_cur[g].i_batch[v];
|
||||
seq_id_best = ngrams_cur[g].seq_id;
|
||||
|
||||
++n_accept;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// no more matches -> create a new batch
|
||||
if (i_batch == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// sample the next token
|
||||
id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
||||
|
||||
// print
|
||||
{
|
||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||
|
||||
if (v == 0) {
|
||||
printf("%s", token_str.c_str());
|
||||
} else {
|
||||
// print light cyan
|
||||
printf("\033[0;96m%s\033[0m", token_str.c_str());
|
||||
}
|
||||
fflush(stdout);
|
||||
|
||||
if (id == llama_token_eos(model)) {
|
||||
has_eos = true;
|
||||
}
|
||||
|
||||
all.push_back(id);
|
||||
}
|
||||
|
||||
++n_predict;
|
||||
++n_past;
|
||||
|
||||
if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
|
||||
break;
|
||||
}
|
||||
|
||||
// verify across active n-grams
|
||||
for (int g = 0; g < (int) ngrams_cur.size(); g++) {
|
||||
if (ngrams_cur[g].active) {
|
||||
if (v == N - 1) {
|
||||
ngrams_cur[g].active = false;
|
||||
} else {
|
||||
if (id != ngrams_cur[g].tokens[v + 1]) {
|
||||
ngrams_cur[g].active = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// print known n-grams starting with token id (debug)
|
||||
if (0 && v == 0) {
|
||||
if (ngrams_observed.cnt[id] > 0) {
|
||||
printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
|
||||
}
|
||||
|
||||
for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
|
||||
printf(" - ngram %2d: ", i);
|
||||
|
||||
const int idx = id*(N - 1)*G + i*(N - 1);
|
||||
|
||||
for (int j = 0; j < N - 1; j++) {
|
||||
const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
|
||||
|
||||
printf("%s", token_str.c_str());
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
// update lookahead tokens
|
||||
{
|
||||
for (int i = 0; i < W; i++) {
|
||||
tokens_j_prev[i] = tokens_j[0][i];
|
||||
}
|
||||
|
||||
for (int j = 0; j < N - 2; j++) {
|
||||
tokens_j[j] = tokens_j[j + 1];
|
||||
}
|
||||
|
||||
if (v == 0) {
|
||||
// sample from the last level
|
||||
for (int i = 0; i < W; i++) {
|
||||
tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < W; i++) {
|
||||
// there are different ways to init these tokens
|
||||
if (0) {
|
||||
// random init
|
||||
tokens_j[N - 2][i] = all[1 + rand() % (all.size() - 1)];
|
||||
} else {
|
||||
// init from the previous level
|
||||
tokens_j[N - 2][i] = tokens_j[0][i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// update observed ngrams
|
||||
if (v == 0) {
|
||||
// the first token of the n-gram is determined by the index in the container so it is not stored
|
||||
std::vector<llama_token> ngram(N - 1);
|
||||
|
||||
// n-gram generation
|
||||
// ref: https://github.com/hao-ai-lab/LookaheadDecoding/issues/14#issuecomment-1826198518
|
||||
for (int f = 0; f < W; ++f) {
|
||||
const int ft = tokens_j_prev[f]; // first token of the n-gram
|
||||
|
||||
for (int j = 0; j < N - 1; ++j) {
|
||||
ngram[j] = tokens_j[j][f];
|
||||
}
|
||||
|
||||
// filter-out repeating n-grams
|
||||
{
|
||||
bool is_unique = true;
|
||||
|
||||
for (int k = 0; k < ngrams_observed.cnt[ft]; ++k) {
|
||||
const int idx = ft*(N - 1)*G + k*(N - 1);
|
||||
|
||||
bool is_match = true;
|
||||
for (int j = 0; j < N - 1; ++j) {
|
||||
if (ngrams_observed.tokens[idx + j] != ngram[j]) {
|
||||
is_match = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (is_match) {
|
||||
is_unique = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_unique) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
const int head = ngrams_observed.head[ft];
|
||||
const int idx = ft*(N - 1)*G + head*(N - 1);
|
||||
|
||||
for (int i = 0; i < N - 1; i++) {
|
||||
ngrams_observed.tokens[idx + i] = ngram[i];
|
||||
}
|
||||
|
||||
ngrams_observed.cnt[ft] = std::min(G, ngrams_observed.cnt[ft] + 1);
|
||||
ngrams_observed.head[ft] = (head + 1) % G;
|
||||
|
||||
ngrams_observed.n_total++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
|
||||
break;
|
||||
}
|
||||
|
||||
// KV cache management
|
||||
// if no verification token matched, we simply remove all cells from this batch -> no fragmentation
|
||||
llama_kv_cache_seq_rm(ctx, -1, n_past, -1);
|
||||
|
||||
if (seq_id_best != 0) {
|
||||
// if a verification token matched, we keep the best sequence and remove the rest
|
||||
// this leads to some KV cache fragmentation
|
||||
llama_kv_cache_seq_keep(ctx, seq_id_best);
|
||||
llama_kv_cache_seq_cp (ctx, seq_id_best, 0, -1, -1);
|
||||
llama_kv_cache_seq_rm (ctx, seq_id_best, -1, -1);
|
||||
|
||||
for (int s = 1; s < W + G + 1; ++s) {
|
||||
llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto t_dec_end = ggml_time_us();
|
||||
|
||||
LOG_TEE("\n\n");
|
||||
|
||||
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
|
||||
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("W = %2d\n", W);
|
||||
LOG_TEE("N = %2d\n", N);
|
||||
LOG_TEE("G = %2d\n", G);
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("n_predict = %d\n", n_predict);
|
||||
LOG_TEE("n_accept = %d\n", n_accept);
|
||||
|
||||
llama_print_timings(ctx);
|
||||
|
||||
llama_kv_cache_view_free(&kvc_view);
|
||||
llama_sampling_free(ctx_sampling);
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -234,6 +234,55 @@ node index.js
|
||||
|
||||
- **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
|
||||
|
||||
- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served.
|
||||
|
||||
*Options:*
|
||||
|
||||
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such are `mirostat` are supported.
|
||||
|
||||
*Examples:*
|
||||
|
||||
You can use either Python `openai` library with appropriate checkpoints:
|
||||
|
||||
```python
|
||||
import openai
|
||||
|
||||
client = openai.OpenAI(
|
||||
base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
|
||||
api_key = "sk-no-key-required"
|
||||
)
|
||||
|
||||
completion = client.chat.completions.create(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[
|
||||
{"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
|
||||
{"role": "user", "content": "Write a limerick about python exceptions"}
|
||||
]
|
||||
)
|
||||
|
||||
print(completion.choices[0].message)
|
||||
```
|
||||
... or raw HTTP requests:
|
||||
|
||||
```shell
|
||||
curl http://localhost:8080/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer no-key" \
|
||||
-d '{
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Write a limerick about python exceptions"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
## More examples
|
||||
|
||||
### Change system prompt on runtime
|
||||
|
||||
@@ -29,6 +29,8 @@
|
||||
#define SERVER_VERBOSE 1
|
||||
#endif
|
||||
|
||||
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
struct server_params
|
||||
@@ -59,6 +61,10 @@ static bool server_verbose = false;
|
||||
#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
|
||||
json oaicompat_completion_params_parse(const json &body);
|
||||
std::string format_chatml(std::vector<json> messages);
|
||||
|
||||
|
||||
//
|
||||
// base64 utils (TODO: move to common in the future)
|
||||
//
|
||||
@@ -378,6 +384,9 @@ struct llama_client_slot
|
||||
bool stopped_word = false;
|
||||
bool stopped_limit = false;
|
||||
|
||||
bool oaicompat = false;
|
||||
std::string oaicompat_model;
|
||||
|
||||
std::string stopping_word;
|
||||
|
||||
// sampling
|
||||
@@ -477,7 +486,7 @@ struct llama_client_slot
|
||||
};
|
||||
}
|
||||
|
||||
void print_timings() {
|
||||
void print_timings() const {
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
|
||||
@@ -609,6 +618,11 @@ struct llama_server_context
|
||||
|
||||
std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
|
||||
{
|
||||
// TODO: currently, we tokenize using special tokens by default
|
||||
// this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
|
||||
// but it's better compared to completely ignoring ChatML and other chat templates
|
||||
const bool TMP_FORCE_SPECIAL = true;
|
||||
|
||||
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
|
||||
// or the first element of the json_prompt array is a string.
|
||||
std::vector<llama_token> prompt_tokens;
|
||||
@@ -624,12 +638,12 @@ struct llama_server_context
|
||||
std::vector<llama_token> p;
|
||||
if (first)
|
||||
{
|
||||
p = ::llama_tokenize(ctx, s, add_bos);
|
||||
p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
||||
first = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
p = ::llama_tokenize(ctx, s, false);
|
||||
p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
|
||||
}
|
||||
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
|
||||
}
|
||||
@@ -646,7 +660,7 @@ struct llama_server_context
|
||||
else
|
||||
{
|
||||
auto s = json_prompt.template get<std::string>();
|
||||
prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
|
||||
prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
||||
}
|
||||
|
||||
return prompt_tokens;
|
||||
@@ -677,6 +691,14 @@ struct llama_server_context
|
||||
slot_params default_params;
|
||||
llama_sampling_params default_sparams;
|
||||
|
||||
if (data.count("__oaicompat") != 0) {
|
||||
slot->oaicompat = true;
|
||||
slot->oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
|
||||
} else {
|
||||
slot->oaicompat = false;
|
||||
slot->oaicompat_model = "";
|
||||
}
|
||||
|
||||
slot->params.stream = json_value(data, "stream", false);
|
||||
slot->params.cache_prompt = json_value(data, "cache_prompt", false);
|
||||
slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
|
||||
@@ -1170,6 +1192,12 @@ struct llama_server_context
|
||||
res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
|
||||
}
|
||||
|
||||
if (slot.oaicompat)
|
||||
{
|
||||
res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
|
||||
res.result_json["model"] = slot.oaicompat_model;
|
||||
}
|
||||
|
||||
queue_results.push_back(res);
|
||||
}
|
||||
|
||||
@@ -1217,6 +1245,12 @@ struct llama_server_context
|
||||
res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs);
|
||||
}
|
||||
|
||||
if (slot.oaicompat)
|
||||
{
|
||||
res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
|
||||
res.result_json["model"] = slot.oaicompat_model;
|
||||
}
|
||||
|
||||
queue_results.push_back(res);
|
||||
}
|
||||
|
||||
@@ -1257,7 +1291,7 @@ struct llama_server_context
|
||||
task_server task;
|
||||
task.id = id_gen++;
|
||||
task.target_id = 0;
|
||||
task.data = data;
|
||||
task.data = std::move(data);
|
||||
task.infill_mode = infill;
|
||||
task.embedding_mode = embedding;
|
||||
task.type = COMPLETION_TASK;
|
||||
@@ -2180,6 +2214,233 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static std::string random_string()
|
||||
{
|
||||
static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
|
||||
|
||||
std::random_device rd;
|
||||
std::mt19937 generator(rd());
|
||||
|
||||
std::string result(32, ' ');
|
||||
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
result[i] = str[generator() % str.size()];
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static std::string gen_chatcmplid()
|
||||
{
|
||||
std::stringstream chatcmplid;
|
||||
chatcmplid << "chatcmpl-" << random_string();
|
||||
return chatcmplid.str();
|
||||
}
|
||||
|
||||
std::string format_chatml(std::vector<json> messages)
|
||||
{
|
||||
std::ostringstream chatml_msgs;
|
||||
|
||||
for (auto it = messages.begin(); it != messages.end(); ++it) {
|
||||
chatml_msgs << "<|im_start|>"
|
||||
<< json_value(*it, "role", std::string("user")) << '\n';
|
||||
chatml_msgs << json_value(*it, "content", std::string(""))
|
||||
<< "<|im_end|>\n";
|
||||
}
|
||||
|
||||
chatml_msgs << "<|im_start|>assistant" << '\n';
|
||||
|
||||
return chatml_msgs.str();
|
||||
}
|
||||
|
||||
/* llama.cpp completion api semantics */
|
||||
json oaicompat_completion_params_parse(
|
||||
const json &body /* openai api json semantics */)
|
||||
{
|
||||
json llama_params;
|
||||
|
||||
llama_params["__oaicompat"] = true;
|
||||
|
||||
// Map OpenAI parameters to llama.cpp parameters
|
||||
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
|
||||
llama_params["temperature"] = json_value(body, "temperature", 0.8);
|
||||
llama_params["top_k"] = json_value(body, "top_k", 40);
|
||||
llama_params["top_p"] = json_value(body, "top_p", 0.95);
|
||||
llama_params["n_predict"] = json_value(body, "max_tokens", -1);
|
||||
llama_params["logit_bias"] = json_value(body, "logit_bias",json::object());
|
||||
llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
|
||||
llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
|
||||
llama_params["seed"] = json_value(body, "seed", 0);
|
||||
llama_params["stream"] = json_value(body, "stream", false);
|
||||
llama_params["mirostat"] = json_value(body, "mirostat", false);
|
||||
llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", 0.0);
|
||||
llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", 0.0);
|
||||
llama_params["penalize_nl"] = json_value(body, "penalize_nl", false);
|
||||
llama_params["typical_p"] = json_value(body, "typical_p", 0.0);
|
||||
llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", 0);
|
||||
llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
|
||||
llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0);
|
||||
|
||||
if (llama_params.count("grammar") != 0) {
|
||||
llama_params["grammar"] = json_value(body, "grammar", json::object());
|
||||
}
|
||||
|
||||
// Handle 'stop' field
|
||||
if (body["stop"].is_null()) {
|
||||
llama_params["stop"] = json::array({});
|
||||
} else if (body["stop"].is_string()) {
|
||||
llama_params["stop"] = json::array({body["stop"].get<std::string>()});
|
||||
} else {
|
||||
llama_params["stop"] = json_value(body, "stop", json::array());
|
||||
}
|
||||
|
||||
// Ensure there is ChatML-specific end sequence among stop words
|
||||
llama_params["stop"].push_back("<|im_end|>");
|
||||
|
||||
return llama_params;
|
||||
}
|
||||
|
||||
static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
|
||||
{
|
||||
json result = response.result_json;
|
||||
|
||||
bool stopped_word = result.count("stopped_word") != 0;
|
||||
bool stopped_eos = json_value(result, "stopped_eos", false);
|
||||
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
|
||||
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
|
||||
std::string content = json_value(result, "content", std::string(""));
|
||||
|
||||
std::string finish_reason = "length";
|
||||
if (stopped_word || stopped_eos) {
|
||||
finish_reason = "stop";
|
||||
}
|
||||
|
||||
json choices =
|
||||
streaming ? json::array({json{{"finish_reason", finish_reason},
|
||||
{"index", 0},
|
||||
{"delta", json::object()}}})
|
||||
: json::array({json{{"finish_reason", finish_reason},
|
||||
{"index", 0},
|
||||
{"message", json{{"content", content},
|
||||
{"role", "assistant"}}}}});
|
||||
|
||||
std::time_t t = std::time(0);
|
||||
|
||||
json res =
|
||||
json{{"choices", choices},
|
||||
{"created", t},
|
||||
{"model",
|
||||
json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
|
||||
{"object", streaming ? "chat.completion.chunk" : "chat.completion"},
|
||||
{"usage",
|
||||
json{{"completion_tokens", num_tokens_predicted},
|
||||
{"prompt_tokens", num_prompt_tokens},
|
||||
{"total_tokens", num_tokens_predicted + num_prompt_tokens}}},
|
||||
{"id", gen_chatcmplid()}};
|
||||
|
||||
if (server_verbose) {
|
||||
res["__verbose"] = result;
|
||||
}
|
||||
|
||||
if (result.contains("completion_probabilities")) {
|
||||
res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// return value is vector as there is one case where we might need to generate two responses
|
||||
static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
|
||||
json result = response.result_json;
|
||||
|
||||
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
|
||||
return std::vector<json>({response.result_json});
|
||||
}
|
||||
|
||||
bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
|
||||
std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
|
||||
|
||||
bool stopped_word = json_value(result, "stopped_word", false);
|
||||
bool stopped_eos = json_value(result, "stopped_eos", false);
|
||||
bool stopped_limit = json_value(result, "stopped_limit", false);
|
||||
std::string content = json_value(result, "content", std::string(""));
|
||||
|
||||
std::string finish_reason;
|
||||
if (stopped_word || stopped_eos) {
|
||||
finish_reason = "stop";
|
||||
}
|
||||
if (stopped_limit) {
|
||||
finish_reason = "length";
|
||||
}
|
||||
|
||||
std::time_t t = std::time(0);
|
||||
|
||||
json choices;
|
||||
|
||||
if (!finish_reason.empty()) {
|
||||
choices = json::array({json{{"finish_reason", finish_reason},
|
||||
{"index", 0},
|
||||
{"delta", json::object()}}});
|
||||
} else {
|
||||
if (first) {
|
||||
if (content.empty()) {
|
||||
choices = json::array({json{{"finish_reason", nullptr},
|
||||
{"index", 0},
|
||||
{"delta", json{{"role", "assistant"}}}}});
|
||||
} else {
|
||||
// We have to send this as two updates to conform to openai behavior
|
||||
json initial_ret = json{{"choices", json::array({json{
|
||||
{"finish_reason", nullptr},
|
||||
{"index", 0},
|
||||
{"delta", json{
|
||||
{"role", "assistant"}
|
||||
}}}})},
|
||||
{"created", t},
|
||||
{"id", gen_chatcmplid()},
|
||||
{"model", modelname},
|
||||
{"object", "chat.completion.chunk"}};
|
||||
|
||||
json second_ret = json{
|
||||
{"choices", json::array({json{{"finish_reason", nullptr},
|
||||
{"index", 0},
|
||||
{"delta", json{
|
||||
{"content", content}}}
|
||||
}})},
|
||||
{"created", t},
|
||||
{"id", gen_chatcmplid()},
|
||||
{"model", modelname},
|
||||
{"object", "chat.completion.chunk"}};
|
||||
|
||||
return std::vector<json>({initial_ret, second_ret});
|
||||
}
|
||||
} else {
|
||||
// Some idiosyncrasy in task processing logic makes several trailing calls
|
||||
// with empty content, we ignore these at the calee site.
|
||||
if (content.empty()) {
|
||||
return std::vector<json>({json::object()});
|
||||
}
|
||||
|
||||
choices = json::array({json{
|
||||
{"finish_reason", nullptr},
|
||||
{"index", 0},
|
||||
{"delta",
|
||||
json{
|
||||
{"content", content},
|
||||
}},
|
||||
}});
|
||||
}
|
||||
}
|
||||
|
||||
json ret = json{{"choices", choices},
|
||||
{"created", t},
|
||||
{"id", gen_chatcmplid()},
|
||||
{"model", modelname},
|
||||
{"object", "chat.completion.chunk"}};
|
||||
|
||||
return std::vector<json>({ret});
|
||||
}
|
||||
|
||||
static json format_partial_response(
|
||||
llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs
|
||||
) {
|
||||
@@ -2356,9 +2617,9 @@ int main(int argc, char **argv)
|
||||
task_result result = llama.next_result(task_id);
|
||||
if (!result.error) {
|
||||
const std::string str =
|
||||
"data: " +
|
||||
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
|
||||
"\n\n";
|
||||
"data: " +
|
||||
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
|
||||
"\n\n";
|
||||
LOG_VERBOSE("data stream", {
|
||||
{ "to_send", str }
|
||||
});
|
||||
@@ -2371,9 +2632,9 @@ int main(int argc, char **argv)
|
||||
}
|
||||
} else {
|
||||
const std::string str =
|
||||
"error: " +
|
||||
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
|
||||
"\n\n";
|
||||
"error: " +
|
||||
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
|
||||
"\n\n";
|
||||
LOG_VERBOSE("data stream", {
|
||||
{ "to_send", str }
|
||||
});
|
||||
@@ -2398,6 +2659,98 @@ int main(int argc, char **argv)
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
|
||||
svr.Get("/v1/models", [¶ms](const httplib::Request&, httplib::Response& res)
|
||||
{
|
||||
std::time_t t = std::time(0);
|
||||
|
||||
json models = {
|
||||
{"object", "list"},
|
||||
{"data", {
|
||||
{
|
||||
{"id", params.model_alias},
|
||||
{"object", "model"},
|
||||
{"created", t},
|
||||
{"owned_by", "llamacpp"}
|
||||
},
|
||||
}}
|
||||
};
|
||||
|
||||
res.set_content(models.dump(), "application/json");
|
||||
});
|
||||
|
||||
// TODO: add mount point without "/v1" prefix -- how?
|
||||
svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, httplib::Response &res)
|
||||
{
|
||||
json data = oaicompat_completion_params_parse(json::parse(req.body));
|
||||
|
||||
const int task_id = llama.request_completion(data, false, false);
|
||||
|
||||
if (!json_value(data, "stream", false)) {
|
||||
std::string completion_text;
|
||||
task_result result = llama.next_result(task_id);
|
||||
|
||||
if (!result.error && result.stop) {
|
||||
json oaicompat_result = format_final_response_oaicompat(data, result);
|
||||
|
||||
res.set_content(oaicompat_result.dump(-1, ' ', false,
|
||||
json::error_handler_t::replace),
|
||||
"application/json");
|
||||
} else {
|
||||
res.status = 500;
|
||||
res.set_content(result.result_json["content"], "text/plain");
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink &sink) {
|
||||
while (true) {
|
||||
task_result llama_result = llama.next_result(task_id);
|
||||
if (!llama_result.error) {
|
||||
std::vector<json> result_array = format_partial_response_oaicompat( llama_result);
|
||||
|
||||
for (auto it = result_array.begin(); it != result_array.end(); ++it)
|
||||
{
|
||||
if (!it->empty()) {
|
||||
const std::string str =
|
||||
"data: " +
|
||||
it->dump(-1, ' ', false, json::error_handler_t::replace) +
|
||||
"\n\n";
|
||||
LOG_VERBOSE("data stream", {{"to_send", str}});
|
||||
if (!sink.write(str.c_str(), str.size())) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (llama_result.stop) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
const std::string str =
|
||||
"error: " +
|
||||
llama_result.result_json.dump(-1, ' ', false,
|
||||
json::error_handler_t::replace) +
|
||||
"\n\n";
|
||||
LOG_VERBOSE("data stream", {{"to_send", str}});
|
||||
if (!sink.write(str.c_str(), str.size())) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
sink.done();
|
||||
return true;
|
||||
};
|
||||
|
||||
auto on_complete = [task_id, &llama](bool) {
|
||||
// cancel request
|
||||
llama.request_cancel(task_id);
|
||||
};
|
||||
|
||||
res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
|
||||
}
|
||||
});
|
||||
|
||||
svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
|
||||
{
|
||||
json data = json::parse(req.body);
|
||||
|
||||
37
ggml-cuda.cu
37
ggml-cuda.cu
@@ -4610,8 +4610,8 @@ static __global__ void rope(
|
||||
|
||||
template<typename T, bool has_pos>
|
||||
static __global__ void rope_neox(
|
||||
const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
|
||||
float ext_factor, float attn_factor, rope_corr_dims corr_dims
|
||||
const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
|
||||
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
|
||||
) {
|
||||
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
||||
|
||||
@@ -4620,23 +4620,25 @@ static __global__ void rope_neox(
|
||||
}
|
||||
|
||||
const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
const int i = row*ncols + col/2;
|
||||
const int ib = col / n_dims;
|
||||
const int ic = col % n_dims;
|
||||
|
||||
const int i = row*ncols + ib*n_dims + ic/2;
|
||||
const int i2 = row/p_delta_rows;
|
||||
|
||||
// simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
|
||||
const float cur_rot = -float(col)/ncols;
|
||||
float cur_rot = inv_ndims * ic - ib;
|
||||
|
||||
const int p = has_pos ? pos[i2] : 0;
|
||||
const float theta_base = p*powf(freq_base, cur_rot);
|
||||
const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);
|
||||
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
||||
|
||||
const float x0 = x[i + 0];
|
||||
const float x1 = x[i + ncols/2];
|
||||
const float x1 = x[i + n_dims/2];
|
||||
|
||||
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
||||
dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
|
||||
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
||||
dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
|
||||
}
|
||||
|
||||
static __global__ void rope_glm_f32(
|
||||
@@ -5739,20 +5741,26 @@ static void rope_cuda(
|
||||
|
||||
template<typename T>
|
||||
static void rope_neox_cuda(
|
||||
const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
|
||||
const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
|
||||
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
|
||||
) {
|
||||
GGML_ASSERT(ncols % 2 == 0);
|
||||
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
|
||||
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
||||
const dim3 block_nums(nrows, num_blocks_x, 1);
|
||||
|
||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||
const float inv_ndims = -1.0f / n_dims;
|
||||
|
||||
if (pos == nullptr) {
|
||||
rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
|
||||
x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
|
||||
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
|
||||
theta_scale, inv_ndims
|
||||
);
|
||||
} else {
|
||||
rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
|
||||
x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
|
||||
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
|
||||
theta_scale, inv_ndims
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -6707,15 +6715,14 @@ inline void ggml_cuda_op_rope(
|
||||
GGML_ASSERT(false);
|
||||
rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
|
||||
} else if (is_neox) {
|
||||
GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
rope_neox_cuda(
|
||||
(const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
||||
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
||||
attn_factor, corr_dims, main_stream
|
||||
);
|
||||
} else if (src0->type == GGML_TYPE_F16) {
|
||||
rope_neox_cuda(
|
||||
(const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
||||
(const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
||||
attn_factor, corr_dims, main_stream
|
||||
);
|
||||
} else {
|
||||
|
||||
@@ -1433,7 +1433,8 @@ void ggml_metal_graph_compute(
|
||||
const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
const int n_orig_ctx = ((int32_t *) dst->op_params)[3];
|
||||
// skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
|
||||
const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
|
||||
|
||||
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
||||
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
||||
|
||||
13
ggml.c
13
ggml.c
@@ -9373,7 +9373,7 @@ static bool ggml_compute_forward_mul_mat_use_blas(
|
||||
// TODO: find the optimal values for these
|
||||
if (ggml_is_contiguous(src0) &&
|
||||
ggml_is_contiguous(src1) &&
|
||||
src0->type == GGML_TYPE_F32 &&
|
||||
//src0->type == GGML_TYPE_F32 &&
|
||||
src1->type == GGML_TYPE_F32 &&
|
||||
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
|
||||
|
||||
@@ -15689,13 +15689,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||
{
|
||||
n_tasks = 1;
|
||||
} break;
|
||||
case GGML_OP_COUNT:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
printf("%s: op %s not implemented\n", __func__, ggml_op_name(node->op));
|
||||
fprintf(stderr, "%s: op not implemented: ", __func__);
|
||||
if (node->op < GGML_OP_COUNT) {
|
||||
fprintf(stderr, "%s\n", ggml_op_name(node->op));
|
||||
} else {
|
||||
fprintf(stderr, "%d\n", node->op);
|
||||
}
|
||||
GGML_ASSERT(false);
|
||||
} break;
|
||||
}
|
||||
|
||||
41
llama.cpp
41
llama.cpp
@@ -1118,6 +1118,12 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
|
||||
//
|
||||
|
||||
struct llama_state {
|
||||
llama_state() {
|
||||
#ifdef GGML_USE_METAL
|
||||
ggml_metal_log_set_callback(log_callback, log_callback_user_data);
|
||||
#endif
|
||||
}
|
||||
|
||||
// We save the log callback globally
|
||||
ggml_log_callback log_callback = llama_log_callback_default;
|
||||
void * log_callback_user_data = nullptr;
|
||||
@@ -3469,7 +3475,7 @@ static void llm_build_k_shift(
|
||||
struct ggml_cgraph * graph,
|
||||
llm_rope_type type,
|
||||
int64_t n_ctx,
|
||||
int64_t n_rot,
|
||||
int n_rot,
|
||||
float freq_base,
|
||||
float freq_scale,
|
||||
const llm_build_cb & cb) {
|
||||
@@ -3501,7 +3507,7 @@ static void llm_build_k_shift(
|
||||
// we rotate only the first n_rot dimensions
|
||||
ggml_rope_custom_inplace(ctx,
|
||||
ggml_view_3d(ctx, kv.k,
|
||||
n_rot, n_head_kv, n_ctx,
|
||||
n_embd_head, n_head_kv, n_ctx,
|
||||
ggml_element_size(kv.k)*n_embd_head,
|
||||
ggml_element_size(kv.k)*n_embd_gqa,
|
||||
ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il),
|
||||
@@ -5544,18 +5550,8 @@ static int llama_decode_internal(
|
||||
n_threads = std::min(4, n_threads);
|
||||
}
|
||||
|
||||
// If all tensors can be run on the GPU then using more than 1 thread is detrimental.
|
||||
const bool full_offload_supported =
|
||||
model.arch == LLM_ARCH_LLAMA ||
|
||||
model.arch == LLM_ARCH_BAICHUAN ||
|
||||
model.arch == LLM_ARCH_FALCON ||
|
||||
model.arch == LLM_ARCH_REFACT ||
|
||||
model.arch == LLM_ARCH_MPT ||
|
||||
model.arch == LLM_ARCH_STARCODER ||
|
||||
model.arch == LLM_ARCH_STABLELM;
|
||||
|
||||
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
|
||||
if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
|
||||
if (ggml_cpu_has_cublas() && fully_offloaded) {
|
||||
n_threads = 1;
|
||||
}
|
||||
|
||||
@@ -6414,10 +6410,13 @@ struct llama_grammar_candidate {
|
||||
// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
|
||||
static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
||||
const char * src,
|
||||
size_t n_src,
|
||||
llama_partial_utf8 partial_start) {
|
||||
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
|
||||
const char * pos = src;
|
||||
std::vector<uint32_t> code_points;
|
||||
// common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
|
||||
code_points.reserve(n_src + 1);
|
||||
uint32_t value = partial_start.value;
|
||||
int n_remain = partial_start.n_remain;
|
||||
|
||||
@@ -6468,6 +6467,13 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
||||
return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
|
||||
}
|
||||
|
||||
static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
||||
std::string src,
|
||||
llama_partial_utf8 partial_start
|
||||
) {
|
||||
return decode_utf8(src.c_str(), src.size(), partial_start);
|
||||
}
|
||||
|
||||
// returns true iff pos points to the end of one of the definitions of a rule
|
||||
static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
|
||||
switch (pos->type) {
|
||||
@@ -7117,7 +7123,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
|
||||
} else if (piece.empty() || piece[0] == 0) {
|
||||
candidates->data[i].logit = -INFINITY;
|
||||
} else {
|
||||
candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
|
||||
candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
|
||||
candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
|
||||
}
|
||||
}
|
||||
@@ -7324,7 +7330,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
||||
const std::string piece = llama_token_to_piece(ctx, token);
|
||||
|
||||
// Note terminating 0 in decoded string
|
||||
const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
|
||||
const auto decoded = decode_utf8(piece, grammar->partial_utf8);
|
||||
const auto & code_points = decoded.first;
|
||||
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
||||
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
|
||||
@@ -8569,8 +8575,6 @@ struct llama_context * llama_new_context_with_model(
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
if (model->n_gpu_layers > 0) {
|
||||
ggml_metal_log_set_callback(llama_log_callback_default, NULL);
|
||||
|
||||
ctx->ctx_metal = ggml_metal_init(1);
|
||||
if (!ctx->ctx_metal) {
|
||||
LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
|
||||
@@ -9706,6 +9710,9 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
|
||||
void llama_log_set(ggml_log_callback log_callback, void * user_data) {
|
||||
g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
|
||||
g_state.log_callback_user_data = user_data;
|
||||
#ifdef GGML_USE_METAL
|
||||
ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
|
||||
|
||||
2
llama.h
2
llama.h
@@ -185,7 +185,7 @@ extern "C" {
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
||||
float rope_freq_base; // RoPE base frequency, 0 = from model
|
||||
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
|
||||
float yarn_ext_factor; // YaRN extrapolation mix factor, NaN = from model
|
||||
float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model
|
||||
float yarn_attn_factor; // YaRN magnitude scaling factor
|
||||
float yarn_beta_fast; // YaRN low correction dim
|
||||
float yarn_beta_slow; // YaRN high correction dim
|
||||
|
||||
Reference in New Issue
Block a user