
Commit 094975c

Adrien Gallouët authored
Update the llamacpp backend (#3022)
* Build faster
* Make --model-gguf optional
* Bump llama.cpp
* Enable mmap, offload_kqv & flash_attention by default
* Update doc
* Better error message
* Update doc
* Update installed packages
* Save gguf in models/MODEL_ID/model.gguf
* Fix build with Mach-O
* Quantize without llama-quantize
* Bump llama.cpp and switch to ggml-org
* Remove make-gguf.sh
* Update Cargo.lock
* Support HF_HUB_USER_AGENT_ORIGIN
* Bump llama.cpp
* Add --build-arg llamacpp_native & llamacpp_cpu_arm_arch

Signed-off-by: Adrien Gallouët <[email protected]>
1 parent dc5f05f commit 094975c
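The flag renames in backends/llamacpp/src/main.rs (see the diff below) flip the defaults: mmap, KQV offloading, and flash attention now stay on unless explicitly disabled. As a rough, hypothetical sketch of that inverted-default pattern with clap (assuming the derive and env features, which the real Args struct relies on; the struct here is simplified, not the actual one):

use clap::Parser;

#[derive(Parser)]
struct Args {
    /// Disable memory mapping for the model (mmap stays on by default).
    #[clap(long, env)]
    disable_mmap: bool,
}

fn main() {
    let args = Args::parse();
    // The backend config receives the positive setting, inverted from the flag.
    let use_mmap = !args.disable_mmap;
    println!("use_mmap = {use_mmap}");
}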

10 files changed (+201, −63 lines)


Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default.

Dockerfile_llamacpp

Lines changed: 22 additions & 10 deletions
@@ -1,13 +1,15 @@
 FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04 AS deps

-ARG llamacpp_version=b4651
+ARG llamacpp_version=b4827
 ARG llamacpp_cuda=OFF
+ARG llamacpp_native=ON
+ARG llamacpp_cpu_arm_arch=native
 ARG cuda_arch=75-real;80-real;86-real;89-real;90-real

 WORKDIR /opt/src

 ENV DEBIAN_FRONTEND=noninteractive
-RUN apt update && apt install -y \
+RUN apt update && apt upgrade -y && apt install -y \
     clang \
     cmake \
     curl \
@@ -17,16 +19,19 @@ RUN apt update && apt install -y \
     pkg-config \
     tar

-ADD https://github.com/ggerganov/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/
-RUN tar -xzf ${llamacpp_version}.tar.gz \
-    && cd llama.cpp-${llamacpp_version} \
+ADD https://github.com/ggml-org/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/
+RUN mkdir -p llama.cpp \
+    && tar -xzf ${llamacpp_version}.tar.gz -C llama.cpp --strip-components=1 \
+    && cd llama.cpp \
     && cmake -B build \
         -DCMAKE_INSTALL_PREFIX=/usr \
         -DCMAKE_INSTALL_LIBDIR=/usr/lib \
         -DCMAKE_C_COMPILER=clang \
         -DCMAKE_CXX_COMPILER=clang++ \
         -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \
         -DGGML_CUDA=${llamacpp_cuda} \
+        -DGGML_NATIVE=${llamacpp_native} \
+        -DGGML_CPU_ARM_ARCH=${llamacpp_cpu_arm_arch} \
         -DLLAMA_BUILD_COMMON=OFF \
         -DLLAMA_BUILD_TESTS=OFF \
         -DLLAMA_BUILD_EXAMPLES=OFF \
@@ -48,28 +53,35 @@ FROM deps AS builder
 COPY --from=planner /app/recipe.json recipe.json
 RUN cargo chef cook \
     --recipe-path recipe.json \
-    --profile release-opt \
+    --profile release \
     --package text-generation-router-llamacpp
 COPY . .
 RUN cargo build \
-    --profile release-opt \
+    --profile release \
     --package text-generation-router-llamacpp --frozen

 FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04
+WORKDIR /app

-RUN apt update && apt install -y \
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt update && apt upgrade -y && apt install -y \
     python3-venv \
     python3-pip

 RUN python3 -m venv /venv
 ENV PATH="/venv/bin:$PATH"

 COPY backends/llamacpp/requirements.txt requirements.txt
-RUN pip3 install --no-cache-dir -r requirements.txt
+COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py
+COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/
+
+RUN pip3 install --no-cache-dir \
+    -r requirements.txt \
+    -e gguf-py

 COPY --from=builder /usr/lib/libllama.so /usr/lib/
 COPY --from=builder /usr/lib/libggml*.so /usr/lib/
-COPY --from=builder /app/target/release-opt/text-generation-router-llamacpp /usr/bin/
+COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/

 ENV HF_HUB_ENABLE_HF_TRANSFER=1

backends/llamacpp/Cargo.toml

Lines changed: 2 additions & 1 deletion
@@ -12,10 +12,11 @@ pkg-config = "0.3.31"
 [dependencies]
 async-trait = "0.1.85"
 clap = "4.5.27"
+hf-hub.workspace = true
 num_cpus = "1.16.0"
 text-generation-router = { path = "../../router" }
 thiserror = "2.0.11"
 tokenizers.workspace = true
-tokio = "1.43.0"
+tokio = { version = "1.43.0", features = ["process"] }
 tokio-stream = "0.1.17"
 tracing = "0.1.41"

backends/llamacpp/build.rs

Lines changed: 3 additions & 2 deletions
@@ -25,8 +25,9 @@ fn main() {
     for path in &llama.link_paths {
         println!("cargo:rustc-link-arg=-Wl,-rpath,{}", path.display());
     }
-    println!("cargo:rustc-link-arg=-Wl,--disable-new-dtags");
-
+    if cfg!(target_os = "linux") {
+        println!("cargo:rustc-link-arg=-Wl,--disable-new-dtags");
+    }
     let bindings = bindgen::Builder::default()
         .clang_args(
             llama
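For context on the guard above: --disable-new-dtags is a GNU ld option (it makes the linker emit DT_RPATH instead of DT_RUNPATH), so passing it to macOS's ld64 breaks Mach-O builds, which is what the "Fix build with Mach-O" change addresses. A hedged variant of the same idea, noting that build scripts run on the host, so a cross-compiling build would typically consult the CARGO_CFG_TARGET_OS variable Cargo sets for build scripts (the helper name below is made up):

fn target_is_linux() -> bool {
    // Cargo exposes the *target* OS to build scripts via this variable;
    // cfg!(target_os = "...") inside build.rs reflects the host instead.
    std::env::var("CARGO_CFG_TARGET_OS").ok().as_deref() == Some("linux")
}

fn main() {
    if target_is_linux() {
        // GNU-ld-only flag; ld64 (Mach-O) does not accept it.
        println!("cargo:rustc-link-arg=-Wl,--disable-new-dtags");
    }
}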

backends/llamacpp/requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 transformers==4.49
 huggingface-hub==0.28.1
 hf-transfer==0.1.9
+torch==2.6.0

backends/llamacpp/src/backend.rs

Lines changed: 2 additions & 7 deletions
@@ -1,10 +1,5 @@
-mod llamacpp {
-    #![allow(non_upper_case_globals)]
-    #![allow(non_camel_case_types)]
-    #![allow(non_snake_case)]
-    #![allow(dead_code)]
-    include!(concat!(env!("OUT_DIR"), "/llamacpp.rs"));
-}
+use crate::llamacpp;
+
 use async_trait::async_trait;
 use std::ffi::CString;
 use std::mem::replace;

backends/llamacpp/src/llamacpp.rs

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+#![allow(non_upper_case_globals)]
+#![allow(non_camel_case_types)]
+#![allow(non_snake_case)]
+#![allow(dead_code)]
+include!(concat!(env!("OUT_DIR"), "/llamacpp.rs"));
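The generated bindings used to live in a private module inside backend.rs; hoisting them into this crate-level llamacpp module lets the backend and the new quantize code share one bindgen output. Roughly, the module layout after this change (a sketch mirroring the mod declarations in the main.rs diff below):

// src/main.rs
mod backend;   // FFI consumers write `use crate::llamacpp;`
mod llamacpp;  // wraps include!(concat!(env!("OUT_DIR"), "/llamacpp.rs"))
mod quantize;  // calls llamacpp::model_quantize through the same bindings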

backends/llamacpp/src/main.rs

Lines changed: 84 additions & 21 deletions
@@ -1,13 +1,21 @@
 mod backend;
+mod llamacpp;
+mod quantize;
+
+use quantize::QuantizeType;

 use backend::{
     BackendError, LlamacppBackend, LlamacppConfig, LlamacppGGMLType, LlamacppNuma,
     LlamacppSplitMode,
 };
 use clap::Parser;
+use hf_hub::api::tokio::ApiBuilder;
+use hf_hub::{Repo, RepoType};
+use std::path::Path;
 use text_generation_router::{logging, server, usage_stats};
 use thiserror::Error;
-use tokenizers::{FromPretrainedParameters, Tokenizer};
+use tokenizers::Tokenizer;
+use tokio::process::Command;
 use tokio::sync::oneshot::error::RecvError;
 use tracing::{error, warn};

@@ -25,7 +33,7 @@ struct Args {

     /// Path to the GGUF model file for inference.
     #[clap(long, env)]
-    model_gguf: String, // TODO Option() with hf->gguf & quantize
+    model_gguf: Option<String>,

     /// Number of threads to use for generation.
     #[clap(long, env)]
@@ -53,19 +61,19 @@ struct Args {

     /// Use memory mapping for the model.
     #[clap(long, env)]
-    use_mmap: bool,
+    disable_mmap: bool,

     /// Use memory locking to prevent swapping.
     #[clap(long, env)]
     use_mlock: bool,

     /// Enable offloading of KQV operations to the GPU.
     #[clap(long, env)]
-    offload_kqv: bool,
+    disable_offload_kqv: bool,

     /// Enable flash attention for faster inference. (EXPERIMENTAL)
     #[clap(long, env)]
-    flash_attention: bool,
+    disable_flash_attention: bool,

     /// Data type used for K cache.
     #[clap(default_value = "f16", value_enum, long, env)]
@@ -194,35 +202,80 @@ async fn main() -> Result<(), RouterError> {
         ));
     }

-    // TODO: check if we use the same cache of Server
-    // check if llamacpp is faster
-    let tokenizer = {
-        let token = std::env::var("HF_TOKEN")
-            .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
-            .ok();
-        let params = FromPretrainedParameters {
-            revision: args.revision.clone(),
-            token,
-            ..Default::default()
-        };
-        Tokenizer::from_pretrained(args.model_id.clone(), Some(params))?
+    let api_builder = || {
+        let mut builder = ApiBuilder::new().with_progress(true);
+
+        if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") {
+            builder = builder.with_cache_dir(cache_dir.into());
+        }
+        if let Ok(token) = std::env::var("HF_TOKEN") {
+            builder = builder.with_token(token.into());
+        }
+        if let Ok(origin) = std::env::var("HF_HUB_USER_AGENT_ORIGIN") {
+            builder = builder.with_user_agent("origin", origin.as_str());
+        }
+        builder
+    };
+    let api_repo = api_builder().build()?.repo(Repo::with_revision(
+        args.model_id.clone(),
+        RepoType::Model,
+        args.revision.clone(),
+    ));
+
+    let tokenizer_path = api_repo.get("tokenizer.json").await?;
+    let tokenizer = Tokenizer::from_file(&tokenizer_path)?;
+
+    let model_gguf = if let Some(model_gguf) = args.model_gguf {
+        model_gguf
+    } else {
+        let model_gguf = format!("models/{}/model.gguf", args.model_id);
+        let model_gguf_path = Path::new(&model_gguf);
+
+        if !model_gguf_path.exists() {
+            let tmp_gguf = "models/tmp.gguf";
+
+            if let Some(parent) = Path::new(model_gguf_path).parent() {
+                std::fs::create_dir_all(parent)?;
+            }
+            let cache_path = tokenizer_path.parent().unwrap();
+
+            for sibling in api_repo.info().await?.siblings {
+                let _ = api_repo.get(&sibling.rfilename).await?;
+            }
+            let status = Command::new("convert_hf_to_gguf.py")
+                .arg("--outfile")
+                .arg(tmp_gguf)
+                .arg(cache_path)
+                .spawn()?
+                .wait()
+                .await?;
+
+            if !status.success() {
+                let exit_code = status.code().unwrap_or(-1);
+                error!("Failed to generate GGUF, exit code: {}", exit_code);
+                return Err(RouterError::CommandError(exit_code));
+            }
+            quantize::model(tmp_gguf, &model_gguf, QuantizeType::MostlyQ4_0, n_threads)
+                .map_err(RouterError::QuantizeError)?;
+        }
+        model_gguf
     };

     let (backend, ok, shutdown) = LlamacppBackend::new(
         LlamacppConfig {
-            model_gguf: args.model_gguf,
+            model_gguf,
             n_threads,
             n_threads_batch,
             n_gpu_layers: args.n_gpu_layers,
             split_mode: args.split_mode,
             defrag_threshold: args.defrag_threshold,
             numa: args.numa,
-            use_mmap: args.use_mmap,
+            use_mmap: !args.disable_mmap,
             use_mlock: args.use_mlock,
-            flash_attention: args.flash_attention,
+            flash_attention: !args.disable_flash_attention,
             type_k: args.type_k,
             type_v: args.type_v,
-            offload_kqv: args.offload_kqv,
+            offload_kqv: !args.disable_offload_kqv,
             max_batch_total_tokens,
             max_physical_batch_total_tokens,
             max_batch_size,
@@ -281,4 +334,14 @@ enum RouterError {
     WebServer(#[from] server::WebServerError),
     #[error("Recv error: {0}")]
     RecvError(#[from] RecvError),
+    #[error("Io error: {0}")]
+    IoError(#[from] std::io::Error),
+    #[error("Var error: {0}")]
+    VarError(#[from] std::env::VarError),
+    #[error("Quantize error: {0}")]
+    QuantizeError(String),
+    #[error("Command error: {0}")]
+    CommandError(i32),
+    #[error("HF hub error: {0}")]
+    HubError(#[from] hf_hub::api::tokio::ApiError),
 }
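Condensed, the new startup path looks roughly like the sketch below (hypothetical helper; error handling and the Hub download loop omitted): an explicit --model-gguf still wins, otherwise the router converts and quantizes the Hub snapshot once and reuses models/{model_id}/model.gguf on later runs.

// Simplified sketch of the model-resolution logic above (not the literal code).
async fn resolve_model_gguf(model_id: &str, cli_model_gguf: Option<String>) -> String {
    if let Some(path) = cli_model_gguf {
        return path; // explicit --model-gguf takes precedence
    }
    let target = format!("models/{model_id}/model.gguf");
    if !std::path::Path::new(&target).exists() {
        // 1. download the repo snapshot through hf-hub (tokenizer.json + siblings)
        // 2. run convert_hf_to_gguf.py --outfile models/tmp.gguf <snapshot dir>
        // 3. quantize::model("models/tmp.gguf", &target, QuantizeType::MostlyQ4_0, n_threads)
    }
    target
}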

backends/llamacpp/src/quantize.rs

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+use crate::llamacpp;
+
+use std::ffi::CString;
+
+#[repr(u32)]
+#[derive(Debug, Clone, Copy)]
+pub enum QuantizeType {
+    MostlyQ4_0 = 2,
+}
+
+pub fn model(
+    input_path: &str,
+    output_path: &str,
+    ftype: QuantizeType,
+    n_threads: usize,
+) -> Result<(), String> {
+    let c_input_path =
+        CString::new(input_path).map_err(|e| format!("Failed to convert input path: {}", e))?;
+
+    let c_output_path =
+        CString::new(output_path).map_err(|e| format!("Failed to convert output path: {}", e))?;
+
+    let result = unsafe {
+        let mut params = llamacpp::model_quantize_default_params();
+        params.nthread = n_threads as _;
+        params.ftype = ftype as _;
+        params.quantize_output_tensor = true;
+        llamacpp::model_quantize(c_input_path.as_ptr(), c_output_path.as_ptr(), &params)
+    };
+    if result == 0 {
+        Ok(())
+    } else {
+        Err(format!("Quantization failed, error code: {}", result))
+    }
+}
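For reference, the ftype value 2 lines up with LLAMA_FTYPE_MOSTLY_Q4_0 in llama.h, which is how the router can quantize without shelling out to llama-quantize. A sketch of how this helper gets called from inside the crate (paths and thread count are illustrative; in main.rs they come from the conversion step):

use crate::quantize::{self, QuantizeType};

fn quantize_example() -> Result<(), String> {
    // Turn the temporary conversion output into the cached, quantized model file.
    quantize::model(
        "models/tmp.gguf",                       // illustrative input path
        "models/some-org/some-model/model.gguf", // illustrative output path
        QuantizeType::MostlyQ4_0,
        8, // n_threads
    )
}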
