#!/usr/bin/env bash
#
# Launches llama-server with experimental MTP ("draft-mtp") speculative
# decoding, or checks whether the configured binary supports it.
#
# Configuration comes from DUCK_* variables, read from the process
# environment and from an optional .env file in the project root.

# Abort on a failing command, on use of an unset variable, and when any stage
# of a pipeline fails.
set -euo pipefail

# Project root: two directory levels above the directory holding this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"

# Names of the configuration variables this script reads. Values for these
# that are already set in the caller's environment are preserved across the
# loading of .env.
ENV_KEYS=(
  DUCK_LLAMA_SERVER_BIN
  DUCK_MAIN_MODEL_PATH
  DUCK_MTP_MODEL_PATH
  DUCK_MAIN_MTP_PORT
  DUCK_CTX_SIZE
  DUCK_N_GPU_LAYERS
  DUCK_LLAMA_DEVICE
  DUCK_PARALLEL
  DUCK_MTP_FLAGS
  DUCK_HOST
)

# Snapshot every listed variable that is already set (even to an empty
# string) before .env is sourced, keyed by variable name.
declare -A ENV_OVERRIDES=()
for key in "${ENV_KEYS[@]}"; do
  # -v tests "is set", so an explicitly empty value is captured too.
  if [[ -v "${key}" ]]; then
    # ${!key} is indirect expansion: the value of the variable named by key.
    ENV_OVERRIDES["${key}"]="${!key}"
  fi
done

# Load settings from the project's .env file when it exists.
if [[ -f "${ROOT_DIR}/.env" ]]; then
  # allexport: every variable assigned while sourcing is exported, so the
  # values reach the llama-server process as well as this script.
  set -a
  # shellcheck disable=SC1091
  source "${ROOT_DIR}/.env"
  set +a
fi

# Re-export the values captured in ENV_OVERRIDES so that anything set in the
# caller's environment takes precedence over what .env assigned.
for key in "${!ENV_OVERRIDES[@]}"; do
  export "${key}=${ENV_OVERRIDES[${key}]}"
done

# Abort with a message unless DUCK_MAIN_MODEL_PATH is set and non-empty.
: "${DUCK_MAIN_MODEL_PATH:?DUCK_MAIN_MODEL_PATH is required}"

# Sub-command from the first positional argument; defaults to "start".
ACTION="${1:-start}"

#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   the usage message on stdout (callers redirect it as needed)
#######################################
usage() {
  # Quoted delimiter: the text is emitted literally, with no expansion.
  cat <<'EOF'
Usage: scripts/llama/start_thinker_mtp_experimental.sh <command>

Commands:
start Start experimental MTP/speculative llama-server in foreground
check Check whether the current llama-server binary exposes draft-mtp flags
help Show this help
EOF
}

#######################################
# Turn a possibly relative path into an absolute one.
# Globals:   ROOT_DIR (read)
# Arguments: $1 - path; absolute paths are returned unchanged, anything else
#                 is taken as relative to ROOT_DIR (a leading "./" is dropped)
# Outputs:   the resulting path on stdout, followed by a newline
#######################################
resolve_project_path() {
  local value="$1"
  if [[ "${value}" == /* ]]; then
    printf '%s\n' "${value}"
  else
    printf '%s\n' "${ROOT_DIR}/${value#./}"
  fi
}

# Server binary: DUCK_LLAMA_SERVER_BIN, or "llama-server" looked up on PATH.
LLAMA_BIN="${DUCK_LLAMA_SERVER_BIN:-llama-server}"
# Only a value containing a slash is treated as a path and resolved against
# the project root; a bare command name is left for PATH lookup.
if [[ "${LLAMA_BIN}" == */* ]]; then
  LLAMA_BIN="$(resolve_project_path "${LLAMA_BIN}")"
fi
# Main model file, resolved against the project root when relative.
MAIN_MODEL_PATH="$(resolve_project_path "${DUCK_MAIN_MODEL_PATH}")"

# Fail early with a specific message when the binary cannot be run at all;
# otherwise the probe below would misreport it as lacking draft-mtp support.
if ! command -v "${LLAMA_BIN}" >/dev/null 2>&1; then
  echo "llama-server binary not found or not executable: ${LLAMA_BIN}" >&2
  exit 1
fi

# Capture the binary's --help output (stdout and stderr). `|| true` keeps a
# non-zero exit status of --help from aborting the script under `set -e`.
HELP_TEXT="$("${LLAMA_BIN}" --help 2>&1 || true)"
# Case-insensitive search for the draft-mtp option in the help text.
if ! grep -qi -- "draft-mtp" <<< "${HELP_TEXT}"; then
  echo "This llama-server build does not expose draft-mtp speculative decoding." >&2
  exit 1
fi

# Dispatch on the requested sub-command. "start" falls through to the launch
# code that follows; every other branch exits here.
case "${ACTION}" in
  check)
    echo "OK: draft-mtp speculative decoding is exposed by ${LLAMA_BIN}"
    exit 0
    ;;
  help|-h|--help)
    usage
    exit 0
    ;;
  start)
    ;;
  *)
    # Unknown sub-command: report on stderr and exit with status 2.
    echo "Unknown command: ${ACTION}" >&2
    usage >&2
    exit 2
    ;;
esac

# Base llama-server command line, built as an array so that every value stays
# a single argument. Each DUCK_* setting falls back to the default shown when
# it is unset or empty.
command=(
  "${LLAMA_BIN}"
  -m "${MAIN_MODEL_PATH}"
  --alias local-main-mtp
  --host "${DUCK_HOST:-127.0.0.1}"
  --port "${DUCK_MAIN_MTP_PORT:-8085}"
  -c "${DUCK_CTX_SIZE:-65536}"
  --parallel "${DUCK_PARALLEL:-1}"
  -ngl "${DUCK_N_GPU_LAYERS:-auto}"
  --flash-attn on
  --cache-prompt
  --metrics
  --spec-type draft-mtp
)

# Pass --device only when DUCK_LLAMA_DEVICE is set to a non-empty value.
if [[ -n "${DUCK_LLAMA_DEVICE:-}" ]]; then
  command+=(--device "${DUCK_LLAMA_DEVICE}")
fi

# Pass --model-draft only when DUCK_MTP_MODEL_PATH is set to a non-empty
# value; a relative path is resolved against the project root.
if [[ -n "${DUCK_MTP_MODEL_PATH:-}" ]]; then
  command+=(--model-draft "$(resolve_project_path "${DUCK_MTP_MODEL_PATH}")")
fi

# Append free-form extra flags from DUCK_MTP_FLAGS.
# The value is split on whitespace into separate arguments; quoting inside the
# value is not interpreted. Globbing is switched off for the split so a token
# containing *, ? or [ is passed through literally instead of being expanded
# against files in the current directory.
if [[ -n "${DUCK_MTP_FLAGS:-}" ]]; then
  set -f
  # shellcheck disable=SC2206
  extra_args=( ${DUCK_MTP_FLAGS} )
  set +f
  # A whitespace-only value yields no tokens; skip the append so an empty
  # array is never expanded under `set -u` (an error before bash 4.4).
  if (( ${#extra_args[@]} > 0 )); then
    command+=("${extra_args[@]}")
  fi
fi

# Replace this shell with llama-server so it runs in the foreground and
# receives signals directly.
exec "${command[@]}"