---
services:
  llama-cpp:
    image: ghcr.io/ggml-org/llama.cpp:server
    container_name: llama-cpp-qwen
    restart: unless-stopped
    ports:
      # Quoted to avoid YAML 1.1 implicit-typing traps on HOST:CONTAINER pairs.
      - "8000:8000"
    volumes:
      - ./models:/models
    # Arguments passed to the image's llama-server entrypoint.
    # Folded scalar (>) joins these lines with single spaces, so the value
    # is identical to a one-line argument string.
    # NOTE(review): --presence-penalty 2.0 with --repeat-penalty 1.0 is an
    # unusually aggressive sampling combination — confirm it is intentional.
    command: >
      -m /models/qwen/Qwen3.5-0.8B-UD-Q8_K_XL.gguf
      --mmproj /models/qwen/mmproj-F16.gguf
      --host 0.0.0.0
      --port "8000"
      --ctx-size "16384"
      --temp "0.6"
      --top-p "1.00"
      --top-k "20"
      --min-p "0.00"
      --presence-penalty 2.0
      --repeat-penalty 1.0
      --chat-template-kwargs '{"enable_thinking":false}'