feat: use Q8 quantization
This commit is contained in:
@@ -4,26 +4,18 @@ services:
|
|||||||
container_name: llama-cpp-qwen
|
container_name: llama-cpp-qwen
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
ports:
|
ports:
|
||||||
- "8000:8000"
|
- 8000:8000
|
||||||
volumes:
|
volumes:
|
||||||
- ./models:/models
|
- ./models:/models
|
||||||
command:
|
command: >
|
||||||
- -m
|
-m /models/qwen/Qwen3.5-0.8B-UD-Q8_K_XL.gguf
|
||||||
- /models/qwen-3.5-0.8b/Qwen3.5-0.8B-UD-Q2_K_XL.gguf
|
--mmproj /models/qwen/mmproj-F16.gguf
|
||||||
- --mmproj
|
--host 0.0.0.0 --port "8000"
|
||||||
- /models/qwen-3.5-0.8b/mmproj-F16.gguf
|
--ctx-size "16384"
|
||||||
- --host
|
--temp "0.6"
|
||||||
- 0.0.0.0
|
--top-p "1.00"
|
||||||
- --port
|
--top-k "20"
|
||||||
- "8000"
|
--min-p "0.00"
|
||||||
- --ctx-size
|
--presence-penalty 2.0
|
||||||
- "16384"
|
--repeat-penalty 1.0
|
||||||
- --temp
|
--chat-template-kwargs '{"enable_thinking":false}'
|
||||||
- "0.7"
|
|
||||||
- --top-p
|
|
||||||
- "0.8"
|
|
||||||
- --top-k
|
|
||||||
- "20"
|
|
||||||
- --min-p
|
|
||||||
- "0.00"
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user