# Development & integration server for AI (LLMs)
# Web UI for Georgi Gerganov's llama.cpp
# and Benson Wong's llama-swap
# Current llama-swap configuration, optimized for 32 GB VRAM
# (Ryzen 9 9950X3D, 96 GB DDR5-6600, RTX 5090 FE, Blackwell GB202, GDDR7)
sendLoadingState: true

macros:
  # Shared llama-server invocation: flags common to every model entry.
  # Folded scalar (>) joins the lines into a single command line.
  llama-server: >
    ../llama.cpp.pascal/build/bin/llama-server
    --port 8081
    -ngl 999
    -ctk q8_0
    -ctv q8_0
    -fa on
    --mlock
    -np 4
    -kvu
    --jinja
  models: /var/www/ia/models
  proxy: http://127.0.0.1:8081

models:
  Dense-OLMo-2-0325-32B-Instruct:
    cmd: |
      ${llama-server}
      -m ${models}/unsloth/OLMo-2-0325-32B-Instruct-GGUF/OLMo-2-0325-32B-Instruct-Q6_K.gguf
      --ctx-size 4096
    proxy: ${proxy}

  Dense-Vision-Mistral-Small-3.2-24B-Instruct-2506:
    cmd: |
      ${llama-server}
      -m ${models}/unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF/Mistral-Small-3.2-24B-Instruct-2506-Q6_K.gguf
      --mmproj ${models}/unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF/mmproj-BF16.gguf
      --ctx-size 65536
    proxy: ${proxy}

  Dense-Vision-Magistral-Small-2509:
    cmd: |
      ${llama-server}
      -m ${models}/unsloth/Magistral-Small-2509-GGUF/Magistral-Small-2509-Q6_K.gguf
      --mmproj ${models}/unsloth/Magistral-Small-2509-GGUF/mmproj-BF16.gguf
      --ctx-size 65536
    proxy: ${proxy}

  Dense-Uncensored-Dolphin-Mistral-24B-Venice-Edition:
    cmd: |
      ${llama-server}
      -m ${models}/bartowski/cognitivecomputations_Dolphin-Mistral-24B-Venice-Edition-GGUF/cognitivecomputations_Dolphin-Mistral-24B-Venice-Edition-Q8_0.gguf
      --ctx-size 65536
    proxy: ${proxy}

  Dense-Uncensored-BlackSheep-24B:
    cmd: |
      ${llama-server}
      -m ${models}/mradermacher/BlackSheep-24B-i1-GGUF/BlackSheep-24B.Q8_0.gguf
      --ctx-size 65536
    proxy: ${proxy}

  Dense-Uncensored-XortronCriminalComputingConfig-24B:
    cmd: |
      ${llama-server}
      -m ${models}/mradermacher/XortronCriminalComputingConfig-i1-GGUF/XortronCriminalComputingConfig.Q8_0.gguf
      --ctx-size 65536
    proxy: ${proxy}

  Dense-RP-Cydonia-24B-v4.1:
    cmd: |
      ${llama-server}
      -m ${models}/bartowski/TheDrummer_Cydonia-24B-v4.1-GGUF/TheDrummer_Cydonia-24B-v4.1-Q8_0.gguf
      --ctx-size 65536
    proxy: ${proxy}

  Dense-Devstral-Small-24B-2507:
    cmd: |
      ${llama-server}
      -m ${models}/unsloth/Devstral-Small-2507-GGUF/Devstral-Small-2507-Q6_K.gguf
      --ctx-size 131072
    proxy: ${proxy}

  Dense-Codestral-22B-v0.1:
    cmd: |
      ${llama-server}
      -m ${models}/mradermacher/Codestral-22B-v0.1-i1-GGUF/Codestral-22B-v0.1.Q8_0.gguf
      --ctx-size 32768
    proxy: ${proxy}

  Dense-Vision-Gemma-3-27B-IT:
    cmd: |
      ${llama-server}
      -m ${models}/unsloth/gemma-3-27b-it-GGUF/gemma-3-27b-it-Q6_K.gguf
      --mmproj ${models}/unsloth/gemma-3-27b-it-GGUF/mmproj-BF16.gguf
      --ctx-size 131072
    proxy: ${proxy}

  Dense-RP-Big-Tiger-Gemma-27B-v3:
    cmd: |
      ${llama-server}
      -m ${models}/bartowski/TheDrummer_Big-Tiger-Gemma-27B-v3-GGUF/TheDrummer_Big-Tiger-Gemma-27B-v3-Q6_K.gguf
      --ctx-size 131072
    proxy: ${proxy}

  Dense-Seed-OSS-36B-Instruct:
    cmd: |
      ${llama-server}
      -m ${models}/unsloth/Seed-OSS-36B-Instruct-GGUF/Seed-OSS-36B-Instruct-Q5_K_M.gguf
      --ctx-size 32768
    proxy: ${proxy}

  Dense-DeepSeek-Coder-33B-Instruct:
    cmd: |
      ${llama-server}
      -m ${models}/mradermacher/deepseek-coder-33b-instruct-i1-GGUF/deepseek-coder-33b-instruct.i1-Q6_K.gguf
      --ctx-size 32768
    proxy: ${proxy}

  Dense-DeepSeek-R1-Distill-Qwen-32B:
    cmd: |
      ${llama-server}
      -m ${models}/unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF/DeepSeek-R1-Distill-Qwen-32B-Q6_K.gguf
      --ctx-size 32768
    proxy: ${proxy}

  Dense-Aya-Expanse-32B:
    cmd: |
      ${llama-server}
      -m ${models}/mradermacher/aya-expanse-32b-i1-GGUF/aya-expanse-32b.i1-Q6_K.gguf
      --ctx-size 32768
    proxy: ${proxy}

  Dense-GLM-4-32B-0414:
    cmd: |
      ${llama-server}
      -m ${models}/unsloth/GLM-4-32B-0414-GGUF/GLM-4-32B-0414-Q6_K.gguf
      --ctx-size 32768
    proxy: ${proxy}

  Dense-GLM-Z1-32B-0414:
    cmd: |
      ${llama-server}
      -m ${models}/unsloth/GLM-Z1-32B-0414-GGUF/GLM-Z1-32B-0414-Q6_K.gguf
      --ctx-size 32768
    proxy: ${proxy}

  MoE-GLM-4.5-Air-106B:
    cmd: |
      ${llama-server}
      -m ${models}/unsloth/GLM-4.5-Air-GGUF/GLM-4.5-Air-Q4_K_M-00001-of-00002.gguf
      --n-cpu-moe 30
      --ctx-size 32768
    proxy: ${proxy}

  #MoE-RP-GLM-Steam-106B-A12B-v1:
  #  cmd: |
  #    ${llama-server}
  #    -m ${models}/bartowski/TheDrummer_GLM-Steam-106B-A12B-v1-GGUF/TheDrummer_GLM-Steam-106B-A12B-v1-Q4_K_M-00001-of-00002.gguf
  #    --n-cpu-moe 30
  #    --ctx-size 32768
  #  proxy: ${proxy}

  Dense-EXAONE-4.0.1-32B:
    cmd: |
      ${llama-server}
      -m ${models}/mradermacher/EXAONE-4.0.1-32B-i1-GGUF/EXAONE-4.0.1-32B.i1-Q6_K.gguf
      --ctx-size 131072
    proxy: ${proxy}

  Dense-QwQ-32B:
    cmd: |
      ${llama-server}
      -m ${models}/unsloth/QwQ-32B-GGUF/QwQ-32B-Q6_K.gguf
      --ctx-size 32768
    proxy: ${proxy}

  Dense-Qwen3-32B:
    cmd: |
      ${llama-server}
      -m ${models}/mradermacher/Qwen3-32B-i1-GGUF/Qwen3-32B.i1-Q6_K.gguf
      --ctx-size 32768
    proxy: ${proxy}

  Dense-Vision-Qwen2.5-VL-32B-Instruct:
    cmd: |
      ${llama-server}
      -m ${models}/unsloth/Qwen2.5-VL-32B-Instruct-GGUF/Qwen2.5-VL-32B-Instruct-Q5_K_M.gguf
      --mmproj ${models}/unsloth/Qwen2.5-VL-32B-Instruct-GGUF/mmproj-BF16.gguf
      --ctx-size 32768
    proxy: ${proxy}

  # Server-side sampling pinned to the model card's recommended values;
  # strip_params prevents clients from overriding them.
  MoE-Qwen3-30B-A3B-Instruct-2507:
    cmd: |
      ${llama-server}
      -m ${models}/mradermacher/Qwen3-30B-A3B-Instruct-2507-i1-GGUF/Qwen3-30B-A3B-Instruct-2507.i1-Q6_K.gguf
      --temp 0.7
      --top-p 0.8
      --top-k 20
      --min-p 0
      --ctx-size 32768
    proxy: ${proxy}
    filters:
      strip_params: "temperature, top_p, top_k, min_p"

  MoE-Qwen3-Coder-30B-A3B-Instruct:
    cmd: |
      ${llama-server}
      -m ${models}/unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF/Qwen3-Coder-30B-A3B-Instruct-Q6_K.gguf
      --temp 0.7
      --top-p 0.8
      --top-k 20
      --min-p 0
      --ctx-size 131072
    proxy: ${proxy}
    filters:
      strip_params: "temperature, top_p, top_k, min_p"

  MoE-Qwen3-30B-A3B-Thinking-2507:
    cmd: |
      ${llama-server}
      -m ${models}/mradermacher/Qwen3-30B-A3B-Thinking-2507-i1-GGUF/Qwen3-30B-A3B-Thinking-2507.i1-Q6_K.gguf
      --temp 0.6
      --top-p 0.95
      --top-k 20
      --min-p 0
      --ctx-size 32768
    proxy: ${proxy}
    filters:
      strip_params: "temperature, top_p, top_k, min_p"

  MoE-Aquif-3.5-Max-42B-A3B:
    cmd: |
      ${llama-server}
      -m ${models}/unsloth/aquif-3.5-Max-42B-A3B-GGUF/aquif-3.5-Max-42B-A3B-Q4_K_M.gguf
      --ctx-size 65536
      --chat-template-file ${models}/unsloth/aquif-3.5-Max-42B-A3B-GGUF/aquif-3.5-Max-42B-A3B.jinja
    proxy: ${proxy}

  MoE-Aquif-3.5-Plus-30B-A3B:
    cmd: |
      ${llama-server}
      -m ${models}/mradermacher/aquif-3.5-Plus-30B-A3B-i1-GGUF/aquif-3.5-Plus-30B-A3B.i1-Q6_K.gguf
      --ctx-size 131072
    proxy: ${proxy}

  #MoE-Qwen3-Next-80B-A3B-Instruct:
  #  cmd: |
  #    ${llama-server}
  #    -m ${models}/Qwen3-Next-80B-A3B-Instruct-Q8_0.gguf
  #    --n-cpu-moe 32
  #    --ctx-size 32768
  #  proxy: ${proxy}

  MoE-MiniMax-M2-230B-A10B:
    cmd: |
      ${llama-server}
      -m ${models}/unsloth/MiniMax-M2-GGUF/MiniMax-M2-UD-Q2_K_XL-00001-of-00002.gguf
      --temp 1.0
      --top-p 0.95
      --top-k 40
      --n-cpu-moe 50
      --ctx-size 65536
    proxy: ${proxy}
    filters:
      strip_params: "temperature, top_p, top_k"

  MoE-GPT-OSS-20B:
    cmd: |
      ${llama-server}
      -m ${models}/lmstudio-community/gpt-oss-20b-GGUF/gpt-oss-20b-MXFP4.gguf
      --ctx-size 65536
    proxy: ${proxy}

  MoE-GPT-OSS-120B:
    cmd: |
      ${llama-server}
      -m ${models}/lmstudio-community/gpt-oss-120b-GGUF/gpt-oss-120b-MXFP4-00001-of-00002.gguf
      --n-cpu-moe 20
      --ctx-size 65536
    proxy: ${proxy}

  #MoE-Vision-Llama-4-Scout-17B-16E-109B-Instruct:
  #  cmd: |
  #    ${llama-server}
  #    -m ${models}/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF/Llama-4-Scout-17B-16E-Instruct-Q4_K_M-00001-of-00002.gguf
  #    --mmproj ${models}/unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF/mmproj-BF16.gguf
  #    --n-cpu-moe 33
  #    --ctx-size 65536
  #  proxy: ${proxy}

  Dense-Llama-3_3-Nemotron-Super-49B-v1_5:
    cmd: |
      ${llama-server}
      -m ${models}/unsloth/Llama-3_3-Nemotron-Super-49B-v1_5-GGUF/Llama-3_3-Nemotron-Super-49B-v1_5-Q4_K_S.gguf
      --ctx-size 32768
    proxy: ${proxy}

  #Dense-OpenReasoning-Nemotron-32B:
  #  cmd: |
  #    ${llama-server}
  #    -m ${models}/unsloth/OpenReasoning-Nemotron-32B-GGUF/OpenReasoning-Nemotron-32B-Q6_K.gguf
  #    --ctx-size 32768
  #  proxy: ${proxy}

  Dense-RP-Valkyrie-49B-v2:
    cmd: |
      ${llama-server}
      -m ${models}/bartowski/TheDrummer_Valkyrie-49B-v2-GGUF/TheDrummer_Valkyrie-49B-v2-IQ4_NL.gguf
      --ctx-size 32768
    proxy: ${proxy}

  Dense-K2-Think-32B:
    cmd: |
      ${llama-server}
      -m ${models}/mradermacher/K2-Think-i1-GGUF/K2-Think.i1-Q6_K.gguf
      --ctx-size 32768
    proxy: ${proxy}

  MoE-Granite-4.0-h-small-32B:
    cmd: |
      ${llama-server}
      -m ${models}/unsloth/granite-4.0-h-small-GGUF/granite-4.0-h-small-UD-Q6_K_XL.gguf
      --ctx-size 131072
    proxy: ${proxy}

hooks:
  on_startup:
    # Model loaded automatically when llama-swap starts.
    preload:
      - Dense-Llama-3_3-Nemotron-Super-49B-v1_5
# Podman Kubernetes manifest for the LLM sandbox environment.
# Intuitive for all models to minimize agentic-loop calls; includes a full
# development toolchain for the major programming languages.
---
apiVersion: v1
kind: Pod
metadata:
  name: sandbox
spec:
  containers:
    - name: sandbox
      image: docker.io/library/debian:bookworm-slim
      ports:
        # SSH into the sandbox via host port 2222.
        - containerPort: 22
          hostPort: 2222
      resources:
        limits:
          memory: 4Gi
          cpu: "8"
      volumeMounts:
        - name: inputs
          mountPath: /mnt/inputs
          readOnly: true
        - name: workspace
          mountPath: /mnt/workspace
        - name: outputs
          mountPath: /mnt/outputs
      env:
        - name: LANG
          value: C.UTF-8
        - name: LC_ALL
          value: C.UTF-8
      # Bootstrap script: wipe the work directories, install the toolchain,
      # then run sshd in the foreground as PID 1 (exec).
      # NOTE(review): root login with a fixed password is deliberate for this
      # throwaway sandbox — never reuse this manifest outside an isolated host.
      command:
        - /bin/bash
        - -c
        - |
          find /mnt/inputs -mindepth 1 -delete 2>/dev/null || true
          find /mnt/workspace -mindepth 1 -delete 2>/dev/null || true
          find /mnt/outputs -mindepth 1 -delete 2>/dev/null || true
          apt update
          apt install -y \
            openssh-server \
            file \
            sudo \
            git \
            build-essential \
            cmake ccache \
            python3 python3-pip \
            curl wget \
            nano vim \
            zip unzip p7zip-full \
            tree jq \
            ripgrep \
            figlet
          apt clean
          apt update
          curl -fsSL https://deb.nodesource.com/setup_20.x | bash -
          apt install -y nodejs
          curl --proto "=https" --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
          curl -sSL https://go.dev/dl/go1.25.4.linux-amd64.tar.gz | tar -C /usr/local -xz
          echo "export LANG=C.UTF-8" >> /root/.bashrc
          echo "export LC_ALL=C.UTF-8" >> /root/.bashrc
          echo 'export PATH="$PATH:$HOME/.cargo/bin:/usr/local/go/bin"' >> /root/.bashrc
          echo "cd /mnt/workspace" >> /root/.bashrc
          echo "root:sandbox" | chpasswd
          sed -i "s/#PermitRootLogin.*/PermitRootLogin yes/" /etc/ssh/sshd_config
          mkdir -p /run/sshd
          exec /usr/sbin/sshd -D
  volumes:
    - name: inputs
      hostPath:
        path: /var/www/ia/inputs
        type: Directory
    - name: workspace
      hostPath:
        path: /var/www/ia/workspace
        type: Directory
    - name: outputs
      hostPath:
        path: /var/www/ia/outputs
        type: Directory
  restartPolicy: Always
# Apache HTTP Server VirtualHost configuration for this dedicated LLM server.
# Security note: directories writable by users or models must never, obviously,
# allow code execution on the host system!
<VirtualHost *:80>
        # The ServerName directive sets the request scheme, hostname and port that
        # the server uses to identify itself. This is used when creating
        # redirection URLs. In the context of virtual hosts, the ServerName
        # specifies what hostname must appear in the request's Host: header to
        # match this virtual host. For the default virtual host (this file) this
        # value is not decisive as it is used as a last resort host regardless.
        # However, you must set it for any further virtual host explicitly.
        #ServerName www.example.com

        #ServerAdmin webmaster@localhost
        #DocumentRoot /var/www/html
        ServerAdmin admin@serveurperso.com
        DocumentRoot /var/www

        <Directory /var/www/>
                AllowOverride AuthConfig FileInfo Limit Options Indexes
        </Directory>

        # Available loglevels: trace8, ..., trace1, debug, info, notice, warn,
        # error, crit, alert, emerg.
        # It is also possible to configure the loglevel for particular
        # modules, e.g.
        #LogLevel info ssl:warn

        ErrorLog ${APACHE_LOG_DIR}/error.log
        CustomLog ${APACHE_LOG_DIR}/access.log combined

        # For most configuration files from conf-available/, which are
        # enabled or disabled at a global level, it is possible to
        # include a line for only one particular virtual host. For example the
        # following line enables the CGI configuration for this host only
        # after it has been globally disabled with "a2disconf".
        #Include conf-available/serve-cgi-bin.conf

        # llama-server endpoints (port 8081: direct server, port 8080: llama-swap
        # OpenAI-compatible API) exposed under /ia and /ia/new.
        <Location /ia/props>
                ProxyPass "http://127.0.0.1:8081/props"
                ProxyPassReverse "http://127.0.0.1:8081/props"
        </Location>
        <Location /ia/new/props>
                ProxyPass "http://127.0.0.1:8081/props"
                ProxyPassReverse "http://127.0.0.1:8081/props"
        </Location>
        <Location /ia/slots>
                ProxyPass "http://127.0.0.1:8081/slots"
                ProxyPassReverse "http://127.0.0.1:8081/slots"
        </Location>
        <Location /ia/new/slots>
                ProxyPass "http://127.0.0.1:8081/slots"
                ProxyPassReverse "http://127.0.0.1:8081/slots"
        </Location>
        <Location /ia/v1>
                ProxyPass "http://127.0.0.1:8080/v1"
                ProxyPassReverse "http://127.0.0.1:8080/v1"
        </Location>
        <Location /ia/new/v1>
                ProxyPass "http://127.0.0.1:8080/v1"
                ProxyPassReverse "http://127.0.0.1:8080/v1"
        </Location>

        # llama-swap management UI and API on port 8082.
        <Location /api>
                ProxyPass "http://127.0.0.1:8082/api"
                ProxyPassReverse "http://127.0.0.1:8082/api"
        </Location>
        <Location /ui>
                ProxyPass "http://127.0.0.1:8082/ui"
                ProxyPassReverse "http://127.0.0.1:8082/ui"
        </Location>
        <Location /log>
                ProxyPass "http://127.0.0.1:8082/log"
                ProxyPassReverse "http://127.0.0.1:8082/log"
        </Location>
        <Location /upstream>
                ProxyPass "http://127.0.0.1:8082/upstream"
                ProxyPassReverse "http://127.0.0.1:8082/upstream"
        </Location>
        <Location /unload>
                ProxyPass "http://127.0.0.1:8082/unload"
                ProxyPassReverse "http://127.0.0.1:8082/unload"
        </Location>
        <Location /running>
                ProxyPass "http://127.0.0.1:8082/running"
                ProxyPassReverse "http://127.0.0.1:8082/running"
        </Location>
        <Location /health>
                ProxyPass "http://127.0.0.1:8082/health"
                ProxyPassReverse "http://127.0.0.1:8082/health"
        </Location>

        # Auxiliary GGUF service on port 8090.
        <Location /ia/gguf/api>
                ProxyPass "http://127.0.0.1:8090/api"
                ProxyPassReverse "http://127.0.0.1:8090/api"
        </Location>

        # Model/user-writable directories: disable every execution vector and
        # force plain-data handling.
        # NOTE(review): DefaultType is a no-op (warning only) on Apache 2.4+ —
        # confirm the httpd version; ForceType may be needed if downloads must
        # always be served as octet-stream.
        <DirectoryMatch "^/var/www/ia/(inputs|workspace|outputs)">
                php_admin_flag engine off
                Options -ExecCGI -Includes
                RemoveHandler *
                RemoveType *
                DefaultType application/octet-stream
                AllowOverride None
                Require all granted
        </DirectoryMatch>
</VirtualHost>