Development & integration server for AI (LLMs)
WebUI for Georgi Gerganov's llama.cpp
using native multi-model support from Xuan-Son Nguyen's llama-server
Optimized configuration for 96 GB VRAM (Ryzen 9 9950X3D, 96 GB DDR5 6600 MT/s, RTX PRO 6000 Blackwell GB202 GDDR7)
; llama-server --port 8082 --models-max 1 --models-preset backend.ini --webui-config-file frontend.json
[*]
fit = off ; Disable automatic memory fitting
ngl = 999 ; Full GPU offload
ctk = q8_0 ; KV cache key quantization
ctv = q8_0 ; KV cache value quantization
fa = on ; Enable flash attention
mlock = on ; Lock model in RAM
np = 4 ; Number of parallel slots (concurrent requests)
kvu = on ; Unified KV cache buffer
stop-timeout = 2 ; Force-kill child process after graceful shutdown timeout in seconds (default: 10)
sleep-idle-seconds = 3600 ; Unload weights from the child process after this many idle seconds
b = 128 ; Logical maximum batch size (default: 2048)
ub = 512 ; Physical maximum batch size (default: 512)
[MoE-GPT-OSS-120B]
m = lmstudio-community/gpt-oss-120b-GGUF/gpt-oss-120b-MXFP4-00001-of-00002.gguf
; load-on-startup = 1 ; Load immediately on server startup
c = 131072 ; Context size in tokens for this model
[MoE-GPT-OSS-20B]
m = lmstudio-community/gpt-oss-20b-GGUF/gpt-oss-20b-MXFP4.gguf
c = 131072
[Dense-Qwen3-32B]
m = unsloth/Qwen3-32B-GGUF/Qwen3-32B-UD-Q8_K_XL.gguf
c = 131072
[Dense-Vision-Qwen3-VL-32B-Instruct]
m = unsloth/Qwen3-VL-32B-Instruct-GGUF/Qwen3-VL-32B-Instruct-UD-Q8_K_XL.gguf
mm = unsloth/Qwen3-VL-32B-Instruct-GGUF/mmproj-BF16.gguf
c = 131072
[Dense-Vision-Qwen3-VL-32B-Thinking]
m = unsloth/Qwen3-VL-32B-Thinking-GGUF/Qwen3-VL-32B-Thinking-UD-Q8_K_XL.gguf
mm = unsloth/Qwen3-VL-32B-Thinking-GGUF/mmproj-BF16.gguf
c = 131072
[MoE-Vision-Qwen3-VL-235B-A22B-Thinking]
m = unsloth/Qwen3-VL-235B-A22B-Thinking-GGUF/Qwen3-VL-235B-A22B-Thinking-UD-Q2_K_XL-00001-of-00002.gguf
mm = unsloth/Qwen3-VL-235B-A22B-Thinking-GGUF/mmproj-BF16.gguf
c = 65536
[MoE-Qwen3-Next-80B-A3B-Instruct]
m = unsloth/Qwen3-Next-80B-A3B-Instruct-GGUF/Qwen3-Next-80B-A3B-Instruct-UD-Q6_K_XL-00001-of-00002.gguf
c = 262144
[MoE-Qwen3-Next-80B-A3B-Thinking]
m = unsloth/Qwen3-Next-80B-A3B-Thinking-GGUF/Qwen3-Next-80B-A3B-Thinking-UD-Q6_K_XL-00001-of-00002.gguf
c = 262144
[MoE-Qwen3-30B-A3B-Instruct-2507]
m = unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF/Qwen3-30B-A3B-Instruct-2507-UD-Q8_K_XL.gguf
temp = 0.7
top-p = 0.8
top-k = 20
min-p = 0
c = 262144
[MoE-Qwen3-30B-A3B-Thinking-2507]
m = unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF/Qwen3-30B-A3B-Thinking-2507-UD-Q8_K_XL.gguf
temp = 0.6
top-p = 0.95
top-k = 20
min-p = 0
c = 262144
[MoE-Qwen3-Coder-30B-A3B-Instruct]
m = unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF/Qwen3-Coder-30B-A3B-Instruct-UD-Q8_K_XL.gguf
temp = 0.7
top-p = 0.8
top-k = 20
min-p = 0
c = 262144
[Dense-Devstral-2-123B-Instruct-2512]
m = unsloth/Devstral-2-123B-Instruct-2512-GGUF/Devstral-2-123B-Instruct-2512-UD-Q4_K_XL-00001-of-00002.gguf
c = 131072
[Dense-Vision-Devstral-Small-2-24B-Instruct-2512]
m = unsloth/Devstral-Small-2-24B-Instruct-2512-GGUF/Devstral-Small-2-24B-Instruct-2512-UD-Q6_K_XL.gguf
mm = unsloth/Devstral-Small-2-24B-Instruct-2512-GGUF/mmproj-BF16.gguf
; chat-template-file = unsloth/Devstral-Small-2-24B-Instruct-2512-GGUF/Devstral-Small-2-24B-Instruct-2512.jinja
c = 131072
[Dense-Vision-Ministral-3-14B-Instruct-2512]
m = unsloth/Ministral-3-14B-Instruct-2512-GGUF/Ministral-3-14B-Instruct-2512-UD-Q8_K_XL.gguf
mm = unsloth/Ministral-3-14B-Instruct-2512-GGUF/mmproj-BF16.gguf
c = 131072
[Dense-Vision-Ministral-3-14B-Reasoning-2512]
m = unsloth/Ministral-3-14B-Reasoning-2512-GGUF/Ministral-3-14B-Reasoning-2512-UD-Q8_K_XL.gguf
mm = unsloth/Ministral-3-14B-Reasoning-2512-GGUF/mmproj-BF16.gguf
c = 131072
[Dense-Uncensored-Dolphin-Mistral-24B-Venice-Edition]
m = bartowski/cognitivecomputations_Dolphin-Mistral-24B-Venice-Edition-GGUF/cognitivecomputations_Dolphin-Mistral-24B-Venice-Edition-Q8_0.gguf
c = 65536
[Dense-Uncensored-BlackSheep-24B]
m = mradermacher/BlackSheep-24B-i1-GGUF/BlackSheep-24B.Q8_0.gguf
c = 65536
[Dense-RP-Cydonia-24B-v4.1]
m = bartowski/TheDrummer_Cydonia-24B-v4.1-GGUF/TheDrummer_Cydonia-24B-v4.1-Q8_0.gguf
c = 65536
[Dense-Vision-Gemma-3-27B-IT]
m = unsloth/gemma-3-27b-it-GGUF/gemma-3-27b-it-Q6_K.gguf
mm = unsloth/gemma-3-27b-it-GGUF/mmproj-BF16.gguf
c = 131072
[Dense-RP-Big-Tiger-Gemma-27B-v3]
m = bartowski/TheDrummer_Big-Tiger-Gemma-27B-v3-GGUF/TheDrummer_Big-Tiger-Gemma-27B-v3-Q6_K.gguf
c = 131072
[MoE-GLM-4.5-Air-106B]
m = unsloth/GLM-4.5-Air-GGUF/GLM-4.5-Air-UD-Q5_K_XL-00001-of-00002.gguf
c = 131072
[MoE-INTELLECT-3-106B]
m = bartowski/PrimeIntellect_INTELLECT-3-GGUF/PrimeIntellect_INTELLECT-3-Q5_K_M-00001-of-00003.gguf
c = 131072
[MoE-Uncensored-GLM-4.5-Air-Derestricted-106B]
m = bartowski/ArliAI_GLM-4.5-Air-Derestricted-GGUF/ArliAI_GLM-4.5-Air-Derestricted-Q4_K_M-00001-of-00002.gguf
c = 131072
[MoE-Vision-GLM-4.6V-106B]
m = unsloth/GLM-4.6V-GGUF/GLM-4.6V-UD-Q5_K_XL-00001-of-00002.gguf
mm = unsloth/GLM-4.6V-GGUF/mmproj-BF16.gguf
c = 131072
[MoE-GLM-4.7-358B]
m = unsloth/GLM-4.7-GGUF/GLM-4.7-UD-Q3_K_XL-00001-of-00004.gguf
n-cpu-moe = 42
c = 32768
[MoE-Nemotron-3-Nano-30B-A3B]
m = unsloth/Nemotron-3-Nano-30B-A3B-GGUF/Nemotron-3-Nano-30B-A3B-UD-Q8_K_XL.gguf
temp = 0.6
top-p = 0.95
c = 131072
[MoE-Llama-4-Scout-17B-16E-Instruct-109B]
m = unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf
c = 131072
[Dense-Llama-3.3-70B-Instruct]
m = unsloth/Llama-3.3-70B-Instruct-GGUF/Llama-3.3-70B-Instruct-UD-Q6_K_XL-00001-of-00002.gguf
c = 131072
[Dense-Llama-3_3-Nemotron-Super-49B-v1_5]
m = unsloth/Llama-3_3-Nemotron-Super-49B-v1_5-GGUF/Llama-3_3-Nemotron-Super-49B-v1_5-UD-Q8_K_XL-00001-of-00002.gguf
c = 131072
[MoE-Ling-Flash-2.0-100B]
m = bartowski/inclusionAI_Ling-flash-2.0-GGUF/inclusionAI_Ling-flash-2.0-Q4_K_M-00001-of-00002.gguf
c = 131072
[MoE-MiniMax-M2.1-229B]
m = unsloth/MiniMax-M2.1-GGUF/MiniMax-M2.1-UD-Q2_K_XL-00001-of-00002.gguf
c = 98304
[Dense-Granite-4.0-H-Small]
m = unsloth/granite-4.0-h-small-GGUF/granite-4.0-h-small-UD-Q8_K_XL.gguf
c = 131072
[Dense-Command-A-Reasoning-08-2025-111B]
m = bartowski/CohereLabs_command-a-reasoning-08-2025-GGUF/CohereLabs_command-a-reasoning-08-2025-Q5_K_M-00001-of-00002.gguf
c = 262144
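With these presets, a client selects a model through the standard OpenAI-compatible API, presumably by putting the preset section name in the "model" field; with --models-max 1, llama-server keeps at most one model resident and swaps on demand. A minimal sketch, assuming the server is reachable locally on port 8082 as launched above:

# List the presets exposed by llama-server
curl -s http://127.0.0.1:8082/v1/models | jq '.data[].id'

# Chat completion against one preset; "model" selects the INI section
curl -s http://127.0.0.1:8082/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "MoE-Qwen3-30B-A3B-Instruct-2507",
        "messages": [{"role": "user", "content": "Hello"}]
      }'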
Podman Kubernetes manifest for LLM sandbox environment
apiVersion: v1
kind: Pod
metadata:
  name: pod
  labels:
    app: mcp-sandbox
spec:
  restartPolicy: Always
  containers:
    - name: container
      image: localhost/sandbox-image:clean
      ports:
        - containerPort: 22
          hostPort: 2222
      resources:
        limits:
          memory: 4Gi
          cpu: "8"
      volumeMounts:
        - name: inputs
          mountPath: /mnt/inputs
          readOnly: true
        - name: workspace
          mountPath: /mnt/workspace
        - name: outputs
          mountPath: /mnt/outputs
      env:
        - name: LANG
          value: C.UTF-8
        - name: LC_ALL
          value: C.UTF-8
      command:
        - /bin/bash
        - -c
        - |
          find /mnt/inputs -mindepth 1 -delete 2>/dev/null || true
          find /mnt/workspace -mindepth 1 -delete 2>/dev/null || true
          find /mnt/outputs -mindepth 1 -delete 2>/dev/null || true
          exec /usr/sbin/sshd -D
  volumes:
    - name: inputs
      hostPath:
        path: /var/www/ia/inputs
        type: Directory
    - name: workspace
      hostPath:
        path: /var/www/ia/workspace
        type: Directory
    - name: outputs
      hostPath:
        path: /var/www/ia/outputs
        type: Directory
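A minimal sketch of bringing the sandbox up and reaching it over SSH, assuming the manifest above is saved as sandbox.yaml (the name referenced in the Containerfile below) and that the hostPath directories exist on the host:

# Host-side mount points expected by the hostPath volumes
mkdir -p /var/www/ia/inputs /var/www/ia/workspace /var/www/ia/outputs

# Start the pod from the manifest (podman kube down tears it back down)
podman kube play sandbox.yaml

# SSH into the sandbox; the root password "sandbox" is set in the Containerfile
ssh -p 2222 root@127.0.0.1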
Containerfile image template
Designed to be intuitive for all models in order to minimize agentic-loop calls; includes a full development toolchain for the major programming languages
# MCP Sandbox Clean Image
# Build: su - $HOST_USER -c 'podman build -t localhost/sandbox-image:clean -f Containerfile .'
FROM docker.io/library/debian:bookworm-slim
# Environment variables
ENV LANG=C.UTF-8 \
LC_ALL=C.UTF-8 \
PATH="/root/.cargo/bin:/usr/local/go/bin:${PATH}"
# Install base packages (exact order from sandbox.yaml)
RUN apt update && apt install -y \
openssh-server \
file \
sudo \
git \
build-essential \
cmake ccache \
python3 python3-pip \
curl wget \
nano vim \
zip unzip p7zip-full \
tree jq \
ripgrep \
figlet
# Install Node.js 20.x
RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash -
RUN apt install -y nodejs
# Install Rust (stable)
RUN curl --proto "=https" --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
# Install Go 1.25.4
RUN curl -sSL https://go.dev/dl/go1.25.4.linux-amd64.tar.gz | tar -C /usr/local -xz
# Final cleanup
RUN apt clean
RUN apt update
# Configure SSH
RUN echo "root:sandbox" | chpasswd \
&& sed -i "s/#PermitRootLogin.*/PermitRootLogin yes/" /etc/ssh/sshd_config \
&& mkdir -p /run/sshd
# Create mount points
RUN mkdir -p /mnt/inputs /mnt/workspace /mnt/outputs
# Working directory
WORKDIR /mnt/workspace
# Default command
CMD ["/usr/sbin/sshd", "-D"]
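A hedged smoke test of the toolchain once the pod is running. Dockerfile ENV is not inherited by SSH sessions, so Go and Rust are invoked by their full install paths here:

ssh -p 2222 root@127.0.0.1 '
  gcc --version | head -n1
  cmake --version | head -n1
  python3 --version
  node --version
  /usr/local/go/bin/go version
  /root/.cargo/bin/cargo --version
'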
Apache HTTP Server VirtualHost configuration for this dedicated LLM server
Security note: directories writable by users or models must obviously never allow code execution on the host system!
<VirtualHost *:80>
# The ServerName directive sets the request scheme, hostname and port that
# the server uses to identify itself. This is used when creating
# redirection URLs. In the context of virtual hosts, the ServerName
# specifies what hostname must appear in the request's Host: header to
# match this virtual host. For the default virtual host (this file) this
# value is not decisive as it is used as a last resort host regardless.
# However, you must set it for any further virtual host explicitly.
#ServerName www.example.com
#ServerAdmin webmaster@localhost
#DocumentRoot /var/www/html
ServerAdmin admin@serveurperso.com
DocumentRoot /var/www
<Directory /var/www/>
AllowOverride AuthConfig FileInfo Limit Options Indexes
</Directory>
# Available loglevels: trace8, ..., trace1, debug, info, notice, warn,
# error, crit, alert, emerg.
# It is also possible to configure the loglevel for particular
# modules, e.g.
#LogLevel info ssl:warn
ErrorLog ${APACHE_LOG_DIR}/error.log
CustomLog ${APACHE_LOG_DIR}/access.log combined
# For most configuration files from conf-available/, which are
# enabled or disabled at a global level, it is possible to
# include a line for only one particular virtual host. For example the
# following line enables the CGI configuration for this host only
# after it has been globally disabled with "a2disconf".
#Include conf-available/serve-cgi-bin.conf
# Llama.cpp llama-server (for OpenAI and Anthropic clients)
<Location /ia/webui/v1>
ProxyPass "http://127.0.0.1:8082/v1"
ProxyPassReverse "http://127.0.0.1:8082/v1"
</Location>
# Llama.cpp llama-server (for Svelte)
<Location /ia/webui/props>
ProxyPass "http://127.0.0.1:8082/props"
ProxyPassReverse "http://127.0.0.1:8082/props"
</Location>
<Location /ia/webui/slots>
ProxyPass "http://127.0.0.1:8082/slots"
ProxyPassReverse "http://127.0.0.1:8082/slots"
</Location>
<Location /ia/webui/models>
ProxyPass "http://127.0.0.1:8082/models"
ProxyPassReverse "http://127.0.0.1:8082/models"
</Location>
# Backend MCP bridge (for non-MCP-aware OpenAI clients and bots) -> to llama-server
<Location /ia/v1>
ProxyPass "http://127.0.0.1:8080/v1"
ProxyPassReverse "http://127.0.0.1:8080/v1"
</Location>
# Hugging Face chat-ui
<Location /ia/chatui>
ProxyPass "http://127.0.0.1:3000/ia/chatui"
ProxyPassReverse "http://127.0.0.1:3000/ia/chatui"
</Location>
# GGUF viewer backend (in development)
<Location /ia/gguf/api>
ProxyPass "http://127.0.0.1:8090/api"
ProxyPassReverse "http://127.0.0.1:8090/api"
</Location>
# MCP servers -> to LLM sandbox
<Location /ia/mcp-streamable-http>
ProxyPass "http://127.0.0.1:8083"
ProxyPassReverse "http://127.0.0.1:8083"
</Location>
<Location /ia/mcp-websocket>
ProxyPass "ws://127.0.0.1:8084"
ProxyPassReverse "ws://127.0.0.1:8084"
</Location>
# Sandbox mount points security
<DirectoryMatch "^/var/www/ia/(inputs|workspace|outputs)">
php_admin_flag engine off
Options -ExecCGI -Includes
RemoveHandler *
RemoveType *
# DefaultType is ignored since Apache 2.4; ForceType serves everything as inert bytes
ForceType application/octet-stream
AllowOverride None
Require all granted
</DirectoryMatch>
</VirtualHost>
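The ProxyPass and ws:// locations above rely on Apache's proxy modules, and the php_admin_flag line additionally assumes mod_php is loaded. A hedged sketch of enabling the modules on a Debian-style layout and checking one proxied endpoint:

# Enable the proxy modules used by the reverse-proxy and WebSocket locations
a2enmod proxy proxy_http proxy_wstunnel
systemctl reload apache2

# Quick check that llama-server's OpenAI endpoint answers through Apache
curl -s http://127.0.0.1/ia/webui/v1/models | jq '.data[].id'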