Development & integration server for AI (LLMs)
WebUI for Georgi Gerganov's llama.cpp
using native multi-model support from Xuan-Son Nguyen's llama-server
Current hardware configuration
- Ryzen 9 9950X, 128 GB DDR5 6000 MT/s, RTX 5090 FE Blackwell GB202 32 GB GDDR7
- Ryzen 9 9950X3D, 96 GB DDR5 6600 MT/s, RTX PRO 6000 Blackwell GB202 96 GB GDDR7
; llama-server --host 0.0.0.0 --port 8082 --models-max 1 --models-preset backend.ini --webui-config-file frontend.json
; Global defaults applied to every model section below (per-section keys override these).
[*]
fit = off ; Disable automatic memory fitting (default: on)
ngl = 999 ; Full GPU offload
ctk = q8_0 ; KV cache key quantization
ctv = q8_0 ; KV cache value quantization
fa = on ; Enable flash attention
mlock = on ; Lock model in RAM
np = 4 ; Parallel request batching (default: 4)
kvu = on ; Unified KV cache buffer (default: on)
stop-timeout = 2 ; Force-kill child process after graceful shutdown timeout in seconds (default: 10)
sleep-idle-seconds = 60 ; Unload model weights after the child process has been idle this many seconds
; NOTE(review): logical batch (128) is below physical batch (512); llama.cpp clamps
; n_ubatch to n_batch, so the effective physical batch is 128 - confirm intended.
b = 128 ; Logical maximum batch size (default: 2048)
ub = 512 ; Physical maximum batch size (default: 512)
webui-mcp-proxy = on ; Enabled for llama-server issue debugging, not exposed to the internet
; slot-save-path = /var/www/ia/kvcache
; Model presets - one section per served model.
; Key legend: m = GGUF path, mm = multimodal projector, c = context size in tokens,
; tags = WebUI grouping labels, temp/top-p/top-k/min-p = per-model sampler defaults.
[MoE-GPT-OSS-120B]
m = lmstudio-community/gpt-oss-120b-GGUF/gpt-oss-120b-MXFP4-00001-of-00002.gguf
; load-on-startup = 1 ; Load immediately on server startup
c = 131072 ; Context size in tokens for this model
; Also expose this model under OpenAI-style names for client compatibility
alias = gpt-4, gpt-4o
[MoE-GPT-OSS-20B]
m = lmstudio-community/gpt-oss-20b-GGUF/gpt-oss-20b-MXFP4.gguf
c = 131072
[MoE-Ling-Flash-2.0-100B]
m = bartowski/inclusionAI_Ling-flash-2.0-GGUF/inclusionAI_Ling-flash-2.0-Q6_K-00001-of-00003.gguf
; YaRN RoPE scaling: 4x the 32768-token original context -> 131072 tokens
rope-scaling = yarn
rope-scale = 4.0
yarn-orig-ctx = 32768
; Override GGUF metadata so the advertised context matches the YaRN-extended window
override-kv = bailingmoe2.context_length=int:131072
c = 131072
[MoE-Solar-Open-100B]
m = mradermacher/Solar-Open-100B-i1-GGUF/Solar-Open-100B.i1-Q6_K.gguf
c = 131072
[Dense-Qwen3-32B]
m = unsloth/Qwen3-32B-GGUF/Qwen3-32B-UD-Q8_K_XL.gguf
c = 131072
[Dense-Vision-Qwen3.5-27B]
m = unsloth/Qwen3.5-27B-GGUF/Qwen3.5-27B-UD-Q8_K_XL.gguf
mm = unsloth/Qwen3.5-27B-GGUF/mmproj-BF16.gguf
c = 262144
[Dense-Vision-Qwen3-VL-32B-Instruct]
m = unsloth/Qwen3-VL-32B-Instruct-GGUF/Qwen3-VL-32B-Instruct-UD-Q8_K_XL.gguf
mm = unsloth/Qwen3-VL-32B-Instruct-GGUF/mmproj-BF16.gguf
c = 131072
tags = Instant
[Dense-Vision-Qwen3-VL-32B-Thinking]
m = unsloth/Qwen3-VL-32B-Thinking-GGUF/Qwen3-VL-32B-Thinking-UD-Q8_K_XL.gguf
mm = unsloth/Qwen3-VL-32B-Thinking-GGUF/mmproj-BF16.gguf
c = 131072
tags = Thinking
[MoE-Vision-Qwen3-VL-30B-A3B-Instruct]
m = unsloth/Qwen3-VL-30B-A3B-Instruct-GGUF/Qwen3-VL-30B-A3B-Instruct-UD-Q8_K_XL.gguf
mm = unsloth/Qwen3-VL-30B-A3B-Instruct-GGUF/mmproj-BF16.gguf
c = 262144
tags = Instant
[MoE-Vision-Qwen3-VL-30B-A3B-Thinking]
m = unsloth/Qwen3-VL-30B-A3B-Thinking-GGUF/Qwen3-VL-30B-A3B-Thinking-UD-Q8_K_XL.gguf
mm = unsloth/Qwen3-VL-30B-A3B-Thinking-GGUF/mmproj-BF16.gguf
c = 262144
tags = Thinking
[MoE-Qwen3-235B-A22B-Instruct-2507]
m = unsloth/Qwen3-235B-A22B-Instruct-2507-GGUF/Qwen3-235B-A22B-Instruct-2507-UD-Q2_K_XL-00001-of-00002.gguf
c = 98304
tags = Instant
[MoE-Qwen3-235B-A22B-Thinking-2507]
m = unsloth/Qwen3-235B-A22B-Thinking-2507-GGUF/Qwen3-235B-A22B-Thinking-2507-UD-Q2_K_XL-00001-of-00002.gguf
c = 98304
tags = Thinking
[MoE-Vision-Qwen3-VL-235B-A22B-Instruct]
m = unsloth/Qwen3-VL-235B-A22B-Instruct-GGUF/Qwen3-VL-235B-A22B-Instruct-UD-Q2_K_XL-00001-of-00002.gguf
mm = unsloth/Qwen3-VL-235B-A22B-Instruct-GGUF/mmproj-BF16.gguf
c = 98304
tags = Instant
[MoE-Vision-Qwen3-VL-235B-A22B-Thinking]
m = unsloth/Qwen3-VL-235B-A22B-Thinking-GGUF/Qwen3-VL-235B-A22B-Thinking-UD-Q2_K_XL-00001-of-00002.gguf
mm = unsloth/Qwen3-VL-235B-A22B-Thinking-GGUF/mmproj-BF16.gguf
c = 98304
tags = Thinking
[MoE-Qwen3-Next-80B-A3B-Instruct]
m = unsloth/Qwen3-Next-80B-A3B-Instruct-GGUF/Qwen3-Next-80B-A3B-Instruct-UD-Q6_K_XL-00001-of-00002.gguf
c = 262144
tags = Instant
[MoE-Qwen3-Next-80B-A3B-Thinking]
m = unsloth/Qwen3-Next-80B-A3B-Thinking-GGUF/Qwen3-Next-80B-A3B-Thinking-UD-Q6_K_XL-00001-of-00002.gguf
c = 262144
tags = Thinking
[MoE-Qwen3-Coder-Next-80B-A3B]
m = unsloth/Qwen3-Coder-Next-GGUF/Qwen3-Coder-Next-UD-Q6_K_XL-00001-of-00003.gguf
; Per-model sampler defaults (override server-wide sampling)
temp = 1
top-p = 0.95
top-k = 40
c = 262144
[MoE-Qwen3-30B-A3B-Instruct-2507]
m = unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF/Qwen3-30B-A3B-Instruct-2507-UD-Q8_K_XL.gguf
temp = 0.7
top-p = 0.8
top-k = 20
min-p = 0
c = 262144
tags = Instant
[MoE-Qwen3-30B-A3B-Thinking-2507]
m = unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF/Qwen3-30B-A3B-Thinking-2507-UD-Q8_K_XL.gguf
temp = 0.6
top-p = 0.95
top-k = 20
min-p = 0
c = 262144
tags = Thinking
[MoE-Qwen3-Coder-30B-A3B-Instruct]
m = unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF/Qwen3-Coder-30B-A3B-Instruct-UD-Q8_K_XL.gguf
temp = 0.7
top-p = 0.8
top-k = 20
min-p = 0
c = 262144
[MoE-Vision-Qwen3.5-35B-A3B]
m = unsloth/Qwen3.5-35B-A3B-GGUF/Qwen3.5-35B-A3B-UD-Q8_K_XL.gguf
mm = unsloth/Qwen3.5-35B-A3B-GGUF/mmproj-BF16.gguf
c = 262144
[MoE-Vision-Qwen3.5-122B-A10B]
m = unsloth/Qwen3.5-122B-A10B-GGUF/Qwen3.5-122B-A10B-UD-Q4_K_XL-00001-of-00003.gguf
mm = unsloth/Qwen3.5-122B-A10B-GGUF/mmproj-BF16.gguf
c = 262144
[Dense-Devstral-2-123B-Instruct-2512]
m = unsloth/Devstral-2-123B-Instruct-2512-GGUF/Devstral-2-123B-Instruct-2512-UD-Q4_K_XL-00001-of-00002.gguf
c = 131072
[Dense-Vision-Devstral-Small-2-24B-Instruct-2512]
m = unsloth/Devstral-Small-2-24B-Instruct-2512-GGUF/Devstral-Small-2-24B-Instruct-2512-UD-Q6_K_XL.gguf
mm = unsloth/Devstral-Small-2-24B-Instruct-2512-GGUF/mmproj-BF16.gguf
; chat-template-file = unsloth/Devstral-Small-2-24B-Instruct-2512-GGUF/Devstral-Small-2-24B-Instruct-2512.jinja
c = 131072
[Dense-Vision-Ministral-3-14B-Instruct-2512]
m = unsloth/Ministral-3-14B-Instruct-2512-GGUF/Ministral-3-14B-Instruct-2512-UD-Q8_K_XL.gguf
mm = unsloth/Ministral-3-14B-Instruct-2512-GGUF/mmproj-BF16.gguf
c = 131072
[Dense-Vision-Ministral-3-14B-Reasoning-2512]
m = unsloth/Ministral-3-14B-Reasoning-2512-GGUF/Ministral-3-14B-Reasoning-2512-UD-Q8_K_XL.gguf
mm = unsloth/Ministral-3-14B-Reasoning-2512-GGUF/mmproj-BF16.gguf
c = 131072
[Dense-Uncensored-Dolphin-Mistral-24B-Venice-Edition]
m = bartowski/cognitivecomputations_Dolphin-Mistral-24B-Venice-Edition-GGUF/cognitivecomputations_Dolphin-Mistral-24B-Venice-Edition-Q8_0.gguf
c = 65536
[Dense-Uncensored-BlackSheep-24B]
m = mradermacher/BlackSheep-24B-i1-GGUF/BlackSheep-24B.Q8_0.gguf
c = 65536
[Dense-RP-Cydonia-24B-v4.1]
m = bartowski/TheDrummer_Cydonia-24B-v4.1-GGUF/TheDrummer_Cydonia-24B-v4.1-Q8_0.gguf
c = 65536
[Dense-Vision-Gemma-3-27B-IT]
m = unsloth/gemma-3-27b-it-GGUF/gemma-3-27b-it-Q6_K.gguf
mm = unsloth/gemma-3-27b-it-GGUF/mmproj-BF16.gguf
c = 131072
[Dense-RP-Big-Tiger-Gemma-27B-v3]
m = bartowski/TheDrummer_Big-Tiger-Gemma-27B-v3-GGUF/TheDrummer_Big-Tiger-Gemma-27B-v3-Q6_K.gguf
c = 131072
[MoE-GLM-4.5-Air-106B]
m = unsloth/GLM-4.5-Air-GGUF/GLM-4.5-Air-UD-Q5_K_XL-00001-of-00002.gguf
c = 131072
[MoE-INTELLECT-3-106B]
m = bartowski/PrimeIntellect_INTELLECT-3-GGUF/PrimeIntellect_INTELLECT-3-Q5_K_M-00001-of-00003.gguf
c = 131072
[MoE-Uncensored-GLM-4.5-Air-Derestricted-106B]
m = bartowski/ArliAI_GLM-4.5-Air-Derestricted-GGUF/ArliAI_GLM-4.5-Air-Derestricted-Q4_K_M-00001-of-00002.gguf
c = 131072
[MoE-Vision-GLM-4.6V-106B]
m = unsloth/GLM-4.6V-GGUF/GLM-4.6V-UD-Q5_K_XL-00001-of-00002.gguf
mm = unsloth/GLM-4.6V-GGUF/mmproj-BF16.gguf
c = 131072
[MoE-GLM-4.7-Flash-30B-A3B]
m = unsloth/GLM-4.7-Flash-GGUF/GLM-4.7-Flash-UD-Q8_K_XL.gguf
temp = 0.7
top-p = 1.0
min-p = 0.01
c = 202752
; Two quantizations of the same 358B model: IQ1_S runs fully on GPU with a small
; context; Q3_K_XL offloads part of the MoE experts to CPU (n-cpu-moe below)
[MoE-GLM-4.7-358B-IQ1_S]
m = unsloth/GLM-4.7-GGUF/GLM-4.7-UD-IQ1_S-00001-of-00002.gguf
c = 16384
tags = GPU-IQ1_S
[MoE-GLM-4.7-358B-Q3_K_XL]
m = unsloth/GLM-4.7-GGUF/GLM-4.7-UD-Q3_K_XL-00001-of-00004.gguf
; Keep the MoE expert weights of 42 layers on CPU so the rest fits in VRAM
n-cpu-moe = 42
c = 32768
tags = CPU-Q3_K_XL
[MoE-Nemotron-3-Nano-30B-A3B]
m = unsloth/Nemotron-3-Nano-30B-A3B-GGUF/Nemotron-3-Nano-30B-A3B-UD-Q8_K_XL.gguf
temp = 0.6
top-p = 0.95
c = 262144
[MoE-Llama-4-Scout-17B-16E-Instruct-109B]
m = unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf
c = 131072
[Dense-Llama-3.3-70B-Instruct]
m = unsloth/Llama-3.3-70B-Instruct-GGUF/Llama-3.3-70B-Instruct-UD-Q6_K_XL-00001-of-00002.gguf
c = 131072
[Dense-Llama-3_3-Nemotron-Super-49B-v1_5]
m = unsloth/Llama-3_3-Nemotron-Super-49B-v1_5-GGUF/Llama-3_3-Nemotron-Super-49B-v1_5-UD-Q8_K_XL-00001-of-00002.gguf
c = 131072
[MoE-MiniMax-M2.1-229B]
m = unsloth/MiniMax-M2.1-GGUF/MiniMax-M2.1-UD-Q2_K_XL-00001-of-00002.gguf
c = 98304
[Dense-Granite-4.0-H-Small-32B]
m = unsloth/granite-4.0-h-small-GGUF/granite-4.0-h-small-UD-Q8_K_XL.gguf
c = 131072
[Dense-Command-A-Reasoning-08-2025-111B]
m = bartowski/CohereLabs_command-a-reasoning-08-2025-GGUF/CohereLabs_command-a-reasoning-08-2025-Q5_K_M-00001-of-00002.gguf
c = 262144
Podman Kubernetes manifest for LLM sandbox environment
# Podman "kube play" pod: SSH-accessible sandbox container for LLM tool use.
# NOTE(review): the source paste had all YAML nesting flattened to column 0,
# which is invalid YAML; conventional 2-space indentation restored below.
apiVersion: v1
kind: Pod
metadata:
  name: pod
  labels:
    app: mcp-sandbox
spec:
  restartPolicy: Always
  containers:
    - name: container
      image: localhost/sandbox-image:clean
      ports:
        # SSH inside the container, reachable on host port 2222
        - containerPort: 22
          hostPort: 2222
      resources:
        limits:
          # Podman CDI device request (not a standard Kubernetes resource name);
          # NOTE(review): verify this syntax against the installed podman version
          nvidia.com/gpu=all: 1
          #memory: 4Gi
          #cpu: "2"
      volumeMounts:
        - name: inputs
          mountPath: /mnt/inputs
          readOnly: true
        - name: workspace
          mountPath: /mnt/workspace
        - name: outputs
          mountPath: /mnt/outputs
      env:
        - name: LANG
          value: C.UTF-8
        - name: LC_ALL
          value: C.UTF-8
      command:
        - /bin/bash
        - -c
        - |
          find /mnt/inputs -mindepth 1 -delete 2>/dev/null || true
          #find /mnt/workspace -mindepth 1 -delete 2>/dev/null || true
          find /mnt/outputs -mindepth 1 -delete 2>/dev/null || true
          exec /usr/sbin/sshd -D
  volumes:
    - name: inputs
      hostPath:
        path: /var/www/ia/inputs
        type: Directory
    - name: workspace
      hostPath:
        path: /var/www/ia/workspace
        type: Directory
    - name: outputs
      hostPath:
        path: /var/www/ia/outputs
        type: Directory
Containerfile image template
Designed to be intuitive for all models in order to minimize agentic-loop calls; includes a full development toolchain for the major programming languages
# MCP Sandbox Clean Image
# Build: su - $HOST_USER -c 'podman build -t localhost/sandbox-image:clean -f Containerfile .'
FROM docker.io/library/debian:trixie-slim
# Environment variables
# PATH/LD_LIBRARY_PATH are pre-wired for Rust (cargo), Go and the CUDA toolkit installed below
ENV LANG=C.UTF-8 \
LC_ALL=C.UTF-8 \
PATH="/root/.cargo/bin:/usr/local/go/bin:/usr/local/cuda/bin:${PATH}" \
LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
# Install base packages (exact order from sandbox.yaml)
# NOTE(review): "apt" warns that its CLI is not stable for scripts; "apt-get" is
# the recommended tool in Dockerfiles - behavior is otherwise the same here
RUN apt update && apt install -y \
openssh-server \
file \
sudo \
git \
build-essential \
cmake ccache \
python3 python3-pip \
curl wget \
nano vim \
zip unzip p7zip-full \
tree jq \
ripgrep \
figlet \
gnupg2
# Install CUDA Toolkit via NVIDIA repo
RUN curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/debian13/x86_64/8793F200.pub | gpg --dearmor -o /usr/share/keyrings/cuda-archive-keyring.gpg \
&& echo "deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] https://developer.download.nvidia.com/compute/cuda/repos/debian13/x86_64/ /" > /etc/apt/sources.list.d/cuda-debian13-x86_64.list \
&& apt update \
&& apt install -y cuda-toolkit-13-1
# Install Node.js 24.x
RUN curl -fsSL https://deb.nodesource.com/setup_24.x | bash -
RUN apt install -y nodejs
# Install Rust (stable)
RUN curl --proto "=https" --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
# Install Go 1.25.4
RUN curl -sSL https://go.dev/dl/go1.25.4.linux-amd64.tar.gz | tar -C /usr/local -xz
# Final cleanup
# NOTE(review): "apt clean" in its own layer does not shrink earlier layers, and the
# following "apt update" re-creates /var/lib/apt/lists - presumably intentional so the
# sandbox can "apt install" immediately at runtime; if image size matters more, drop
# the update and add "rm -rf /var/lib/apt/lists/*" instead - confirm intent
RUN apt clean
RUN apt update
# Configure SSH
# NOTE(review): root password login ("root:sandbox") with PermitRootLogin yes - safe
# only while host port 2222 stays unreachable from untrusted networks; verify exposure
RUN echo "root:sandbox" | chpasswd \
&& sed -i "s/#PermitRootLogin.*/PermitRootLogin yes/" /etc/ssh/sshd_config \
&& sed -i 's/^AcceptEnv LANG LC_\*/#AcceptEnv LANG LC_*/' /etc/ssh/sshd_config \
&& mkdir -p /run/sshd
# Create mount points
RUN mkdir -p /mnt/inputs /mnt/workspace /mnt/outputs
# Working directory
WORKDIR /mnt/workspace
# Default command
CMD ["/usr/sbin/sshd", "-D"]
Apache HTTP Server VirtualHost configuration for this dedicated LLM server
Security note: directories writable by users or models must obviously never allow code execution on the host system!
<VirtualHost *:80>
	# The ServerName directive sets the request scheme, hostname and port that
	# the server uses to identify itself. This is used when creating
	# redirection URLs. In the context of virtual hosts, the ServerName
	# specifies what hostname must appear in the request's Host: header to
	# match this virtual host. For the default virtual host (this file) this
	# value is not decisive as it is used as a last resort host regardless.
	# However, you must set it for any further virtual host explicitly.
	#ServerName www.example.com

	#ServerAdmin webmaster@localhost
	#DocumentRoot /var/www/html
	ServerAdmin admin@serveurperso.com
	DocumentRoot /var/www
	<Directory /var/www/>
		AllowOverride AuthConfig FileInfo Limit Options Indexes
	</Directory>

	# Available loglevels: trace8, ..., trace1, debug, info, notice, warn,
	# error, crit, alert, emerg.
	# It is also possible to configure the loglevel for particular
	# modules, e.g.
	#LogLevel info ssl:warn
	ErrorLog ${APACHE_LOG_DIR}/error.log
	CustomLog ${APACHE_LOG_DIR}/access.log combined

	# For most configuration files from conf-available/, which are
	# enabled or disabled at a global level, it is possible to
	# include a line for only one particular virtual host. For example the
	# following line enables the CGI configuration for this host only
	# after it has been globally disabled with "a2disconf".
	#Include conf-available/serve-cgi-bin.conf

	# Llama.cpp llama-server (for OpenAI and Anthropic clients)
	<Location /ia/webui/v1>
		ProxyPass "http://127.0.0.1:8082/v1"
		ProxyPassReverse "http://127.0.0.1:8082/v1"
	</Location>
	# Llama.cpp llama-server (for Svelte)
	<Location /ia/webui/props>
		ProxyPass "http://127.0.0.1:8082/props"
		ProxyPassReverse "http://127.0.0.1:8082/props"
	</Location>
	<Location /ia/webui/slots>
		ProxyPass "http://127.0.0.1:8082/slots"
		ProxyPassReverse "http://127.0.0.1:8082/slots"
	</Location>
	<Location /ia/webui/models>
		ProxyPass "http://127.0.0.1:8082/models"
		ProxyPassReverse "http://127.0.0.1:8082/models"
	</Location>
	# Backend MCP bridge (for non-MCP-aware OpenAI clients and bots) -> to llama-server
	<Location /ia/v1>
		ProxyPass "http://127.0.0.1:8080/v1"
		ProxyPassReverse "http://127.0.0.1:8080/v1"
	</Location>
	# Hugging Face chat-ui
	<Location /ia/chatui>
		ProxyPass "http://127.0.0.1:3000/ia/chatui"
		ProxyPassReverse "http://127.0.0.1:3000/ia/chatui"
	</Location>
	# GGUF viewer backend (in development)
	<Location /ia/gguf/api>
		ProxyPass "http://127.0.0.1:8090/api"
		ProxyPassReverse "http://127.0.0.1:8090/api"
	</Location>
	# MCP servers -> to LLM sandbox
	<Location /ia/mcp-streamable-http>
		ProxyPass "http://127.0.0.1:8083"
		ProxyPassReverse "http://127.0.0.1:8083"
	</Location>
	<Location /ia/mcp-websocket>
		ProxyPass "ws://127.0.0.1:8084"
		ProxyPassReverse "ws://127.0.0.1:8084"
	</Location>
	# Xterm.js
	<Location /ia/term>
		ProxyPass "ws://127.0.0.1:8888"
		ProxyPassReverse "ws://127.0.0.1:8888"
	</Location>
	# Sandbox mount points security: serve everything in the writable
	# sandbox directories as inert static data, never execute it
	<DirectoryMatch "^/var/www/ia/(inputs|workspace|outputs)">
		# Requires mod_php to be loaded; Apache refuses to start otherwise
		php_admin_flag engine off
		Options -ExecCGI -Includes
		# Force the core static-file handler so no script handler can run here
		SetHandler default-handler
		RemoveHandler *
		RemoveType *
		# Fix: DefaultType has no effect in Apache 2.4 (only "None" is accepted,
		# anything else just logs a warning); ForceType actually serves all
		# files in these directories as opaque binary data
		ForceType application/octet-stream
		AllowOverride None
		Require all granted
	</DirectoryMatch>
</VirtualHost>