Development & integration server for AI (LLMs)

WebUI for Georgi Gerganov's llama.cpp using native multi-model support from Xuan-Son Nguyen's llama-server

- Aleksander Grygier's Svelte UI (llama.cpp native) with the zero-diff SSE-to-WebSocket tunnel that bypasses corporate proxy buffering

- Victor Muštar's Svelte UI (self-hosted instance of Hugging Face Chat)

- Oleg Shulyakov's React UI (llama.cpp legacy fork) with the zero-diff SSE-to-WebSocket tunnel

- Pascal's GGUF Viewer (in development)

Optimized configuration for 96 GB VRAM (Ryzen 9 9950X3D, 96 GB DDR5 6600 MT/s, RTX PRO 6000 Blackwell GB202 GDDR7)

GGUF quantizations from unsloth, bartowski, mradermacher, and lmstudio-community
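
The model paths in the preset below follow an organization/repository/file layout. A minimal sketch of fetching one quantization with huggingface-cli, assuming the GGUF files are collected under a hypothetical /var/models root that llama-server resolves the relative paths against:

# Fetch one quantization into a matching org/repo/file layout
# (/var/models is a hypothetical models root, adjust to your setup)
huggingface-cli download unsloth/Qwen3-32B-GGUF \
    Qwen3-32B-UD-Q8_K_XL.gguf \
    --local-dir /var/models/unsloth/Qwen3-32B-GGUF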

; llama-server --port 8082 --models-max 1 --models-preset backend.ini --webui-config-file frontend.json

[*]
fit = off                 ; Disable automatic memory fitting
ngl = 999                 ; Full GPU offload
ctk = q8_0                ; KV cache key quantization
ctv = q8_0                ; KV cache value quantization
fa = on                   ; Enable flash attention
mlock = on                ; Lock model in RAM
np = 4                    ; Number of parallel slots for concurrent requests
kvu = on                  ; Unified KV cache buffer
stop-timeout = 2          ; Force-kill child process after graceful shutdown timeout in seconds (default: 10)
sleep-idle-seconds = 3600 ; Unload model weights from the idle child process after this many seconds
b = 128                   ; Logical maximum batch size (default: 2048)
ub = 512                  ; Physical maximum batch size (default: 512, clamped to b)

[MoE-GPT-OSS-120B]
m = lmstudio-community/gpt-oss-120b-GGUF/gpt-oss-120b-MXFP4-00001-of-00002.gguf
; load-on-startup = 1       ; Load immediately on server startup
c = 131072                ; Context size in tokens for this model

[MoE-GPT-OSS-20B]
m = lmstudio-community/gpt-oss-20b-GGUF/gpt-oss-20b-MXFP4.gguf
c = 131072

[Dense-Qwen3-32B]
m = unsloth/Qwen3-32B-GGUF/Qwen3-32B-UD-Q8_K_XL.gguf
c = 131072

[Dense-Vision-Qwen3-VL-32B-Instruct]
m = unsloth/Qwen3-VL-32B-Instruct-GGUF/Qwen3-VL-32B-Instruct-UD-Q8_K_XL.gguf
mm = unsloth/Qwen3-VL-32B-Instruct-GGUF/mmproj-BF16.gguf
c = 131072

[Dense-Vision-Qwen3-VL-32B-Thinking]
m = unsloth/Qwen3-VL-32B-Thinking-GGUF/Qwen3-VL-32B-Thinking-UD-Q8_K_XL.gguf
mm = unsloth/Qwen3-VL-32B-Thinking-GGUF/mmproj-BF16.gguf
c = 131072

[MoE-Vision-Qwen3-VL-235B-A22B-Thinking]
m = unsloth/Qwen3-VL-235B-A22B-Thinking-GGUF/Qwen3-VL-235B-A22B-Thinking-UD-Q2_K_XL-00001-of-00002.gguf
mm = unsloth/Qwen3-VL-235B-A22B-Thinking-GGUF/mmproj-BF16.gguf
c = 65536

[MoE-Qwen3-Next-80B-A3B-Instruct]
m = unsloth/Qwen3-Next-80B-A3B-Instruct-GGUF/Qwen3-Next-80B-A3B-Instruct-UD-Q6_K_XL-00001-of-00002.gguf
c = 262144

[MoE-Qwen3-Next-80B-A3B-Thinking]
m = unsloth/Qwen3-Next-80B-A3B-Thinking-GGUF/Qwen3-Next-80B-A3B-Thinking-UD-Q6_K_XL-00001-of-00002.gguf
c = 262144

[MoE-Qwen3-30B-A3B-Instruct-2507]
m = unsloth/Qwen3-30B-A3B-Instruct-2507-GGUF/Qwen3-30B-A3B-Instruct-2507-UD-Q8_K_XL.gguf
temp = 0.7
top-p = 0.8
top-k = 20
min-p = 0
c = 262144

[MoE-Qwen3-30B-A3B-Thinking-2507]
m = unsloth/Qwen3-30B-A3B-Thinking-2507-GGUF/Qwen3-30B-A3B-Thinking-2507-UD-Q8_K_XL.gguf
temp = 0.6
top-p = 0.95
top-k = 20
min-p = 0
c = 262144

[MoE-Qwen3-Coder-30B-A3B-Instruct]
m = unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF/Qwen3-Coder-30B-A3B-Instruct-UD-Q8_K_XL.gguf
temp = 0.7
top-p = 0.8
top-k = 20
min-p = 0
c = 262144

[Dense-Devstral-2-123B-Instruct-2512]
m = unsloth/Devstral-2-123B-Instruct-2512-GGUF/Devstral-2-123B-Instruct-2512-UD-Q4_K_XL-00001-of-00002.gguf
c = 131072

[Dense-Vision-Devstral-Small-2-24B-Instruct-2512]
m = unsloth/Devstral-Small-2-24B-Instruct-2512-GGUF/Devstral-Small-2-24B-Instruct-2512-UD-Q6_K_XL.gguf
mm = unsloth/Devstral-Small-2-24B-Instruct-2512-GGUF/mmproj-BF16.gguf
; chat-template-file = unsloth/Devstral-Small-2-24B-Instruct-2512-GGUF/Devstral-Small-2-24B-Instruct-2512.jinja
c = 131072

[Dense-Vision-Ministral-3-14B-Instruct-2512]
m = unsloth/Ministral-3-14B-Instruct-2512-GGUF/Ministral-3-14B-Instruct-2512-UD-Q8_K_XL.gguf
mm = unsloth/Ministral-3-14B-Instruct-2512-GGUF/mmproj-BF16.gguf
c = 131072

[Dense-Vision-Ministral-3-14B-Reasoning-2512]
m = unsloth/Ministral-3-14B-Reasoning-2512-GGUF/Ministral-3-14B-Reasoning-2512-UD-Q8_K_XL.gguf
mm = unsloth/Ministral-3-14B-Reasoning-2512-GGUF/mmproj-BF16.gguf
c = 131072

[Dense-Uncensored-Dolphin-Mistral-24B-Venice-Edition]
m = bartowski/cognitivecomputations_Dolphin-Mistral-24B-Venice-Edition-GGUF/cognitivecomputations_Dolphin-Mistral-24B-Venice-Edition-Q8_0.gguf
c = 65536

[Dense-Uncensored-BlackSheep-24B]
m = mradermacher/BlackSheep-24B-i1-GGUF/BlackSheep-24B.Q8_0.gguf
c = 65536

[Dense-RP-Cydonia-24B-v4.1]
m = bartowski/TheDrummer_Cydonia-24B-v4.1-GGUF/TheDrummer_Cydonia-24B-v4.1-Q8_0.gguf
c = 65536

[Dense-Vision-Gemma-3-27B-IT]
m = unsloth/gemma-3-27b-it-GGUF/gemma-3-27b-it-Q6_K.gguf
mm = unsloth/gemma-3-27b-it-GGUF/mmproj-BF16.gguf
c = 131072

[Dense-RP-Big-Tiger-Gemma-27B-v3]
m = bartowski/TheDrummer_Big-Tiger-Gemma-27B-v3-GGUF/TheDrummer_Big-Tiger-Gemma-27B-v3-Q6_K.gguf
c = 131072

[MoE-GLM-4.5-Air-106B]
m = unsloth/GLM-4.5-Air-GGUF/GLM-4.5-Air-UD-Q5_K_XL-00001-of-00002.gguf
c = 131072

[MoE-INTELLECT-3-106B]
m = bartowski/PrimeIntellect_INTELLECT-3-GGUF/PrimeIntellect_INTELLECT-3-Q5_K_M-00001-of-00003.gguf
c = 131072

[MoE-Uncensored-GLM-4.5-Air-Derestricted-106B]
m = bartowski/ArliAI_GLM-4.5-Air-Derestricted-GGUF/ArliAI_GLM-4.5-Air-Derestricted-Q4_K_M-00001-of-00002.gguf
c = 131072

[MoE-Vision-GLM-4.6V-106B]
m = unsloth/GLM-4.6V-GGUF/GLM-4.6V-UD-Q5_K_XL-00001-of-00002.gguf
mm = unsloth/GLM-4.6V-GGUF/mmproj-BF16.gguf
c = 131072

[MoE-GLM-4.7-358B]
m = unsloth/GLM-4.7-GGUF/GLM-4.7-UD-Q3_K_XL-00001-of-00004.gguf
n-cpu-moe = 42            ; Keep the MoE expert tensors of the first 42 layers on the CPU
c = 32768

[MoE-Nemotron-3-Nano-30B-A3B]
m = unsloth/Nemotron-3-Nano-30B-A3B-GGUF/Nemotron-3-Nano-30B-A3B-UD-Q8_K_XL.gguf
temp = 0.6
top-p = 0.95
c = 131072

[MoE-Llama-4-Scout-17B-16E-Instruct-109B]
m = unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf
c = 131072

[Dense-Llama-3.3-70B-Instruct]
m = unsloth/Llama-3.3-70B-Instruct-GGUF/Llama-3.3-70B-Instruct-UD-Q6_K_XL-00001-of-00002.gguf
c = 131072

[Dense-Llama-3_3-Nemotron-Super-49B-v1_5]
m = unsloth/Llama-3_3-Nemotron-Super-49B-v1_5-GGUF/Llama-3_3-Nemotron-Super-49B-v1_5-UD-Q8_K_XL-00001-of-00002.gguf
c = 131072

[MoE-Ling-Flash-2.0-100B]
m = bartowski/inclusionAI_Ling-flash-2.0-GGUF/inclusionAI_Ling-flash-2.0-Q4_K_M-00001-of-00002.gguf
c = 131072

[MoE-MiniMax-M2.1-229B]
m = unsloth/MiniMax-M2.1-GGUF/MiniMax-M2.1-UD-Q2_K_XL-00001-of-00002.gguf
c = 98304

[Dense-Granite-4.0-H-Small]
m = unsloth/granite-4.0-h-small-GGUF/granite-4.0-h-small-UD-Q8_K_XL.gguf
c = 131072

[Dense-Command-A-Reasoning-08-2025-111B]
m = bartowski/CohereLabs_command-a-reasoning-08-2025-GGUF/CohereLabs_command-a-reasoning-08-2025-Q5_K_M-00001-of-00002.gguf
c = 262144
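
With the preset loaded, every section name above becomes a selectable model. A quick smoke test, assuming the multi-model router matches the OpenAI "model" field against these section names:

# List the models declared in backend.ini
curl -s http://127.0.0.1:8082/v1/models | jq '.data[].id'

# Stream a chat completion from one of them over SSE
curl -sN http://127.0.0.1:8082/v1/chat/completions \
    -H 'Content-Type: application/json' \
    -d '{"model": "MoE-GPT-OSS-20B", "stream": true,
         "messages": [{"role": "user", "content": "Hello"}]}'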

Podman Kubernetes manifest for LLM sandbox environment

apiVersion: v1
kind: Pod
metadata:
  name: pod
  labels:
    app: mcp-sandbox
spec:
  restartPolicy: Always

  containers:
  - name: container
    image: localhost/sandbox-image:clean

    ports:
    - containerPort: 22
      hostPort: 2222

    resources:
      limits:
        memory: 4Gi
        cpu: "8"

    volumeMounts:
    - name: inputs
      mountPath: /mnt/inputs
      readOnly: true
    - name: workspace
      mountPath: /mnt/workspace
    - name: outputs
      mountPath: /mnt/outputs

    env:
    - name: LANG
      value: C.UTF-8
    - name: LC_ALL
      value: C.UTF-8

    command:
    - /bin/bash
    - -c
    - |
      find /mnt/inputs -mindepth 1 -delete 2>/dev/null || true
      find /mnt/workspace -mindepth 1 -delete 2>/dev/null || true
      find /mnt/outputs -mindepth 1 -delete 2>/dev/null || true
      exec /usr/sbin/sshd -D

  volumes:
  - name: inputs
    hostPath:
      path: /var/www/ia/inputs
      type: Directory
  - name: workspace
    hostPath:
      path: /var/www/ia/workspace
      type: Directory
  - name: outputs
    hostPath:
      path: /var/www/ia/outputs
      type: Directory
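
Assuming the manifest is saved as sandbox.yaml (the file name referenced in the Containerfile comment below), bringing the sandbox up and entering it looks like:

# Start the pod from the manifest
podman kube play sandbox.yaml

# SSH into the sandbox through the forwarded port
# (the root password "sandbox" is set in the Containerfile)
ssh -p 2222 root@127.0.0.1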

Containerfile image template

Designed to be intuitive for all models in order to minimize agentic-loop calls; includes a full development toolchain for the major programming languages

# MCP Sandbox Clean Image
# Build: su - $HOST_USER -c 'podman build -t localhost/sandbox-image:clean -f Containerfile .'

FROM docker.io/library/debian:bookworm-slim

# Environment variables
ENV LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    PATH="/root/.cargo/bin:/usr/local/go/bin:${PATH}"

# Install base packages (exact order from sandbox.yaml)
RUN apt update && apt install -y \
    openssh-server \
    file \
    sudo \
    git \
    build-essential \
    cmake ccache \
    python3 python3-pip \
    curl wget \
    nano vim \
    zip unzip p7zip-full \
    tree jq \
    ripgrep \
    figlet

# Install Node.js 20.x
RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash -
RUN apt install -y nodejs

# Install Rust (stable)
RUN curl --proto "=https" --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y

# Install Go 1.25.4
RUN curl -sSL https://go.dev/dl/go1.25.4.linux-amd64.tar.gz | tar -C /usr/local -xz

# Clean the package cache, then refresh the indexes so apt install
# works immediately inside the running sandbox
RUN apt clean
RUN apt update

# Configure SSH
RUN echo "root:sandbox" | chpasswd \
    && sed -i "s/#PermitRootLogin.*/PermitRootLogin yes/" /etc/ssh/sshd_config \
    && mkdir -p /run/sshd

# Create mount points
RUN mkdir -p /mnt/inputs /mnt/workspace /mnt/outputs

# Working directory
WORKDIR /mnt/workspace

# Default command
CMD ["/usr/sbin/sshd", "-D"]
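
A quick way to confirm the toolchain landed where the PATH above expects it, sketched as a throwaway container run:

# Verify the dev toolchain inside a one-off container
podman run --rm localhost/sandbox-image:clean bash -c \
    'python3 --version && node --version && rustc --version && go version'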

Apache HTTP Server VirtualHost configuration for this dedicated LLM server

Security note: directories writable by users or models must, obviously, never allow code execution on the host system!

<VirtualHost *:80>
	# The ServerName directive sets the request scheme, hostname and port that
	# the server uses to identify itself. This is used when creating
	# redirection URLs. In the context of virtual hosts, the ServerName
	# specifies what hostname must appear in the request's Host: header to
	# match this virtual host. For the default virtual host (this file) this
	# value is not decisive as it is used as a last resort host regardless.
	# However, you must set it for any further virtual host explicitly.
	#ServerName www.example.com

	#ServerAdmin webmaster@localhost
	#DocumentRoot /var/www/html
	ServerAdmin admin@serveurperso.com
	DocumentRoot /var/www
	<Directory /var/www/>
		AllowOverride AuthConfig FileInfo Limit Options Indexes
	</Directory>

	# Available loglevels: trace8, ..., trace1, debug, info, notice, warn,
	# error, crit, alert, emerg.
	# It is also possible to configure the loglevel for particular
	# modules, e.g.
	#LogLevel info ssl:warn

	ErrorLog ${APACHE_LOG_DIR}/error.log
	CustomLog ${APACHE_LOG_DIR}/access.log combined

	# For most configuration files from conf-available/, which are
	# enabled or disabled at a global level, it is possible to
	# include a line for only one particular virtual host. For example the
	# following line enables the CGI configuration for this host only
	# after it has been globally disabled with "a2disconf".
	#Include conf-available/serve-cgi-bin.conf

	# Llama.cpp llama-server (for OpenAI and Anthropic clients)
	<Location /ia/webui/v1>
		ProxyPass "http://127.0.0.1:8082/v1"
		ProxyPassReverse "http://127.0.0.1:8082/v1"
	</Location>

	# Llama.cpp llama-server (for Svelte)
	<Location /ia/webui/props>
		ProxyPass "http://127.0.0.1:8082/props"
		ProxyPassReverse "http://127.0.0.1:8082/props"
	</Location>

	<Location /ia/webui/slots>
		ProxyPass "http://127.0.0.1:8082/slots"
		ProxyPassReverse "http://127.0.0.1:8082/slots"
	</Location>

	<Location /ia/webui/models>
		ProxyPass "http://127.0.0.1:8082/models"
		ProxyPassReverse "http://127.0.0.1:8082/models"
	</Location>

	# Backend MCP bridge (for non-MCP-aware OpenAI clients and bots) -> to llama-server
	<Location /ia/v1>
		ProxyPass "http://127.0.0.1:8080/v1"
		ProxyPassReverse "http://127.0.0.1:8080/v1"
	</Location>

	# Hugging Face chat-ui
	<Location /ia/chatui>
		ProxyPass "http://127.0.0.1:3000/ia/chatui"
		ProxyPassReverse "http://127.0.0.1:3000/ia/chatui"
	</Location>

	# GGUF viewer backend (in development)
	<Location /ia/gguf/api>
		ProxyPass "http://127.0.0.1:8090/api"
		ProxyPassReverse "http://127.0.0.1:8090/api"
	</Location>

	# MCP servers -> to LLM sandbox
	<Location /ia/mcp-streamable-http>
		ProxyPass "http://127.0.0.1:8083"
		ProxyPassReverse "http://127.0.0.1:8083"
	</Location>

	<Location /ia/mcp-websocket>
		ProxyPass "ws://127.0.0.1:8084"
		ProxyPassReverse "ws://127.0.0.1:8084"
	</Location>

	# Sandbox mount points security
	<DirectoryMatch "^/var/www/ia/(inputs|workspace|outputs)">
		php_admin_flag engine off
		Options -ExecCGI -Includes
		RemoveHandler *
		RemoveType *
		ForceType application/octet-stream
		AllowOverride None
		Require all granted
	</DirectoryMatch>
</VirtualHost>
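
The virtual host above relies on mod_proxy with mod_proxy_http for the HTTP locations and mod_proxy_wstunnel for the WebSocket one (php_admin_flag additionally assumes mod_php is loaded); on Debian they would be enabled with:

a2enmod proxy proxy_http proxy_wstunnel
systemctl reload apache2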