feat(prod): add production infrastructure management

Add `core prod` command with full production infrastructure tooling:

- `core prod status` — parallel SSH health checks across all hosts,
  Galera cluster state, Redis Sentinel, Docker, and LB health
- `core prod setup` — Phase 1 foundation: Hetzner topology discovery,
  managed LB creation, CloudNS DNS record management
- `core prod dns` — CloudNS record CRUD with idempotent EnsureRecord
- `core prod lb` — Hetzner Cloud LB status and creation
- `core prod ssh <host>` — SSH into hosts defined in infra.yaml (example invocations below)
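
Example invocations (flags and values as defined by the commands in this
commit):

    core prod status
    core prod setup --dry-run --step=lb
    core prod dns set hermes.lb A 1.2.3.4 --ttl 300
    core prod ssh de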

New packages:
- pkg/infra: config parsing, Hetzner Cloud/Robot API, CloudNS DNS API
- infra.yaml: declarative production topology (hosts, LB, DNS, SSL,
  Galera, Redis, containers, S3, CDN, CI/CD, monitoring, backups)

Docker:
- Dockerfile.app (PHP 8.3-FPM, multi-stage)
- Dockerfile.web (Nginx + security headers)
- docker-compose.prod.yml (app, web, horizon, scheduler, mcp, redis, galera)

Ansible playbooks (runnable via `core deploy ansible`):
- galera-deploy.yml, redis-deploy.yml, galera-backup.yml
- inventory.yml with all production hosts

CI/CD:
- .forgejo/workflows/deploy.yml for Forgejo Actions pipeline

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Commit 349e8daa0b (parent 00c011bd39), Snider, 2026-02-06 03:03:29 +00:00
25 changed files with 3150 additions and 0 deletions

.forgejo/workflows/deploy.yml Normal file
@@ -0,0 +1,146 @@
# Host UK Production Deployment Pipeline
# Runs on Forgejo Actions (gitea.snider.dev)
# Runner: build.de.host.uk.com
#
# Workflow:
# 1. composer install + test
# 2. npm ci + build
# 3. docker build + push
# 4. Coolify deploy webhook (rolling restart)
name: Deploy
on:
push:
branches: [main]
workflow_dispatch:
env:
REGISTRY: gitea.snider.dev
IMAGE_APP: host-uk/app
IMAGE_WEB: host-uk/web
IMAGE_CORE: host-uk/core
jobs:
test:
name: Test
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Setup PHP
uses: shivammathur/setup-php@v2
with:
php-version: "8.3"
extensions: bcmath, gd, intl, mbstring, pdo_mysql, redis, zip
coverage: none
- name: Install Composer dependencies
run: composer install --no-interaction --prefer-dist
- name: Run tests
run: composer test
- name: Check code style
run: ./vendor/bin/pint --test
build-app:
name: Build App Image
needs: test
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
cache: "npm"
- name: Login to registry
run: echo "${{ secrets.REGISTRY_TOKEN }}" | docker login ${{ env.REGISTRY }} -u ${{ secrets.REGISTRY_USER }} --password-stdin
- name: Build and push app image
run: |
SHA=$(git rev-parse --short HEAD)
docker build \
-f docker/Dockerfile.app \
-t ${{ env.REGISTRY }}/${{ env.IMAGE_APP }}:${SHA} \
-t ${{ env.REGISTRY }}/${{ env.IMAGE_APP }}:latest \
.
docker push ${{ env.REGISTRY }}/${{ env.IMAGE_APP }}:${SHA}
docker push ${{ env.REGISTRY }}/${{ env.IMAGE_APP }}:latest
build-web:
name: Build Web Image
needs: test
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Login to registry
run: echo "${{ secrets.REGISTRY_TOKEN }}" | docker login ${{ env.REGISTRY }} -u ${{ secrets.REGISTRY_USER }} --password-stdin
- name: Build and push web image
run: |
SHA=$(git rev-parse --short HEAD)
docker build \
-f docker/Dockerfile.web \
-t ${{ env.REGISTRY }}/${{ env.IMAGE_WEB }}:${SHA} \
-t ${{ env.REGISTRY }}/${{ env.IMAGE_WEB }}:latest \
.
docker push ${{ env.REGISTRY }}/${{ env.IMAGE_WEB }}:${SHA}
docker push ${{ env.REGISTRY }}/${{ env.IMAGE_WEB }}:latest
build-core:
name: Build Core Image
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Setup Go
uses: actions/setup-go@v5
with:
go-version: "1.25"
- name: Build core binary
run: |
go build -ldflags '-s -w' -o bin/core .
- name: Login to registry
run: echo "${{ secrets.REGISTRY_TOKEN }}" | docker login ${{ env.REGISTRY }} -u ${{ secrets.REGISTRY_USER }} --password-stdin
- name: Build and push core image
run: |
SHA=$(git rev-parse --short HEAD)
cat > Dockerfile.core <<'EOF'
FROM alpine:3.20
RUN apk add --no-cache ca-certificates
COPY bin/core /usr/local/bin/core
ENTRYPOINT ["core"]
EOF
docker build \
-f Dockerfile.core \
-t ${{ env.REGISTRY }}/${{ env.IMAGE_CORE }}:${SHA} \
-t ${{ env.REGISTRY }}/${{ env.IMAGE_CORE }}:latest \
.
docker push ${{ env.REGISTRY }}/${{ env.IMAGE_CORE }}:${SHA}
docker push ${{ env.REGISTRY }}/${{ env.IMAGE_CORE }}:latest
deploy:
name: Deploy to Production
needs: [build-app, build-web, build-core]
runs-on: ubuntu-latest
steps:
- name: Trigger Coolify deploy
run: |
curl -s -X POST \
-H "Authorization: Bearer ${{ secrets.COOLIFY_TOKEN }}" \
"${{ secrets.COOLIFY_URL }}/api/v1/deploy" \
-H "Content-Type: application/json" \
-d '{"uuid": "${{ secrets.COOLIFY_APP_UUID }}", "force": false}'
- name: Wait for deployment
run: |
echo "Deployment triggered. Coolify will perform rolling restart."
echo "Monitor at: ${{ secrets.COOLIFY_URL }}"

docker/Dockerfile.app Normal file
@@ -0,0 +1,107 @@
# Host UK — Laravel Application Container
# PHP 8.3-FPM with all extensions required by the federated monorepo
#
# Build: docker build -f docker/Dockerfile.app -t host-uk/app:latest .
# (run from the host-uk/ workspace root, not core/)
FROM php:8.3-fpm-alpine AS base
# System dependencies
RUN apk add --no-cache \
git \
curl \
libpng-dev \
libjpeg-turbo-dev \
freetype-dev \
libwebp-dev \
libzip-dev \
icu-dev \
oniguruma-dev \
libxml2-dev \
linux-headers \
$PHPIZE_DEPS
# PHP extensions
RUN docker-php-ext-configure gd \
--with-freetype \
--with-jpeg \
--with-webp \
&& docker-php-ext-install -j$(nproc) \
bcmath \
exif \
gd \
intl \
mbstring \
opcache \
pcntl \
pdo_mysql \
soap \
xml \
zip
# Redis extension
RUN pecl install redis && docker-php-ext-enable redis
# Composer
COPY --from=composer:2 /usr/bin/composer /usr/bin/composer
# PHP configuration
RUN mv "$PHP_INI_DIR/php.ini-production" "$PHP_INI_DIR/php.ini"
COPY docker/php/opcache.ini $PHP_INI_DIR/conf.d/opcache.ini
COPY docker/php/php-fpm.conf /usr/local/etc/php-fpm.d/zz-host-uk.conf
# --- Build stage ---
FROM base AS build
WORKDIR /app
# Install dependencies first (cache layer)
COPY composer.json composer.lock ./
RUN composer install \
--no-dev \
--no-scripts \
--no-autoloader \
--prefer-dist \
--no-interaction
# Copy application
COPY . .
# Generate autoloader and run post-install
RUN composer dump-autoload --optimize --no-dev \
&& php artisan package:discover --ansi
# Build frontend assets
RUN if [ -f package.json ]; then \
apk add --no-cache nodejs npm && \
npm ci --production=false && \
npm run build && \
rm -rf node_modules; \
fi
# --- Production stage ---
FROM base AS production
WORKDIR /app
# Copy built application
COPY --from=build /app /app
# Create storage directories
RUN mkdir -p \
storage/framework/cache/data \
storage/framework/sessions \
storage/framework/views \
storage/logs \
bootstrap/cache
# Permissions
RUN chown -R www-data:www-data storage bootstrap/cache
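# The HEALTHCHECK below relies on the php-fpm-healthcheck helper, which the
# base image does not include. A sketch of one way to install it (assumes
# the renatomefi/php-fpm-healthcheck script and its fcgi dependency;
# FCGI_STATUS_PATH matches pm.status_path in docker/php/php-fpm.conf):
RUN apk add --no-cache fcgi \
    && curl -fsSL -o /usr/local/bin/php-fpm-healthcheck \
        https://raw.githubusercontent.com/renatomefi/php-fpm-healthcheck/master/php-fpm-healthcheck \
    && chmod +x /usr/local/bin/php-fpm-healthcheck
ENV FCGI_STATUS_PATH=/fpm-status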
# Health check
HEALTHCHECK --interval=30s --timeout=3s --start-period=10s --retries=3 \
CMD php-fpm-healthcheck || exit 1
USER www-data
EXPOSE 9000

docker/Dockerfile.web Normal file
@@ -0,0 +1,19 @@
# Host UK — Nginx Web Server
# Serves static files and proxies PHP to FPM container
#
# Build: docker build -f docker/Dockerfile.web -t host-uk/web:latest .
FROM nginx:1.27-alpine
# Copy nginx configuration
COPY docker/nginx/default.conf /etc/nginx/conf.d/default.conf
COPY docker/nginx/security-headers.conf /etc/nginx/snippets/security-headers.conf
# Copy static assets from app build
# (In production, these are volume-mounted from the app container)
# COPY --from=host-uk/app:latest /app/public /app/public
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
CMD wget -qO- http://localhost/health || exit 1
EXPOSE 80

docker-compose.prod.yml Normal file
@@ -0,0 +1,200 @@
# Host UK Production Docker Compose
# Deployed to de.host.uk.com and de2.host.uk.com via Coolify
#
# Container topology per app server:
# app - PHP 8.3-FPM (all Laravel modules)
# web - Nginx (static files + FastCGI proxy)
# horizon - Laravel Horizon (queue worker)
# scheduler - Laravel scheduler
# mcp - Go MCP server
# redis - Redis 7 (local cache + sessions)
# galera - MariaDB 11 (Galera cluster node)
services:
app:
image: ${REGISTRY:-gitea.snider.dev}/host-uk/app:${TAG:-latest}
restart: unless-stopped
volumes:
- app-storage:/app/storage
environment:
- APP_ENV=production
- APP_DEBUG=false
- APP_URL=${APP_URL:-https://host.uk.com}
- DB_HOST=galera
- DB_PORT=3306
- DB_DATABASE=${DB_DATABASE:-hostuk}
- DB_USERNAME=${DB_USERNAME:-hostuk}
- DB_PASSWORD=${DB_PASSWORD}
- REDIS_HOST=redis
- REDIS_PORT=6379
- CACHE_DRIVER=redis
- SESSION_DRIVER=redis
- QUEUE_CONNECTION=redis
depends_on:
redis:
condition: service_healthy
galera:
condition: service_healthy
healthcheck:
test: ["CMD-SHELL", "php-fpm-healthcheck || exit 1"]
interval: 30s
timeout: 3s
start_period: 10s
retries: 3
networks:
- app-net
web:
image: ${REGISTRY:-gitea.snider.dev}/host-uk/web:${TAG:-latest}
restart: unless-stopped
ports:
- "${WEB_PORT:-80}:80"
volumes:
- app-storage:/app/storage:ro
depends_on:
app:
condition: service_healthy
healthcheck:
test: ["CMD", "wget", "-qO-", "http://localhost/health"]
interval: 30s
timeout: 3s
start_period: 5s
retries: 3
networks:
- app-net
horizon:
image: ${REGISTRY:-gitea.snider.dev}/host-uk/app:${TAG:-latest}
restart: unless-stopped
command: php artisan horizon
volumes:
- app-storage:/app/storage
environment:
- APP_ENV=production
- DB_HOST=galera
- DB_PORT=3306
- DB_DATABASE=${DB_DATABASE:-hostuk}
- DB_USERNAME=${DB_USERNAME:-hostuk}
- DB_PASSWORD=${DB_PASSWORD}
- REDIS_HOST=redis
- REDIS_PORT=6379
depends_on:
app:
condition: service_healthy
healthcheck:
test: ["CMD-SHELL", "php artisan horizon:status | grep -q running"]
interval: 60s
timeout: 5s
start_period: 30s
retries: 3
networks:
- app-net
scheduler:
image: ${REGISTRY:-gitea.snider.dev}/host-uk/app:${TAG:-latest}
restart: unless-stopped
command: php artisan schedule:work
volumes:
- app-storage:/app/storage
environment:
- APP_ENV=production
- DB_HOST=galera
- DB_PORT=3306
- DB_DATABASE=${DB_DATABASE:-hostuk}
- DB_USERNAME=${DB_USERNAME:-hostuk}
- DB_PASSWORD=${DB_PASSWORD}
- REDIS_HOST=redis
- REDIS_PORT=6379
depends_on:
app:
condition: service_healthy
networks:
- app-net
mcp:
image: ${REGISTRY:-gitea.snider.dev}/host-uk/core:${TAG:-latest}
restart: unless-stopped
command: core mcp serve
ports:
- "${MCP_PORT:-9001}:9000"
environment:
- MCP_ADDR=:9000
healthcheck:
test: ["CMD-SHELL", "nc -z localhost 9000 || exit 1"]
interval: 30s
timeout: 3s
retries: 3
networks:
- app-net
redis:
image: redis:7-alpine
restart: unless-stopped
command: >
redis-server
--maxmemory 512mb
--maxmemory-policy allkeys-lru
--appendonly yes
--appendfsync everysec
volumes:
- redis-data:/data
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 10s
timeout: 3s
retries: 5
networks:
- app-net
galera:
image: mariadb:11
restart: unless-stopped
environment:
- MARIADB_ROOT_PASSWORD=${DB_ROOT_PASSWORD}
- MARIADB_DATABASE=${DB_DATABASE:-hostuk}
- MARIADB_USER=${DB_USERNAME:-hostuk}
- MARIADB_PASSWORD=${DB_PASSWORD}
- WSREP_CLUSTER_NAME=hostuk-galera
- WSREP_CLUSTER_ADDRESS=${GALERA_CLUSTER_ADDRESS:-gcomm://}
- WSREP_NODE_ADDRESS=${GALERA_NODE_ADDRESS}
- WSREP_NODE_NAME=${GALERA_NODE_NAME}
- WSREP_SST_METHOD=mariabackup
command: >
--wsrep-on=ON
--wsrep-provider=/usr/lib/galera/libgalera_smm.so
--wsrep-cluster-name=hostuk-galera
--wsrep-cluster-address=${GALERA_CLUSTER_ADDRESS:-gcomm://}
--wsrep-node-address=${GALERA_NODE_ADDRESS}
--wsrep-node-name=${GALERA_NODE_NAME}
--wsrep-sst-method=mariabackup
--binlog-format=ROW
--default-storage-engine=InnoDB
--innodb-autoinc-lock-mode=2
--innodb-buffer-pool-size=1G
--innodb-log-file-size=256M
--character-set-server=utf8mb4
--collation-server=utf8mb4_unicode_ci
volumes:
- galera-data:/var/lib/mysql
ports:
- "${GALERA_PORT:-3306}:3306"
- "4567:4567"
- "4568:4568"
- "4444:4444"
healthcheck:
test: ["CMD-SHELL", "mariadb -u root -p${DB_ROOT_PASSWORD} -e 'SHOW STATUS LIKE \"wsrep_ready\"' | grep -q ON"]
interval: 30s
timeout: 10s
start_period: 60s
retries: 5
networks:
- app-net
volumes:
app-storage:
redis-data:
galera-data:
networks:
app-net:
driver: bridge

docker/nginx/default.conf Normal file
@@ -0,0 +1,59 @@
# Host UK Nginx Configuration
# Proxies PHP to the app (FPM) container, serves static files directly
server {
listen 80;
server_name _;
root /app/public;
index index.php;
charset utf-8;
# Security headers
include /etc/nginx/snippets/security-headers.conf;
# Health check endpoint (no logging)
location = /health {
access_log off;
try_files $uri /index.php?$query_string;
}
# Static file caching
location ~* \.(css|js|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot|webp|avif)$ {
expires 1y;
add_header Cache-Control "public, immutable";
access_log off;
try_files $uri =404;
}
# Laravel application
location / {
try_files $uri $uri/ /index.php?$query_string;
}
# PHP-FPM upstream
location ~ \.php$ {
fastcgi_pass app:9000;
fastcgi_param SCRIPT_FILENAME $realpath_root$fastcgi_script_name;
include fastcgi_params;
fastcgi_hide_header X-Powered-By;
fastcgi_buffer_size 32k;
fastcgi_buffers 16 16k;
fastcgi_read_timeout 300;
# Pass the client IP forwarded by the LB (X-Forwarded-For header). Note:
# if the LB sends PROXY protocol (proxy_protocol: true in infra.yaml), the
# listener must use "listen 80 proxy_protocol;" and $proxy_protocol_addr
# would carry the client IP instead.
fastcgi_param REMOTE_ADDR $http_x_forwarded_for;
}
# Block dotfiles (except .well-known)
location ~ /\.(?!well-known) {
deny all;
}
# Block access to sensitive files
location ~* \.(env|log|yaml|yml|toml|lock|bak|sql)$ {
deny all;
}
}

docker/nginx/security-headers.conf Normal file
@@ -0,0 +1,6 @@
# Security headers for Host UK
add_header X-Frame-Options "SAMEORIGIN" always;
add_header X-Content-Type-Options "nosniff" always;
add_header X-XSS-Protection "1; mode=block" always;
add_header Referrer-Policy "strict-origin-when-cross-origin" always;
add_header Permissions-Policy "camera=(), microphone=(), geolocation=(), payment=()" always;

docker/php/opcache.ini Normal file
@@ -0,0 +1,10 @@
; OPcache configuration for production
opcache.enable=1
opcache.memory_consumption=256
opcache.interned_strings_buffer=16
opcache.max_accelerated_files=20000
opcache.validate_timestamps=0
opcache.save_comments=1
; opcache.fast_shutdown was removed in PHP 7.2, so it is omitted here
opcache.jit_buffer_size=128M
opcache.jit=1255

docker/php/php-fpm.conf Normal file
@@ -0,0 +1,22 @@
; Host UK PHP-FPM pool configuration
[www]
pm = dynamic
pm.max_children = 50
pm.start_servers = 10
pm.min_spare_servers = 5
pm.max_spare_servers = 20
pm.max_requests = 1000
pm.process_idle_timeout = 10s
; Status page for health checks
pm.status_path = /fpm-status
ping.path = /fpm-ping
ping.response = pong
; Logging
access.log = /proc/self/fd/2
slowlog = /proc/self/fd/2
request_slowlog_timeout = 5s
; Security
security.limit_extensions = .php

infra.yaml Normal file
@@ -0,0 +1,268 @@
# Infrastructure Configuration — Host UK Production
# This file is the source of truth for production topology.
# Used by: core prod status, core prod setup, core deploy ansible
# --- Hosts ---
hosts:
noc:
fqdn: noc.host.uk.com
ip: 77.42.42.205
private_ip: 10.0.0.4
type: hcloud
role: bastion
ssh:
user: root
key: ~/.ssh/hostuk
port: 22
services:
- coolify
de:
fqdn: de.host.uk.com
ip: 116.202.82.115
type: hrobot
role: app
ssh:
user: root
key: ~/.ssh/hostuk
port: 22
services:
- traefik
- app
- web
- horizon
- scheduler
- mcp
- redis
- galera
de2:
fqdn: de2.host.uk.com
ip: 88.99.195.41
type: hrobot
role: app
ssh:
user: root
key: ~/.ssh/hostuk
port: 22
services:
- traefik
- app
- web
- horizon
- scheduler
- mcp
- redis
- galera
build:
fqdn: build.de.host.uk.com
ip: 46.224.93.62
private_ip: 10.0.0.5
type: hcloud
role: builder
ssh:
user: root
key: ~/.ssh/hostuk
port: 22
services:
- forgejo-runner
# --- Load Balancer ---
load_balancer:
name: hermes
fqdn: hermes.lb.host.uk.com
provider: hetzner
type: lb11
location: fsn1
algorithm: round_robin
backends:
- host: de
port: 80
- host: de2
port: 80
health_check:
protocol: http
path: /health
interval: 15
listeners:
- frontend: 443
backend: 80
protocol: https
proxy_protocol: true
ssl:
certificate: "*.host.uk.com"
san:
- host.uk.com
# --- Private Network ---
network:
cidr: 10.0.0.0/16
name: host-uk-internal
# --- DNS ---
dns:
provider: cloudns
nameservers:
- ns1.lthn.io
- ns2.lthn.io
- ns3.lthn.io
- ns4.lthn.io
zones:
host.uk.com:
records:
- name: "@"
type: A
value: "{{.lb_ip}}"
ttl: 300
- name: "*"
type: CNAME
value: hermes.lb.host.uk.com
ttl: 300
- name: hermes.lb
type: A
value: "{{.lb_ip}}"
ttl: 300
- name: noc
type: A
value: 77.42.42.205
ttl: 300
- name: de
type: A
value: 116.202.82.115
ttl: 300
- name: de2
type: A
value: 88.99.195.41
ttl: 300
- name: build.de
type: A
value: 46.224.93.62
ttl: 300
# --- SSL ---
ssl:
wildcard:
domains:
- "*.host.uk.com"
- host.uk.com
method: dns-01
dns_provider: cloudns
termination: load_balancer
# --- Database ---
database:
engine: mariadb
version: "11"
cluster: galera
nodes:
- host: de
port: 3306
- host: de2
port: 3306
sst_method: mariabackup
backup:
schedule: "0 3 * * *"
destination: s3
bucket: hostuk
prefix: backup/galera/
# --- Cache ---
cache:
engine: redis
version: "7"
sentinel: true
nodes:
- host: de
port: 6379
- host: de2
port: 6379
# --- Containers (per app server) ---
containers:
app:
image: host-uk/app:latest
port: 9000
runtime: php-fpm
replicas: 1
web:
image: host-uk/web:latest
port: 80
runtime: nginx
depends_on: [app]
horizon:
image: host-uk/app:latest
command: php artisan horizon
replicas: 1
scheduler:
image: host-uk/app:latest
command: php artisan schedule:work
replicas: 1
mcp:
image: host-uk/core:latest
port: 9000
command: core mcp serve
replicas: 1
# --- Object Storage ---
s3:
endpoint: fsn1.your-objectstorage.com
buckets:
hostuk:
purpose: infra
paths:
- backup/galera/
- backup/coolify/
- backup/certs/
host-uk:
purpose: media
paths:
- uploads/
- assets/
# --- CDN ---
cdn:
provider: bunnycdn
origin: hermes.lb.host.uk.com
zones:
- "*.host.uk.com"
# --- CI/CD ---
cicd:
provider: forgejo
url: https://gitea.snider.dev
runner: build.de
registry: gitea.snider.dev
deploy_hook: coolify
# --- Monitoring ---
monitoring:
health_endpoints:
- url: https://host.uk.com/health
interval: 60
- url: https://bio.host.uk.com/health
interval: 60
alerts:
galera_cluster_size: 2
redis_sentinel_quorum: 2
# --- Backups ---
backups:
daily:
- name: galera
type: mysqldump
destination: s3://hostuk/backup/galera/
- name: coolify
type: tar
destination: s3://hostuk/backup/coolify/
- name: certs
type: tar
destination: s3://hostuk/backup/certs/
weekly:
- name: snapshot
type: hcloud-snapshot
hosts: [noc, build]

@@ -0,0 +1,15 @@
package prod
import (
"github.com/host-uk/core/pkg/cli"
"github.com/spf13/cobra"
)
func init() {
cli.RegisterCommands(AddProdCommands)
}
// AddProdCommands registers the 'prod' command and all subcommands.
func AddProdCommands(root *cobra.Command) {
root.AddCommand(Cmd)
}

@@ -0,0 +1,129 @@
package prod
import (
"context"
"fmt"
"os"
"time"
"github.com/host-uk/core/pkg/cli"
"github.com/host-uk/core/pkg/infra"
"github.com/spf13/cobra"
)
var dnsCmd = &cobra.Command{
Use: "dns",
Short: "Manage DNS records via CloudNS",
Long: `View and manage DNS records for host.uk.com via CloudNS API.
Requires:
CLOUDNS_AUTH_ID CloudNS auth ID
CLOUDNS_AUTH_PASSWORD CloudNS auth password`,
}
var dnsListCmd = &cobra.Command{
Use: "list [zone]",
Short: "List DNS records",
Args: cobra.MaximumNArgs(1),
RunE: runDNSList,
}
var dnsSetCmd = &cobra.Command{
Use: "set <host> <type> <value>",
Short: "Create or update a DNS record",
Long: `Create or update a DNS record. Example:
core prod dns set hermes.lb A 1.2.3.4
core prod dns set "*.host.uk.com" CNAME hermes.lb.host.uk.com`,
Args: cobra.ExactArgs(3),
RunE: runDNSSet,
}
var (
dnsZone string
dnsTTL int
)
func init() {
dnsCmd.PersistentFlags().StringVar(&dnsZone, "zone", "host.uk.com", "DNS zone")
dnsSetCmd.Flags().IntVar(&dnsTTL, "ttl", 300, "Record TTL in seconds")
dnsCmd.AddCommand(dnsListCmd)
dnsCmd.AddCommand(dnsSetCmd)
}
func getDNSClient() (*infra.CloudNSClient, error) {
authID := os.Getenv("CLOUDNS_AUTH_ID")
authPass := os.Getenv("CLOUDNS_AUTH_PASSWORD")
if authID == "" || authPass == "" {
return nil, fmt.Errorf("CLOUDNS_AUTH_ID and CLOUDNS_AUTH_PASSWORD required")
}
return infra.NewCloudNSClient(authID, authPass), nil
}
func runDNSList(cmd *cobra.Command, args []string) error {
dns, err := getDNSClient()
if err != nil {
return err
}
zone := dnsZone
if len(args) > 0 {
zone = args[0]
}
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
records, err := dns.ListRecords(ctx, zone)
if err != nil {
return fmt.Errorf("list records: %w", err)
}
cli.Print("%s DNS records for %s\n\n", cli.BoldStyle.Render("▶"), cli.TitleStyle.Render(zone))
if len(records) == 0 {
cli.Print(" No records found\n")
return nil
}
for id, r := range records {
cli.Print(" %s %-6s %-30s %s TTL:%s\n",
cli.DimStyle.Render(id),
cli.BoldStyle.Render(r.Type),
r.Host,
r.Record,
r.TTL)
}
return nil
}
func runDNSSet(cmd *cobra.Command, args []string) error {
dns, err := getDNSClient()
if err != nil {
return err
}
host := args[0]
recordType := args[1]
value := args[2]
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
changed, err := dns.EnsureRecord(ctx, dnsZone, host, recordType, value, dnsTTL)
if err != nil {
return fmt.Errorf("set record: %w", err)
}
if changed {
cli.Print("%s %s %s %s -> %s\n",
cli.SuccessStyle.Render("✓"),
recordType, host, dnsZone, value)
} else {
cli.Print("%s Record already correct\n", cli.DimStyle.Render("·"))
}
return nil
}

internal/cmd/prod/cmd_lb.go Normal file
@@ -0,0 +1,113 @@
package prod
import (
"context"
"fmt"
"os"
"time"
"github.com/host-uk/core/pkg/cli"
"github.com/host-uk/core/pkg/infra"
"github.com/spf13/cobra"
)
var lbCmd = &cobra.Command{
Use: "lb",
Short: "Manage Hetzner load balancer",
Long: `View and manage the Hetzner Cloud managed load balancer.
Requires: HCLOUD_TOKEN`,
}
var lbStatusCmd = &cobra.Command{
Use: "status",
Short: "Show load balancer status and target health",
RunE: runLBStatus,
}
var lbCreateCmd = &cobra.Command{
Use: "create",
Short: "Create load balancer from infra.yaml",
RunE: runLBCreate,
}
func init() {
lbCmd.AddCommand(lbStatusCmd)
lbCmd.AddCommand(lbCreateCmd)
}
func getHCloudClient() (*infra.HCloudClient, error) {
token := os.Getenv("HCLOUD_TOKEN")
if token == "" {
return nil, fmt.Errorf("HCLOUD_TOKEN environment variable required")
}
return infra.NewHCloudClient(token), nil
}
func runLBStatus(cmd *cobra.Command, args []string) error {
hc, err := getHCloudClient()
if err != nil {
return err
}
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
lbs, err := hc.ListLoadBalancers(ctx)
if err != nil {
return fmt.Errorf("list load balancers: %w", err)
}
if len(lbs) == 0 {
cli.Print("No load balancers found\n")
return nil
}
for _, lb := range lbs {
cli.Print("%s %s\n", cli.BoldStyle.Render("▶"), cli.TitleStyle.Render(lb.Name))
cli.Print(" ID: %d\n", lb.ID)
cli.Print(" IP: %s\n", lb.PublicNet.IPv4.IP)
cli.Print(" Algorithm: %s\n", lb.Algorithm.Type)
cli.Print(" Location: %s\n", lb.Location.Name)
if len(lb.Services) > 0 {
cli.Print("\n Services:\n")
for _, s := range lb.Services {
cli.Print(" %s :%d -> :%d proxy_protocol=%v\n",
s.Protocol, s.ListenPort, s.DestinationPort, s.Proxyprotocol)
}
}
if len(lb.Targets) > 0 {
cli.Print("\n Targets:\n")
for _, t := range lb.Targets {
ip := ""
if t.IP != nil {
ip = t.IP.IP
}
for _, hs := range t.HealthStatus {
icon := cli.SuccessStyle.Render("●")
if hs.Status != "healthy" {
icon = cli.ErrorStyle.Render("○")
}
cli.Print(" %s %s :%d %s\n", icon, ip, hs.ListenPort, hs.Status)
}
}
}
fmt.Println()
}
return nil
}
func runLBCreate(cmd *cobra.Command, args []string) error {
cfg, _, err := loadConfig()
if err != nil {
return err
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
return stepLoadBalancer(ctx, cfg)
}

@@ -0,0 +1,35 @@
package prod
import (
"github.com/spf13/cobra"
)
var (
infraFile string
)
// Cmd is the root prod command.
var Cmd = &cobra.Command{
Use: "prod",
Short: "Production infrastructure management",
Long: `Manage the Host UK production infrastructure.
Commands:
status Show infrastructure health and connectivity
setup Phase 1: discover topology, create LB, configure DNS
dns Manage DNS records via CloudNS
lb Manage Hetzner load balancer
ssh SSH into a production host
Configuration is read from infra.yaml in the project root.`,
}
func init() {
Cmd.PersistentFlags().StringVar(&infraFile, "config", "", "Path to infra.yaml (auto-discovered if not set)")
Cmd.AddCommand(statusCmd)
Cmd.AddCommand(setupCmd)
Cmd.AddCommand(dnsCmd)
Cmd.AddCommand(lbCmd)
Cmd.AddCommand(sshCmd)
}

@@ -0,0 +1,284 @@
package prod
import (
"context"
"fmt"
"os"
"time"
"github.com/host-uk/core/pkg/cli"
"github.com/host-uk/core/pkg/infra"
"github.com/spf13/cobra"
)
var setupCmd = &cobra.Command{
Use: "setup",
Short: "Phase 1: discover topology, create LB, configure DNS",
Long: `Run the Phase 1 foundation setup:
1. Discover Hetzner topology (Cloud + Robot servers)
2. Create Hetzner managed load balancer
3. Configure DNS records via CloudNS
4. Verify connectivity to all hosts
Required environment variables:
HCLOUD_TOKEN Hetzner Cloud API token
HETZNER_ROBOT_USER Hetzner Robot username
HETZNER_ROBOT_PASS Hetzner Robot password
CLOUDNS_AUTH_ID CloudNS auth ID
CLOUDNS_AUTH_PASSWORD CloudNS auth password`,
RunE: runSetup,
}
var (
setupDryRun bool
setupStep string
)
func init() {
setupCmd.Flags().BoolVar(&setupDryRun, "dry-run", false, "Show what would be done without making changes")
setupCmd.Flags().StringVar(&setupStep, "step", "", "Run a specific step only (discover, lb, dns)")
}
func runSetup(cmd *cobra.Command, args []string) error {
cfg, cfgPath, err := loadConfig()
if err != nil {
return err
}
cli.Print("%s Production setup from %s\n\n",
cli.BoldStyle.Render("▶"),
cli.DimStyle.Render(cfgPath))
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
steps := []struct {
name string
fn func(context.Context, *infra.Config) error
}{
{"discover", stepDiscover},
{"lb", stepLoadBalancer},
{"dns", stepDNS},
}
for _, step := range steps {
if setupStep != "" && setupStep != step.name {
continue
}
cli.Print("\n%s Step: %s\n", cli.BoldStyle.Render("━━"), cli.TitleStyle.Render(step.name))
if err := step.fn(ctx, cfg); err != nil {
cli.Print(" %s %s: %s\n", cli.ErrorStyle.Render("✗"), step.name, err)
return fmt.Errorf("step %s failed: %w", step.name, err)
}
cli.Print(" %s %s complete\n", cli.SuccessStyle.Render("✓"), step.name)
}
cli.Print("\n%s Setup complete\n", cli.SuccessStyle.Render("✓"))
return nil
}
func stepDiscover(ctx context.Context, cfg *infra.Config) error {
// Discover HCloud servers
hcloudToken := os.Getenv("HCLOUD_TOKEN")
if hcloudToken != "" {
cli.Print(" Discovering Hetzner Cloud servers...\n")
hc := infra.NewHCloudClient(hcloudToken)
servers, err := hc.ListServers(ctx)
if err != nil {
return fmt.Errorf("list HCloud servers: %w", err)
}
for _, s := range servers {
cli.Print(" %s %s %s %s %s\n",
cli.SuccessStyle.Render("●"),
cli.BoldStyle.Render(s.Name),
s.PublicNet.IPv4.IP,
s.ServerType.Name,
cli.DimStyle.Render(s.Datacenter.Name))
}
} else {
cli.Print(" %s HCLOUD_TOKEN not set — skipping Cloud discovery\n",
cli.WarningStyle.Render("⚠"))
}
// Discover Robot servers
robotUser := os.Getenv("HETZNER_ROBOT_USER")
robotPass := os.Getenv("HETZNER_ROBOT_PASS")
if robotUser != "" && robotPass != "" {
cli.Print(" Discovering Hetzner Robot servers...\n")
hr := infra.NewHRobotClient(robotUser, robotPass)
servers, err := hr.ListServers(ctx)
if err != nil {
return fmt.Errorf("list Robot servers: %w", err)
}
for _, s := range servers {
status := cli.SuccessStyle.Render("●")
if s.Status != "ready" {
status = cli.WarningStyle.Render("○")
}
cli.Print(" %s %s %s %s %s\n",
status,
cli.BoldStyle.Render(s.ServerName),
s.ServerIP,
s.Product,
cli.DimStyle.Render(s.Datacenter))
}
} else {
cli.Print(" %s HETZNER_ROBOT_USER/PASS not set — skipping Robot discovery\n",
cli.WarningStyle.Render("⚠"))
}
return nil
}
func stepLoadBalancer(ctx context.Context, cfg *infra.Config) error {
hcloudToken := os.Getenv("HCLOUD_TOKEN")
if hcloudToken == "" {
return fmt.Errorf("HCLOUD_TOKEN required for load balancer management")
}
hc := infra.NewHCloudClient(hcloudToken)
// Check if LB already exists
lbs, err := hc.ListLoadBalancers(ctx)
if err != nil {
return fmt.Errorf("list load balancers: %w", err)
}
for _, lb := range lbs {
if lb.Name == cfg.LoadBalancer.Name {
cli.Print(" Load balancer '%s' already exists (ID: %d, IP: %s)\n",
lb.Name, lb.ID, lb.PublicNet.IPv4.IP)
return nil
}
}
if setupDryRun {
cli.Print(" [dry-run] Would create load balancer '%s' (%s) in %s\n",
cfg.LoadBalancer.Name, cfg.LoadBalancer.Type, cfg.LoadBalancer.Location)
for _, b := range cfg.LoadBalancer.Backends {
if host, ok := cfg.Hosts[b.Host]; ok {
cli.Print(" [dry-run] Backend: %s (%s:%d)\n", b.Host, host.IP, b.Port)
}
}
return nil
}
// Build targets from config
targets := make([]infra.HCloudLBCreateTarget, 0, len(cfg.LoadBalancer.Backends))
for _, b := range cfg.LoadBalancer.Backends {
host, ok := cfg.Hosts[b.Host]
if !ok {
return fmt.Errorf("backend host '%s' not found in config", b.Host)
}
targets = append(targets, infra.HCloudLBCreateTarget{
Type: "ip",
IP: &infra.HCloudLBTargetIP{IP: host.IP},
})
}
// Build services
services := make([]infra.HCloudLBService, 0, len(cfg.LoadBalancer.Listeners))
for _, l := range cfg.LoadBalancer.Listeners {
svc := infra.HCloudLBService{
Protocol: l.Protocol,
ListenPort: l.Frontend,
DestinationPort: l.Backend,
Proxyprotocol: l.ProxyProtocol,
HealthCheck: &infra.HCloudLBHealthCheck{
Protocol: cfg.LoadBalancer.Health.Protocol,
Port: l.Backend,
Interval: cfg.LoadBalancer.Health.Interval,
Timeout: 10,
Retries: 3,
HTTP: &infra.HCloudLBHCHTTP{
Path: cfg.LoadBalancer.Health.Path,
StatusCode: "2??",
},
},
}
services = append(services, svc)
}
req := infra.HCloudLBCreateRequest{
Name: cfg.LoadBalancer.Name,
LoadBalancerType: cfg.LoadBalancer.Type,
Location: cfg.LoadBalancer.Location,
Algorithm: infra.HCloudLBAlgorithm{Type: cfg.LoadBalancer.Algorithm},
Services: services,
Targets: targets,
Labels: map[string]string{
"project": "host-uk",
"managed": "core-cli",
},
}
cli.Print(" Creating load balancer '%s'...\n", cfg.LoadBalancer.Name)
lb, err := hc.CreateLoadBalancer(ctx, req)
if err != nil {
return fmt.Errorf("create load balancer: %w", err)
}
cli.Print(" Created: %s (ID: %d, IP: %s)\n",
cli.BoldStyle.Render(lb.Name), lb.ID, lb.PublicNet.IPv4.IP)
return nil
}
func stepDNS(ctx context.Context, cfg *infra.Config) error {
authID := os.Getenv("CLOUDNS_AUTH_ID")
authPass := os.Getenv("CLOUDNS_AUTH_PASSWORD")
if authID == "" || authPass == "" {
return fmt.Errorf("CLOUDNS_AUTH_ID and CLOUDNS_AUTH_PASSWORD required")
}
dns := infra.NewCloudNSClient(authID, authPass)
for zoneName, zone := range cfg.DNS.Zones {
cli.Print(" Zone: %s\n", cli.BoldStyle.Render(zoneName))
for _, rec := range zone.Records {
value := rec.Value
// Skip templated values (need LB IP first)
if value == "{{.lb_ip}}" {
cli.Print(" %s %s %s %s — %s\n",
cli.WarningStyle.Render("⚠"),
rec.Name, rec.Type, value,
cli.DimStyle.Render("needs LB IP (run setup --step=lb first)"))
continue
}
if setupDryRun {
cli.Print(" [dry-run] %s %s -> %s (TTL: %d)\n",
rec.Type, rec.Name, value, rec.TTL)
continue
}
changed, err := dns.EnsureRecord(ctx, zoneName, rec.Name, rec.Type, value, rec.TTL)
if err != nil {
cli.Print(" %s %s %s: %s\n", cli.ErrorStyle.Render("✗"), rec.Type, rec.Name, err)
continue
}
if changed {
cli.Print(" %s %s %s -> %s\n",
cli.SuccessStyle.Render("✓"),
rec.Type, rec.Name, value)
} else {
cli.Print(" %s %s %s (no change)\n",
cli.DimStyle.Render("·"),
rec.Type, rec.Name)
}
}
}
return nil
}

@@ -0,0 +1,64 @@
package prod
import (
"fmt"
"os"
"os/exec"
"syscall"
"github.com/host-uk/core/pkg/cli"
"github.com/spf13/cobra"
)
var sshCmd = &cobra.Command{
Use: "ssh <host>",
Short: "SSH into a production host",
Long: `Open an SSH session to a production host defined in infra.yaml.
Examples:
core prod ssh noc
core prod ssh de
core prod ssh de2
core prod ssh build`,
Args: cobra.ExactArgs(1),
RunE: runSSH,
}
func runSSH(cmd *cobra.Command, args []string) error {
cfg, _, err := loadConfig()
if err != nil {
return err
}
name := args[0]
host, ok := cfg.Hosts[name]
if !ok {
// List available hosts
cli.Print("Unknown host '%s'. Available:\n", name)
for n, h := range cfg.Hosts {
cli.Print(" %s %s (%s)\n", cli.BoldStyle.Render(n), h.IP, h.Role)
}
return fmt.Errorf("host '%s' not found in infra.yaml", name)
}
sshArgs := []string{
"ssh",
"-i", host.SSH.Key,
"-p", fmt.Sprintf("%d", host.SSH.Port),
"-o", "StrictHostKeyChecking=accept-new",
fmt.Sprintf("%s@%s", host.SSH.User, host.IP),
}
cli.Print("%s %s@%s (%s)\n",
cli.BoldStyle.Render("▶"),
host.SSH.User, host.FQDN,
cli.DimStyle.Render(host.IP))
sshPath, err := exec.LookPath("ssh")
if err != nil {
return fmt.Errorf("ssh not found: %w", err)
}
// Replace current process with SSH
return syscall.Exec(sshPath, sshArgs, os.Environ())
}

@@ -0,0 +1,325 @@
package prod
import (
"context"
"fmt"
"os"
"strings"
"sync"
"time"
"github.com/host-uk/core/pkg/ansible"
"github.com/host-uk/core/pkg/cli"
"github.com/host-uk/core/pkg/infra"
"github.com/spf13/cobra"
)
var statusCmd = &cobra.Command{
Use: "status",
Short: "Show production infrastructure health",
Long: `Check connectivity, services, and cluster health across all production hosts.
Tests:
- SSH connectivity to all hosts
- Docker daemon status
- Coolify controller (noc)
- Galera cluster state (de, de2)
- Redis Sentinel status (de, de2)
- Load balancer health (if HCLOUD_TOKEN set)`,
RunE: runStatus,
}
type hostStatus struct {
Name string
Host *infra.Host
Connected bool
ConnTime time.Duration
OS string
Docker string
Services map[string]string
Error error
}
func runStatus(cmd *cobra.Command, args []string) error {
cfg, cfgPath, err := loadConfig()
if err != nil {
return err
}
cli.Print("%s Infrastructure status from %s\n\n",
cli.BoldStyle.Render("▶"),
cli.DimStyle.Render(cfgPath))
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
defer cancel()
// Check all hosts in parallel
var (
wg sync.WaitGroup
mu sync.Mutex
statuses []hostStatus
)
for name, host := range cfg.Hosts {
wg.Add(1)
go func(name string, host *infra.Host) {
defer wg.Done()
s := checkHost(ctx, name, host)
mu.Lock()
statuses = append(statuses, s)
mu.Unlock()
}(name, host)
}
wg.Wait()
// Print results in consistent order
order := []string{"noc", "de", "de2", "build"}
for _, name := range order {
for _, s := range statuses {
if s.Name == name {
printHostStatus(s)
break
}
}
}
// Check LB if token available
if token := os.Getenv("HCLOUD_TOKEN"); token != "" {
fmt.Println()
checkLoadBalancer(ctx, token)
} else {
fmt.Println()
cli.Print("%s Load balancer: %s\n",
cli.DimStyle.Render(" ○"),
cli.DimStyle.Render("HCLOUD_TOKEN not set (skipped)"))
}
return nil
}
func checkHost(ctx context.Context, name string, host *infra.Host) hostStatus {
s := hostStatus{
Name: name,
Host: host,
Services: make(map[string]string),
}
sshCfg := ansible.SSHConfig{
Host: host.IP,
Port: host.SSH.Port,
User: host.SSH.User,
KeyFile: host.SSH.Key,
Timeout: 15 * time.Second,
}
client, err := ansible.NewSSHClient(sshCfg)
if err != nil {
s.Error = fmt.Errorf("create SSH client: %w", err)
return s
}
defer func() { _ = client.Close() }()
start := time.Now()
if err := client.Connect(ctx); err != nil {
s.Error = fmt.Errorf("SSH connect: %w", err)
return s
}
s.Connected = true
s.ConnTime = time.Since(start)
// OS info
stdout, _, _, _ := client.Run(ctx, "cat /etc/os-release 2>/dev/null | grep PRETTY_NAME | cut -d'\"' -f2")
s.OS = strings.TrimSpace(stdout)
// Docker
stdout, _, _, err = client.Run(ctx, "docker --version 2>/dev/null | head -1")
if err == nil && stdout != "" {
s.Docker = strings.TrimSpace(stdout)
}
// Check each expected service
for _, svc := range host.Services {
status := checkService(ctx, client, svc)
s.Services[svc] = status
}
return s
}
func checkService(ctx context.Context, client *ansible.SSHClient, service string) string {
switch service {
case "coolify":
stdout, _, _, _ := client.Run(ctx, "docker ps --format '{{.Names}}' 2>/dev/null | grep -c coolify")
if strings.TrimSpace(stdout) != "0" && strings.TrimSpace(stdout) != "" {
return "running"
}
return "not running"
case "traefik":
stdout, _, _, _ := client.Run(ctx, "docker ps --format '{{.Names}}' 2>/dev/null | grep -c traefik")
if strings.TrimSpace(stdout) != "0" && strings.TrimSpace(stdout) != "" {
return "running"
}
return "not running"
case "galera":
// Check Galera cluster state
stdout, _, _, _ := client.Run(ctx,
"docker exec $(docker ps -q --filter name=mariadb 2>/dev/null || echo none) "+
"mariadb -u root -e \"SHOW STATUS LIKE 'wsrep_cluster_size'\" --skip-column-names 2>/dev/null | awk '{print $2}'")
size := strings.TrimSpace(stdout)
if size != "" && size != "0" {
return fmt.Sprintf("cluster_size=%s", size)
}
// Try non-Docker
stdout, _, _, _ = client.Run(ctx,
"mariadb -u root -e \"SHOW STATUS LIKE 'wsrep_cluster_size'\" --skip-column-names 2>/dev/null | awk '{print $2}'")
size = strings.TrimSpace(stdout)
if size != "" && size != "0" {
return fmt.Sprintf("cluster_size=%s", size)
}
return "not running"
case "redis":
stdout, _, _, _ := client.Run(ctx,
"docker exec $(docker ps -q --filter name=redis 2>/dev/null || echo none) "+
"redis-cli ping 2>/dev/null")
if strings.TrimSpace(stdout) == "PONG" {
return "running"
}
stdout, _, _, _ = client.Run(ctx, "redis-cli ping 2>/dev/null")
if strings.TrimSpace(stdout) == "PONG" {
return "running"
}
return "not running"
case "forgejo-runner":
stdout, _, _, _ := client.Run(ctx, "systemctl is-active forgejo-runner 2>/dev/null || docker ps --format '{{.Names}}' 2>/dev/null | grep -c runner")
val := strings.TrimSpace(stdout)
if val == "active" || (val != "0" && val != "") {
return "running"
}
return "not running"
default:
// Generic docker container check
stdout, _, _, _ := client.Run(ctx,
fmt.Sprintf("docker ps --format '{{.Names}}' 2>/dev/null | grep -c %s", service))
if strings.TrimSpace(stdout) != "0" && strings.TrimSpace(stdout) != "" {
return "running"
}
return "not running"
}
}
func printHostStatus(s hostStatus) {
// Host header
roleStyle := cli.DimStyle
switch s.Host.Role {
case "app":
roleStyle = cli.SuccessStyle
case "bastion":
roleStyle = cli.WarningStyle
case "builder":
roleStyle = cli.InfoStyle
}
cli.Print(" %s %s %s %s\n",
cli.BoldStyle.Render(s.Name),
cli.DimStyle.Render(s.Host.IP),
roleStyle.Render(s.Host.Role),
cli.DimStyle.Render(s.Host.FQDN))
if s.Error != nil {
cli.Print(" %s %s\n", cli.ErrorStyle.Render("✗"), s.Error)
return
}
if !s.Connected {
cli.Print(" %s SSH unreachable\n", cli.ErrorStyle.Render("✗"))
return
}
// Connection info
cli.Print(" %s SSH %s",
cli.SuccessStyle.Render("✓"),
cli.DimStyle.Render(s.ConnTime.Round(time.Millisecond).String()))
if s.OS != "" {
cli.Print(" %s", cli.DimStyle.Render(s.OS))
}
fmt.Println()
if s.Docker != "" {
cli.Print(" %s %s\n", cli.SuccessStyle.Render("✓"), cli.DimStyle.Render(s.Docker))
}
// Services
for _, svc := range s.Host.Services {
status, ok := s.Services[svc]
if !ok {
continue
}
icon := cli.SuccessStyle.Render("●")
style := cli.SuccessStyle
if status == "not running" {
icon = cli.ErrorStyle.Render("○")
style = cli.ErrorStyle
}
cli.Print(" %s %s %s\n", icon, svc, style.Render(status))
}
fmt.Println()
}
func checkLoadBalancer(ctx context.Context, token string) {
hc := infra.NewHCloudClient(token)
lbs, err := hc.ListLoadBalancers(ctx)
if err != nil {
cli.Print(" %s Load balancer: %s\n", cli.ErrorStyle.Render("✗"), err)
return
}
if len(lbs) == 0 {
cli.Print(" %s No load balancers found\n", cli.DimStyle.Render("○"))
return
}
for _, lb := range lbs {
cli.Print(" %s LB: %s IP: %s Targets: %d\n",
cli.SuccessStyle.Render("●"),
cli.BoldStyle.Render(lb.Name),
lb.PublicNet.IPv4.IP,
len(lb.Targets))
for _, t := range lb.Targets {
for _, hs := range t.HealthStatus {
icon := cli.SuccessStyle.Render("●")
if hs.Status != "healthy" {
icon = cli.ErrorStyle.Render("○")
}
ip := ""
if t.IP != nil {
ip = t.IP.IP
}
cli.Print(" %s :%d %s %s\n", icon, hs.ListenPort, hs.Status, cli.DimStyle.Render(ip))
}
}
}
}
func loadConfig() (*infra.Config, string, error) {
if infraFile != "" {
cfg, err := infra.Load(infraFile)
return cfg, infraFile, err
}
cwd, err := os.Getwd()
if err != nil {
return nil, "", err
}
return infra.Discover(cwd)
}

@@ -22,6 +22,7 @@
// - monitor: Security monitoring aggregation
// - gitea: Gitea instance management (repos, issues, PRs, mirrors)
// - unifi: UniFi network management (sites, devices, clients)
// - prod: Production infrastructure management
package variants
@@ -45,6 +46,7 @@ import (
_ "github.com/host-uk/core/internal/cmd/php"
_ "github.com/host-uk/core/internal/cmd/pkgcmd"
_ "github.com/host-uk/core/internal/cmd/plugin"
_ "github.com/host-uk/core/internal/cmd/prod"
_ "github.com/host-uk/core/internal/cmd/qa"
_ "github.com/host-uk/core/internal/cmd/sdk"
_ "github.com/host-uk/core/internal/cmd/security"

pkg/infra/cloudns.go Normal file
@@ -0,0 +1,272 @@
package infra
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strconv"
"time"
)
const cloudnsBaseURL = "https://api.cloudns.net"
// CloudNSClient is an HTTP client for the CloudNS DNS API.
type CloudNSClient struct {
authID string
password string
client *http.Client
}
// NewCloudNSClient creates a new CloudNS API client.
// Uses sub-auth-user (auth-id) authentication.
func NewCloudNSClient(authID, password string) *CloudNSClient {
return &CloudNSClient{
authID: authID,
password: password,
client: &http.Client{
Timeout: 30 * time.Second,
},
}
}
// CloudNSZone represents a DNS zone.
type CloudNSZone struct {
Name string `json:"name"`
Type string `json:"type"`
Zone string `json:"zone"`
Status string `json:"status"`
}
// CloudNSRecord represents a DNS record.
type CloudNSRecord struct {
ID string `json:"id"`
Type string `json:"type"`
Host string `json:"host"`
Record string `json:"record"`
TTL string `json:"ttl"`
Priority string `json:"priority,omitempty"`
Status int `json:"status"`
}
// ListZones returns all DNS zones.
func (c *CloudNSClient) ListZones(ctx context.Context) ([]CloudNSZone, error) {
params := c.authParams()
params.Set("page", "1")
params.Set("rows-per-page", "100")
params.Set("search", "")
data, err := c.get(ctx, "/dns/list-zones.json", params)
if err != nil {
return nil, err
}
var zones []CloudNSZone
if err := json.Unmarshal(data, &zones); err != nil {
// CloudNS returns an empty object {} for no results instead of []
return nil, nil
}
return zones, nil
}
// ListRecords returns all DNS records for a zone.
func (c *CloudNSClient) ListRecords(ctx context.Context, domain string) (map[string]CloudNSRecord, error) {
params := c.authParams()
params.Set("domain-name", domain)
data, err := c.get(ctx, "/dns/records.json", params)
if err != nil {
return nil, err
}
var records map[string]CloudNSRecord
if err := json.Unmarshal(data, &records); err != nil {
return nil, fmt.Errorf("parse records: %w", err)
}
return records, nil
}
// CreateRecord creates a DNS record. Returns the record ID.
func (c *CloudNSClient) CreateRecord(ctx context.Context, domain, host, recordType, value string, ttl int) (string, error) {
params := c.authParams()
params.Set("domain-name", domain)
params.Set("host", host)
params.Set("record-type", recordType)
params.Set("record", value)
params.Set("ttl", strconv.Itoa(ttl))
data, err := c.post(ctx, "/dns/add-record.json", params)
if err != nil {
return "", err
}
var result struct {
Status string `json:"status"`
StatusDescription string `json:"statusDescription"`
Data struct {
ID int `json:"id"`
} `json:"data"`
}
if err := json.Unmarshal(data, &result); err != nil {
return "", fmt.Errorf("parse response: %w", err)
}
if result.Status != "Success" {
return "", fmt.Errorf("cloudns: %s", result.StatusDescription)
}
return strconv.Itoa(result.Data.ID), nil
}
// UpdateRecord updates an existing DNS record.
func (c *CloudNSClient) UpdateRecord(ctx context.Context, domain, recordID, host, recordType, value string, ttl int) error {
params := c.authParams()
params.Set("domain-name", domain)
params.Set("record-id", recordID)
params.Set("host", host)
params.Set("record-type", recordType)
params.Set("record", value)
params.Set("ttl", strconv.Itoa(ttl))
data, err := c.post(ctx, "/dns/mod-record.json", params)
if err != nil {
return err
}
var result struct {
Status string `json:"status"`
StatusDescription string `json:"statusDescription"`
}
if err := json.Unmarshal(data, &result); err != nil {
return fmt.Errorf("parse response: %w", err)
}
if result.Status != "Success" {
return fmt.Errorf("cloudns: %s", result.StatusDescription)
}
return nil
}
// DeleteRecord deletes a DNS record by ID.
func (c *CloudNSClient) DeleteRecord(ctx context.Context, domain, recordID string) error {
params := c.authParams()
params.Set("domain-name", domain)
params.Set("record-id", recordID)
data, err := c.post(ctx, "/dns/delete-record.json", params)
if err != nil {
return err
}
var result struct {
Status string `json:"status"`
StatusDescription string `json:"statusDescription"`
}
if err := json.Unmarshal(data, &result); err != nil {
return fmt.Errorf("parse response: %w", err)
}
if result.Status != "Success" {
return fmt.Errorf("cloudns: %s", result.StatusDescription)
}
return nil
}
// EnsureRecord creates or updates a DNS record to match the desired state.
// Returns true if a change was made.
func (c *CloudNSClient) EnsureRecord(ctx context.Context, domain, host, recordType, value string, ttl int) (bool, error) {
records, err := c.ListRecords(ctx, domain)
if err != nil {
return false, fmt.Errorf("list records: %w", err)
}
// Check if record already exists
for id, r := range records {
if r.Host == host && r.Type == recordType {
if r.Record == value {
return false, nil // Already correct
}
// Update existing record
if err := c.UpdateRecord(ctx, domain, id, host, recordType, value, ttl); err != nil {
return false, fmt.Errorf("update record: %w", err)
}
return true, nil
}
}
// Create new record
if _, err := c.CreateRecord(ctx, domain, host, recordType, value, ttl); err != nil {
return false, fmt.Errorf("create record: %w", err)
}
return true, nil
}
// SetACMEChallenge creates a DNS-01 ACME challenge TXT record.
func (c *CloudNSClient) SetACMEChallenge(ctx context.Context, domain, value string) (string, error) {
return c.CreateRecord(ctx, domain, "_acme-challenge", "TXT", value, 60)
}
// ClearACMEChallenge removes the DNS-01 ACME challenge TXT record.
func (c *CloudNSClient) ClearACMEChallenge(ctx context.Context, domain string) error {
records, err := c.ListRecords(ctx, domain)
if err != nil {
return err
}
for id, r := range records {
if r.Host == "_acme-challenge" && r.Type == "TXT" {
if err := c.DeleteRecord(ctx, domain, id); err != nil {
return err
}
}
}
return nil
}
func (c *CloudNSClient) authParams() url.Values {
params := url.Values{}
params.Set("auth-id", c.authID)
params.Set("auth-password", c.password)
return params
}
func (c *CloudNSClient) get(ctx context.Context, path string, params url.Values) ([]byte, error) {
u := cloudnsBaseURL + path + "?" + params.Encode()
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
if err != nil {
return nil, err
}
return c.doRaw(req)
}
func (c *CloudNSClient) post(ctx context.Context, path string, params url.Values) ([]byte, error) {
req, err := http.NewRequestWithContext(ctx, http.MethodPost, cloudnsBaseURL+path, nil)
if err != nil {
return nil, err
}
req.URL.RawQuery = params.Encode()
return c.doRaw(req)
}
func (c *CloudNSClient) doRaw(req *http.Request) ([]byte, error) {
resp, err := c.client.Do(req)
if err != nil {
return nil, fmt.Errorf("cloudns API: %w", err)
}
defer func() { _ = resp.Body.Close() }()
data, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("read response: %w", err)
}
if resp.StatusCode >= 400 {
return nil, fmt.Errorf("cloudns API %d: %s", resp.StatusCode, string(data))
}
return data, nil
}
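
A minimal usage sketch for the client above (hypothetical zone and IP; it
uses only the functions defined in this file):

package main

import (
	"context"
	"fmt"
	"log"
	"os"
	"time"

	"github.com/host-uk/core/pkg/infra"
)

func main() {
	// Auth comes from the same env vars the prod dns command reads.
	dns := infra.NewCloudNSClient(os.Getenv("CLOUDNS_AUTH_ID"), os.Getenv("CLOUDNS_AUTH_PASSWORD"))
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	// EnsureRecord is idempotent: create if missing, update if the value
	// differs, and report whether anything changed.
	changed, err := dns.EnsureRecord(ctx, "example.com", "www", "A", "192.0.2.10", 300)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("changed:", changed)
}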

pkg/infra/config.go Normal file
@@ -0,0 +1,300 @@
// Package infra provides infrastructure configuration and API clients
// for managing the Host UK production environment.
package infra
import (
"fmt"
"os"
"path/filepath"
"gopkg.in/yaml.v3"
)
// Config is the top-level infrastructure configuration parsed from infra.yaml.
type Config struct {
Hosts map[string]*Host `yaml:"hosts"`
LoadBalancer LoadBalancer `yaml:"load_balancer"`
Network Network `yaml:"network"`
DNS DNS `yaml:"dns"`
SSL SSL `yaml:"ssl"`
Database Database `yaml:"database"`
Cache Cache `yaml:"cache"`
Containers map[string]*Container `yaml:"containers"`
S3 S3Config `yaml:"s3"`
CDN CDN `yaml:"cdn"`
CICD CICD `yaml:"cicd"`
Monitoring Monitoring `yaml:"monitoring"`
Backups Backups `yaml:"backups"`
}
// Host represents a server in the infrastructure.
type Host struct {
FQDN string `yaml:"fqdn"`
IP string `yaml:"ip"`
PrivateIP string `yaml:"private_ip,omitempty"`
Type string `yaml:"type"` // hcloud, hrobot
Role string `yaml:"role"` // bastion, app, builder
SSH SSHConf `yaml:"ssh"`
Services []string `yaml:"services"`
}
// SSHConf holds SSH connection details for a host.
type SSHConf struct {
User string `yaml:"user"`
Key string `yaml:"key"`
Port int `yaml:"port"`
}
// LoadBalancer represents a Hetzner managed load balancer.
type LoadBalancer struct {
Name string `yaml:"name"`
FQDN string `yaml:"fqdn"`
Provider string `yaml:"provider"`
Type string `yaml:"type"`
Location string `yaml:"location"`
Algorithm string `yaml:"algorithm"`
Backends []Backend `yaml:"backends"`
Health HealthCheck `yaml:"health_check"`
Listeners []Listener `yaml:"listeners"`
SSL LBCert `yaml:"ssl"`
}
// Backend is a load balancer backend target.
type Backend struct {
Host string `yaml:"host"`
Port int `yaml:"port"`
}
// HealthCheck configures load balancer health checking.
type HealthCheck struct {
Protocol string `yaml:"protocol"`
Path string `yaml:"path"`
Interval int `yaml:"interval"`
}
// Listener maps a frontend port to a backend port.
type Listener struct {
Frontend int `yaml:"frontend"`
Backend int `yaml:"backend"`
Protocol string `yaml:"protocol"`
ProxyProtocol bool `yaml:"proxy_protocol"`
}
// LBCert holds the SSL certificate configuration for the load balancer.
type LBCert struct {
Certificate string `yaml:"certificate"`
SAN []string `yaml:"san"`
}
// Network describes the private network.
type Network struct {
CIDR string `yaml:"cidr"`
Name string `yaml:"name"`
}
// DNS holds DNS provider configuration and zone records.
type DNS struct {
Provider string `yaml:"provider"`
Nameservers []string `yaml:"nameservers"`
Zones map[string]*Zone `yaml:"zones"`
}
// Zone is a DNS zone with its records.
type Zone struct {
Records []DNSRecord `yaml:"records"`
}
// DNSRecord is a single DNS record.
type DNSRecord struct {
Name string `yaml:"name"`
Type string `yaml:"type"`
Value string `yaml:"value"`
TTL int `yaml:"ttl"`
}
// SSL holds SSL certificate configuration.
type SSL struct {
Wildcard WildcardCert `yaml:"wildcard"`
}
// WildcardCert describes a wildcard SSL certificate.
type WildcardCert struct {
Domains []string `yaml:"domains"`
Method string `yaml:"method"`
DNSProvider string `yaml:"dns_provider"`
Termination string `yaml:"termination"`
}
// Database describes the database cluster.
type Database struct {
Engine string `yaml:"engine"`
Version string `yaml:"version"`
Cluster string `yaml:"cluster"`
Nodes []DBNode `yaml:"nodes"`
SSTMethod string `yaml:"sst_method"`
Backup BackupConfig `yaml:"backup"`
}
// DBNode is a database cluster node.
type DBNode struct {
Host string `yaml:"host"`
Port int `yaml:"port"`
}
// BackupConfig describes automated backup settings.
type BackupConfig struct {
Schedule string `yaml:"schedule"`
Destination string `yaml:"destination"`
Bucket string `yaml:"bucket"`
Prefix string `yaml:"prefix"`
}
// Cache describes the cache/session cluster.
type Cache struct {
Engine string `yaml:"engine"`
Version string `yaml:"version"`
Sentinel bool `yaml:"sentinel"`
Nodes []CacheNode `yaml:"nodes"`
}
// CacheNode is a cache cluster node.
type CacheNode struct {
Host string `yaml:"host"`
Port int `yaml:"port"`
}
// Container describes a container deployment.
type Container struct {
Image string `yaml:"image"`
Port int `yaml:"port,omitempty"`
Runtime string `yaml:"runtime,omitempty"`
Command string `yaml:"command,omitempty"`
Replicas int `yaml:"replicas,omitempty"`
DependsOn []string `yaml:"depends_on,omitempty"`
}
// S3Config describes object storage.
type S3Config struct {
Endpoint string `yaml:"endpoint"`
Buckets map[string]*S3Bucket `yaml:"buckets"`
}
// S3Bucket is an S3 bucket configuration.
type S3Bucket struct {
Purpose string `yaml:"purpose"`
Paths []string `yaml:"paths"`
}
// CDN describes CDN configuration.
type CDN struct {
Provider string `yaml:"provider"`
Origin string `yaml:"origin"`
Zones []string `yaml:"zones"`
}
// CICD describes CI/CD configuration.
type CICD struct {
Provider string `yaml:"provider"`
URL string `yaml:"url"`
Runner string `yaml:"runner"`
Registry string `yaml:"registry"`
DeployHook string `yaml:"deploy_hook"`
}
// Monitoring describes monitoring configuration.
type Monitoring struct {
HealthEndpoints []HealthEndpoint `yaml:"health_endpoints"`
Alerts map[string]int `yaml:"alerts"`
}
// HealthEndpoint is a URL to monitor.
type HealthEndpoint struct {
URL string `yaml:"url"`
Interval int `yaml:"interval"`
}
// Backups describes backup schedules.
type Backups struct {
Daily []BackupJob `yaml:"daily"`
Weekly []BackupJob `yaml:"weekly"`
}
// BackupJob is a scheduled backup task.
type BackupJob struct {
Name string `yaml:"name"`
Type string `yaml:"type"`
Destination string `yaml:"destination,omitempty"`
Hosts []string `yaml:"hosts,omitempty"`
}
// Load reads and parses an infra.yaml file.
func Load(path string) (*Config, error) {
data, err := os.ReadFile(path)
if err != nil {
return nil, fmt.Errorf("read infra config: %w", err)
}
var cfg Config
if err := yaml.Unmarshal(data, &cfg); err != nil {
return nil, fmt.Errorf("parse infra config: %w", err)
}
// Expand SSH key paths
for _, h := range cfg.Hosts {
if h.SSH.Key != "" {
h.SSH.Key = expandPath(h.SSH.Key)
}
if h.SSH.Port == 0 {
h.SSH.Port = 22
}
}
return &cfg, nil
}
// Discover searches for infra.yaml in the given directory and parent directories.
func Discover(startDir string) (*Config, string, error) {
dir := startDir
for {
path := filepath.Join(dir, "infra.yaml")
if _, err := os.Stat(path); err == nil {
cfg, err := Load(path)
return cfg, path, err
}
parent := filepath.Dir(dir)
if parent == dir {
break
}
dir = parent
}
return nil, "", fmt.Errorf("infra.yaml not found (searched from %s)", startDir)
}
// HostsByRole returns all hosts matching the given role.
func (c *Config) HostsByRole(role string) map[string]*Host {
result := make(map[string]*Host)
for name, h := range c.Hosts {
if h.Role == role {
result[name] = h
}
}
return result
}
// AppServers returns hosts with role "app".
func (c *Config) AppServers() map[string]*Host {
return c.HostsByRole("app")
}
// expandPath expands ~ to home directory.
func expandPath(path string) string {
if len(path) > 0 && path[0] == '~' {
home, err := os.UserHomeDir()
if err != nil {
return path
}
return filepath.Join(home, path[1:])
}
return path
}
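
A short sketch of the discovery flow (standalone example, not part of the
commit; it relies only on the loader and helpers above):

package main

import (
	"fmt"
	"log"
	"os"

	"github.com/host-uk/core/pkg/infra"
)

func main() {
	cwd, err := os.Getwd()
	if err != nil {
		log.Fatal(err)
	}
	// Discover walks up from cwd until it finds infra.yaml.
	cfg, path, err := infra.Discover(cwd)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("loaded", path)
	// AppServers wraps HostsByRole("app"); SSH defaults (port 22, expanded
	// key path) were applied by Load.
	for name, h := range cfg.AppServers() {
		fmt.Printf("%s %s ssh %s@%s:%d\n", name, h.FQDN, h.SSH.User, h.IP, h.SSH.Port)
	}
}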

pkg/infra/config_test.go Normal file
@@ -0,0 +1,100 @@
package infra
import (
"os"
"path/filepath"
"testing"
)
func TestLoad_Good(t *testing.T) {
// Find infra.yaml relative to test
// Walk up from test dir to find it
dir, err := os.Getwd()
if err != nil {
t.Fatal(err)
}
cfg, path, err := Discover(dir)
if err != nil {
t.Skipf("infra.yaml not found from %s: %v", dir, err)
}
t.Logf("Loaded %s", path)
if len(cfg.Hosts) == 0 {
t.Error("expected at least one host")
}
// Check required hosts exist
for _, name := range []string{"noc", "de", "de2", "build"} {
if _, ok := cfg.Hosts[name]; !ok {
t.Errorf("expected host %q in config", name)
}
}
// Check de host details
de := cfg.Hosts["de"]
if de.IP != "116.202.82.115" {
t.Errorf("de IP = %q, want 116.202.82.115", de.IP)
}
if de.Role != "app" {
t.Errorf("de role = %q, want app", de.Role)
}
// Check LB config
if cfg.LoadBalancer.Name != "hermes" {
t.Errorf("LB name = %q, want hermes", cfg.LoadBalancer.Name)
}
if cfg.LoadBalancer.Type != "lb11" {
t.Errorf("LB type = %q, want lb11", cfg.LoadBalancer.Type)
}
if len(cfg.LoadBalancer.Backends) != 2 {
t.Errorf("LB backends = %d, want 2", len(cfg.LoadBalancer.Backends))
}
// Check app servers helper
apps := cfg.AppServers()
if len(apps) != 2 {
t.Errorf("AppServers() = %d, want 2", len(apps))
}
}
func TestLoad_Bad(t *testing.T) {
_, err := Load("/nonexistent/infra.yaml")
if err == nil {
t.Error("expected error for nonexistent file")
}
}
func TestLoad_Ugly(t *testing.T) {
// Invalid YAML
tmp := filepath.Join(t.TempDir(), "infra.yaml")
if err := os.WriteFile(tmp, []byte("{{invalid yaml"), 0644); err != nil {
t.Fatal(err)
}
_, err := Load(tmp)
if err == nil {
t.Error("expected error for invalid YAML")
}
}
func TestExpandPath(t *testing.T) {
home, _ := os.UserHomeDir()
tests := []struct {
input string
want string
}{
{"~/.ssh/id_rsa", filepath.Join(home, ".ssh/id_rsa")},
{"/absolute/path", "/absolute/path"},
{"relative/path", "relative/path"},
}
for _, tt := range tests {
got := expandPath(tt.input)
if got != tt.want {
t.Errorf("expandPath(%q) = %q, want %q", tt.input, got, tt.want)
}
}
}
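TestLoad_Good doubles as a spec for the shape of infra.yaml. For orientation, a pared-down config that would satisfy its assertions could look like the sketch below; the YAML keys are inferred from the structs' yaml tags and the tests, not copied from the shipped file, so treat every key name as an assumption:

hosts:
  noc:
    ip: 77.42.42.205
    role: bastion
  de:
    ip: 116.202.82.115
    role: app
    ssh:
      key: ~/.ssh/hostuk
  de2:
    ip: 88.99.195.41
    role: app
  build:
    ip: 46.224.93.62
    role: builder
load_balancer:
  name: hermes
  type: lb11
  backends: [de, de2]
backups:
  daily:
    - name: galera-dump
      type: database
      destination: s3://hostuk/backup/galera
      hosts: [de, de2]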

pkg/infra/hetzner.go (new file)

@ -0,0 +1,381 @@
package infra
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"time"
)
const (
hcloudBaseURL = "https://api.hetzner.cloud/v1"
hrobotBaseURL = "https://robot-ws.your-server.de"
)
// HCloudClient is an HTTP client for the Hetzner Cloud API.
type HCloudClient struct {
token string
client *http.Client
}
// NewHCloudClient creates a new Hetzner Cloud API client.
func NewHCloudClient(token string) *HCloudClient {
return &HCloudClient{
token: token,
client: &http.Client{
Timeout: 30 * time.Second,
},
}
}
// HCloudServer represents a Hetzner Cloud server.
type HCloudServer struct {
ID int `json:"id"`
Name string `json:"name"`
Status string `json:"status"`
PublicNet HCloudPublicNet `json:"public_net"`
PrivateNet []HCloudPrivateNet `json:"private_net"`
ServerType HCloudServerType `json:"server_type"`
Datacenter HCloudDatacenter `json:"datacenter"`
Labels map[string]string `json:"labels"`
}
// HCloudPublicNet holds public network info.
type HCloudPublicNet struct {
IPv4 HCloudIPv4 `json:"ipv4"`
}
// HCloudIPv4 holds an IPv4 address.
type HCloudIPv4 struct {
IP string `json:"ip"`
}
// HCloudPrivateNet holds private network info.
type HCloudPrivateNet struct {
IP string `json:"ip"`
Network int `json:"network"`
}
// HCloudServerType holds server type info.
type HCloudServerType struct {
Name string `json:"name"`
Description string `json:"description"`
Cores int `json:"cores"`
Memory float64 `json:"memory"`
Disk int `json:"disk"`
}
// HCloudDatacenter holds datacenter info.
type HCloudDatacenter struct {
Name string `json:"name"`
Description string `json:"description"`
}
// HCloudLoadBalancer represents a Hetzner Cloud load balancer.
type HCloudLoadBalancer struct {
ID int `json:"id"`
Name string `json:"name"`
PublicNet HCloudLBPublicNet `json:"public_net"`
Algorithm HCloudLBAlgorithm `json:"algorithm"`
Services []HCloudLBService `json:"services"`
Targets []HCloudLBTarget `json:"targets"`
Location HCloudDatacenter `json:"location"`
Labels map[string]string `json:"labels"`
}
// HCloudLBPublicNet holds LB public network info.
type HCloudLBPublicNet struct {
Enabled bool `json:"enabled"`
IPv4 HCloudIPv4 `json:"ipv4"`
}
// HCloudLBAlgorithm holds the LB algorithm.
type HCloudLBAlgorithm struct {
Type string `json:"type"`
}
// HCloudLBService describes an LB listener.
type HCloudLBService struct {
Protocol string `json:"protocol"`
ListenPort int `json:"listen_port"`
DestinationPort int `json:"destination_port"`
Proxyprotocol bool `json:"proxyprotocol"`
HTTP *HCloudLBHTTP `json:"http,omitempty"`
HealthCheck *HCloudLBHealthCheck `json:"health_check,omitempty"`
}
// HCloudLBHTTP holds HTTP-specific LB options.
type HCloudLBHTTP struct {
RedirectHTTP bool `json:"redirect_http"`
}
// HCloudLBHealthCheck holds LB health check config.
type HCloudLBHealthCheck struct {
Protocol string `json:"protocol"`
Port int `json:"port"`
Interval int `json:"interval"`
Timeout int `json:"timeout"`
Retries int `json:"retries"`
HTTP *HCloudLBHCHTTP `json:"http,omitempty"`
}
// HCloudLBHCHTTP holds HTTP health check options.
type HCloudLBHCHTTP struct {
Path string `json:"path"`
StatusCode string `json:"status_codes"`
}
// HCloudLBTarget is a load balancer backend target.
type HCloudLBTarget struct {
Type string `json:"type"`
IP *HCloudLBTargetIP `json:"ip,omitempty"`
Server *HCloudLBTargetServer `json:"server,omitempty"`
HealthStatus []HCloudLBHealthStatus `json:"health_status"`
}
// HCloudLBTargetIP is an IP-based LB target.
type HCloudLBTargetIP struct {
IP string `json:"ip"`
}
// HCloudLBTargetServer is a server-based LB target.
type HCloudLBTargetServer struct {
ID int `json:"id"`
}
// HCloudLBHealthStatus holds target health info.
type HCloudLBHealthStatus struct {
ListenPort int `json:"listen_port"`
Status string `json:"status"`
}
// HCloudLBCreateRequest holds load balancer creation params.
type HCloudLBCreateRequest struct {
Name string `json:"name"`
LoadBalancerType string `json:"load_balancer_type"`
Location string `json:"location"`
Algorithm HCloudLBAlgorithm `json:"algorithm"`
Services []HCloudLBService `json:"services"`
Targets []HCloudLBCreateTarget `json:"targets"`
Labels map[string]string `json:"labels"`
}
// HCloudLBCreateTarget is a target for LB creation.
type HCloudLBCreateTarget struct {
Type string `json:"type"`
IP *HCloudLBTargetIP `json:"ip,omitempty"`
}
// ListServers returns all Hetzner Cloud servers.
func (c *HCloudClient) ListServers(ctx context.Context) ([]HCloudServer, error) {
var result struct {
Servers []HCloudServer `json:"servers"`
}
if err := c.get(ctx, "/servers", &result); err != nil {
return nil, err
}
return result.Servers, nil
}
// ListLoadBalancers returns all load balancers.
func (c *HCloudClient) ListLoadBalancers(ctx context.Context) ([]HCloudLoadBalancer, error) {
var result struct {
LoadBalancers []HCloudLoadBalancer `json:"load_balancers"`
}
if err := c.get(ctx, "/load_balancers", &result); err != nil {
return nil, err
}
return result.LoadBalancers, nil
}
// GetLoadBalancer returns a load balancer by ID.
func (c *HCloudClient) GetLoadBalancer(ctx context.Context, id int) (*HCloudLoadBalancer, error) {
var result struct {
LoadBalancer HCloudLoadBalancer `json:"load_balancer"`
}
if err := c.get(ctx, fmt.Sprintf("/load_balancers/%d", id), &result); err != nil {
return nil, err
}
return &result.LoadBalancer, nil
}
// CreateLoadBalancer creates a new load balancer.
func (c *HCloudClient) CreateLoadBalancer(ctx context.Context, req HCloudLBCreateRequest) (*HCloudLoadBalancer, error) {
body, err := json.Marshal(req)
if err != nil {
return nil, fmt.Errorf("marshal request: %w", err)
}
var result struct {
LoadBalancer HCloudLoadBalancer `json:"load_balancer"`
}
if err := c.post(ctx, "/load_balancers", body, &result); err != nil {
return nil, err
}
return &result.LoadBalancer, nil
}
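// Example (editorial sketch, not part of this commit): a create request
// mirroring the "hermes" LB that `core prod lb` provisions. The name, type,
// and the two backend IPs come from infra.yaml and its tests; the location,
// algorithm, service ports, and health-check path are assumptions.
func exampleHermesRequest() HCloudLBCreateRequest {
return HCloudLBCreateRequest{
Name: "hermes",
LoadBalancerType: "lb11",
Location: "fsn1", // assumed; use the location closest to the targets
Algorithm: HCloudLBAlgorithm{Type: "round_robin"},
Services: []HCloudLBService{{
Protocol: "http",
ListenPort: 80,
DestinationPort: 80,
HealthCheck: &HCloudLBHealthCheck{
Protocol: "http",
Port: 80,
Interval: 15,
Timeout: 10,
Retries: 3,
HTTP: &HCloudLBHCHTTP{Path: "/", StatusCodes: []string{"2??", "3??"}},
},
}},
Targets: []HCloudLBCreateTarget{
{Type: "ip", IP: &HCloudLBTargetIP{IP: "116.202.82.115"}}, // de
{Type: "ip", IP: &HCloudLBTargetIP{IP: "88.99.195.41"}}, // de2
},
}
}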
// DeleteLoadBalancer deletes a load balancer by ID.
func (c *HCloudClient) DeleteLoadBalancer(ctx context.Context, id int) error {
return c.delete(ctx, fmt.Sprintf("/load_balancers/%d", id))
}
// CreateSnapshot creates a server snapshot.
func (c *HCloudClient) CreateSnapshot(ctx context.Context, serverID int, description string) error {
body, _ := json.Marshal(map[string]string{
"description": description,
"type": "snapshot",
})
return c.post(ctx, fmt.Sprintf("/servers/%d/actions/create_image", serverID), body, nil)
}
func (c *HCloudClient) get(ctx context.Context, path string, result any) error {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, hcloudBaseURL+path, nil)
if err != nil {
return err
}
return c.do(req, result)
}
func (c *HCloudClient) post(ctx context.Context, path string, body []byte, result any) error {
req, err := http.NewRequestWithContext(ctx, http.MethodPost, hcloudBaseURL+path, bytes.NewReader(body))
if err != nil {
return err
}
req.Header.Set("Content-Type", "application/json")
return c.do(req, result)
}
func (c *HCloudClient) delete(ctx context.Context, path string) error {
req, err := http.NewRequestWithContext(ctx, http.MethodDelete, hcloudBaseURL+path, nil)
if err != nil {
return err
}
return c.do(req, nil)
}
func (c *HCloudClient) do(req *http.Request, result any) error {
req.Header.Set("Authorization", "Bearer "+c.token)
resp, err := c.client.Do(req)
if err != nil {
return fmt.Errorf("hcloud API: %w", err)
}
defer func() { _ = resp.Body.Close() }()
data, err := io.ReadAll(resp.Body)
if err != nil {
return fmt.Errorf("read response: %w", err)
}
if resp.StatusCode >= 400 {
var apiErr struct {
Error struct {
Code string `json:"code"`
Message string `json:"message"`
} `json:"error"`
}
if json.Unmarshal(data, &apiErr) == nil && apiErr.Error.Message != "" {
return fmt.Errorf("hcloud API %d: %s — %s", resp.StatusCode, apiErr.Error.Code, apiErr.Error.Message)
}
return fmt.Errorf("hcloud API %d: %s", resp.StatusCode, string(data))
}
if result != nil {
if err := json.Unmarshal(data, result); err != nil {
return fmt.Errorf("decode response: %w", err)
}
}
return nil
}
// --- Hetzner Robot API ---
// HRobotClient is an HTTP client for the Hetzner Robot API.
type HRobotClient struct {
user string
password string
client *http.Client
}
// NewHRobotClient creates a new Hetzner Robot API client.
func NewHRobotClient(user, password string) *HRobotClient {
return &HRobotClient{
user: user,
password: password,
client: &http.Client{
Timeout: 30 * time.Second,
},
}
}
// HRobotServer represents a Hetzner Robot dedicated server.
type HRobotServer struct {
ServerIP string `json:"server_ip"`
ServerName string `json:"server_name"`
Product string `json:"product"`
Datacenter string `json:"dc"`
Status string `json:"status"`
Cancelled bool `json:"cancelled"`
PaidUntil string `json:"paid_until"`
}
// ListServers returns all Robot dedicated servers.
func (c *HRobotClient) ListServers(ctx context.Context) ([]HRobotServer, error) {
var raw []struct {
Server HRobotServer `json:"server"`
}
if err := c.get(ctx, "/server", &raw); err != nil {
return nil, err
}
servers := make([]HRobotServer, len(raw))
for i, s := range raw {
servers[i] = s.Server
}
return servers, nil
}
// GetServer returns a Robot server by IP.
func (c *HRobotClient) GetServer(ctx context.Context, ip string) (*HRobotServer, error) {
var raw struct {
Server HRobotServer `json:"server"`
}
if err := c.get(ctx, "/server/"+ip, &raw); err != nil {
return nil, err
}
return &raw.Server, nil
}
func (c *HRobotClient) get(ctx context.Context, path string, result any) error {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, hrobotBaseURL+path, nil)
if err != nil {
return err
}
req.SetBasicAuth(c.user, c.password)
resp, err := c.client.Do(req)
if err != nil {
return fmt.Errorf("hrobot API: %w", err)
}
defer func() { _ = resp.Body.Close() }()
data, err := io.ReadAll(resp.Body)
if err != nil {
return fmt.Errorf("read response: %w", err)
}
if resp.StatusCode >= 400 {
return fmt.Errorf("hrobot API %d: %s", resp.StatusCode, string(data))
}
if result != nil {
if err := json.Unmarshal(data, result); err != nil {
return fmt.Errorf("decode response: %w", err)
}
}
return nil
}
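Both clients are thin net/http wrappers, so wiring them into a command is mostly a matter of supplying credentials and a context. A hedged usage sketch (the environment variable names and import path are assumptions, not something this commit defines):

package main

import (
	"context"
	"fmt"
	"log"
	"os"
	"time"

	"host.uk/core/pkg/infra" // assumed module path; match your go.mod
)

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	// Hetzner Cloud: bearer-token auth.
	hc := infra.NewHCloudClient(os.Getenv("HCLOUD_TOKEN"))
	lbs, err := hc.ListLoadBalancers(ctx)
	if err != nil {
		log.Fatal(err)
	}
	for _, lb := range lbs {
		fmt.Println(lb.Name, lb.PublicNet.IPv4.IP)
	}

	// Hetzner Robot (dedicated servers): basic auth.
	hr := infra.NewHRobotClient(os.Getenv("HROBOT_USER"), os.Getenv("HROBOT_PASSWORD"))
	servers, err := hr.ListServers(ctx)
	if err != nil {
		log.Fatal(err)
	}
	for _, s := range servers {
		fmt.Println(s.ServerName, s.ServerIP, s.Product)
	}
}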

playbooks/galera-backup.yml (new file)

@ -0,0 +1,63 @@
# Galera Database Backup
# Dumps the database and uploads to Hetzner S3
#
# Usage:
# core deploy ansible playbooks/galera-backup.yml -i playbooks/inventory.yml -l de
---
- name: Backup Galera Database to S3
hosts: app_servers
become: true
vars:
db_root_password: "{{ lookup('env', 'DB_ROOT_PASSWORD') }}"
s3_endpoint: "{{ lookup('env', 'HETZNER_S3_ENDPOINT') | default('fsn1.your-objectstorage.com', true) }}"
s3_bucket: "{{ lookup('env', 'HETZNER_S3_BUCKET') | default('hostuk', true) }}"
s3_access_key: "{{ lookup('env', 'HETZNER_S3_ACCESS_KEY') }}"
s3_secret_key: "{{ lookup('env', 'HETZNER_S3_SECRET_KEY') }}"
backup_prefix: backup/galera
backup_retain_days: 30
tasks:
- name: Create backup directory
file:
path: /opt/backup
state: directory
mode: "0700"
- name: Dump database
shell: |
# pipefail so a failed mariadb-dump is not masked by gzip's exit status
set -o pipefail
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
DUMP_FILE="/opt/backup/hostuk-${TIMESTAMP}-{{ galera_node_name }}.sql.gz"
docker exec galera mariadb-dump \
-u root -p{{ db_root_password }} \
--all-databases \
--single-transaction \
--routines \
--triggers \
--events \
| gzip > "${DUMP_FILE}"
echo "${DUMP_FILE}"
args:
executable: /bin/bash
register: dump_result
- name: Install s3cmd if missing
shell: command -v s3cmd >/dev/null 2>&1 || pip3 install s3cmd
register: s3cmd_install
changed_when: "'Successfully installed' in s3cmd_install.stdout"
- name: Upload to S3
shell: |
s3cmd put {{ dump_result.stdout | trim }} \
s3://{{ s3_bucket }}/{{ backup_prefix }}/$(basename {{ dump_result.stdout | trim }}) \
--host={{ s3_endpoint }} \
--host-bucket='%(bucket)s.{{ s3_endpoint }}' \
--access_key={{ s3_access_key }} \
--secret_key={{ s3_secret_key }}
when: s3_access_key != ""
- name: Clean old local backups
shell: |
find /opt/backup -name "hostuk-*.sql.gz" -mtime +{{ backup_retain_days }} -delete
changed_when: false
- name: Show backup result
debug:
msg: "Backup completed: {{ dump_result.stdout | trim }}"
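Credentials come from the environment of the machine running core deploy, so a one-off manual backup of a single node looks roughly like this (values elided):

  DB_ROOT_PASSWORD=... \
  HETZNER_S3_ACCESS_KEY=... \
  HETZNER_S3_SECRET_KEY=... \
  core deploy ansible playbooks/galera-backup.yml -i playbooks/inventory.yml -l de

If the S3 keys are unset, the dump still runs locally; only the upload step is skipped by its when: guard.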

playbooks/galera-deploy.yml (new file)

@ -0,0 +1,96 @@
# MariaDB Galera Cluster Deployment
# Deploys a 2-node Galera cluster on de + de2
#
# Usage:
# core deploy ansible playbooks/galera-deploy.yml -i playbooks/inventory.yml
# core deploy ansible playbooks/galera-deploy.yml -i playbooks/inventory.yml -l de # Single node
#
# First-time bootstrap:
# Set galera_bootstrap=true for the first node:
# core deploy ansible playbooks/galera-deploy.yml -i playbooks/inventory.yml -l de -e galera_bootstrap=true
---
- name: Deploy MariaDB Galera Cluster
hosts: app_servers
become: true
vars:
mariadb_version: "11"
galera_cluster_address: "gcomm://116.202.82.115,88.99.195.41"
galera_bootstrap: false
db_root_password: "{{ lookup('env', 'DB_ROOT_PASSWORD') }}"
db_password: "{{ lookup('env', 'DB_PASSWORD') }}"
tasks:
- name: Create MariaDB data directory
file:
path: /opt/galera/data
state: directory
mode: "0755"
- name: Create MariaDB config directory
file:
path: /opt/galera/conf.d
state: directory
mode: "0755"
- name: Write Galera configuration
copy:
dest: /opt/galera/conf.d/galera.cnf
content: |
[mysqld]
wsrep_on=ON
wsrep_provider=/usr/lib/galera/libgalera_smm.so
wsrep_cluster_name={{ galera_cluster_name }}
wsrep_cluster_address={{ 'gcomm://' if galera_bootstrap else galera_cluster_address }}
wsrep_node_address={{ galera_node_address }}
wsrep_node_name={{ galera_node_name }}
wsrep_sst_method={{ galera_sst_method }}
binlog_format=ROW
default_storage_engine=InnoDB
innodb_autoinc_lock_mode=2
innodb_buffer_pool_size=1G
innodb_log_file_size=256M
character_set_server=utf8mb4
collation_server=utf8mb4_unicode_ci
- name: Stop existing MariaDB container
shell: docker stop galera 2>/dev/null || true
changed_when: false
- name: Remove existing MariaDB container
shell: docker rm galera 2>/dev/null || true
changed_when: false
- name: Start MariaDB Galera container
shell: |
docker run -d \
--name galera \
--restart unless-stopped \
--network host \
-v /opt/galera/data:/var/lib/mysql \
-v /opt/galera/conf.d:/etc/mysql/conf.d \
-e MARIADB_ROOT_PASSWORD={{ db_root_password }} \
-e MARIADB_DATABASE={{ db_name }} \
-e MARIADB_USER={{ db_user }} \
-e MARIADB_PASSWORD={{ db_password }} \
mariadb:{{ mariadb_version }}
- name: Wait for MariaDB to be ready
shell: |
for i in $(seq 1 60); do
docker exec galera mariadb -u root -p{{ db_root_password }} -e "SELECT 1" 2>/dev/null && exit 0
sleep 2
done
exit 1
changed_when: false
- name: Check Galera cluster status
shell: |
docker exec galera mariadb -u root -p{{ db_root_password }} \
-e "SHOW STATUS WHERE Variable_name IN ('wsrep_cluster_size','wsrep_ready','wsrep_cluster_status')" \
--skip-column-names
register: galera_status
changed_when: false
- name: Display cluster status
debug:
var: galera_status.stdout_lines
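Bootstrap order matters for a fresh cluster: the first node must start with an empty gcomm:// address so it forms a new primary component, and only then can the second node join and state-transfer (SST) from it. A plausible first-time sequence, building on the usage lines in the header:

  # 1. Bootstrap the first node
  core deploy ansible playbooks/galera-deploy.yml -i playbooks/inventory.yml -l de -e galera_bootstrap=true
  # 2. Join the second node (SSTs from de)
  core deploy ansible playbooks/galera-deploy.yml -i playbooks/inventory.yml -l de2
  # 3. Re-run the first node without the flag so its config points at the full cluster address again
  core deploy ansible playbooks/galera-deploy.yml -i playbooks/inventory.yml -l de

Step 3 is a follow-up the playbook does not enforce: a node left configured with a bare gcomm:// would bootstrap a second, divergent cluster on its next restart.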

playbooks/inventory.yml (new file)

@ -0,0 +1,36 @@
# Ansible inventory for Host UK production
# Used by: core deploy ansible <playbook> -i playbooks/inventory.yml
all:
vars:
ansible_user: root
ansible_ssh_private_key_file: ~/.ssh/hostuk
children:
bastion:
hosts:
noc:
ansible_host: 77.42.42.205
private_ip: 10.0.0.4
app_servers:
hosts:
de:
ansible_host: 116.202.82.115
galera_node_name: de
galera_node_address: 116.202.82.115
de2:
ansible_host: 88.99.195.41
galera_node_name: de2
galera_node_address: 88.99.195.41
vars:
galera_cluster_name: hostuk-galera
galera_sst_method: mariabackup
db_name: hostuk
db_user: hostuk
redis_maxmemory: 512mb
builders:
hosts:
build:
ansible_host: 46.224.93.62
private_ip: 10.0.0.5

playbooks/redis-deploy.yml (new file)

@ -0,0 +1,98 @@
# Redis Sentinel Deployment
# Deploys Redis with Sentinel on de + de2
#
# Usage:
# core deploy ansible playbooks/redis-deploy.yml -i playbooks/inventory.yml
---
- name: Deploy Redis with Sentinel
hosts: app_servers
become: true
vars:
redis_version: "7"
redis_password: "{{ lookup('env', 'REDIS_PASSWORD') | default('', true) }}"
tasks:
- name: Create Redis data directory
file:
path: /opt/redis/data
state: directory
mode: "0755"
- name: Create Redis config directory
file:
path: /opt/redis/conf
state: directory
mode: "0755"
- name: Write Redis configuration
copy:
dest: /opt/redis/conf/redis.conf
content: |
maxmemory {{ redis_maxmemory }}
maxmemory-policy allkeys-lru
appendonly yes
appendfsync everysec
tcp-keepalive 300
timeout 0
{% if redis_password %}
requirepass {{ redis_password }}
masterauth {{ redis_password }}
{% endif %}
- name: Write Sentinel configuration
copy:
dest: /opt/redis/conf/sentinel.conf
content: |
port 26379
sentinel monitor hostuk-redis 116.202.82.115 6379 2
sentinel down-after-milliseconds hostuk-redis 5000
sentinel failover-timeout hostuk-redis 60000
sentinel parallel-syncs hostuk-redis 1
{% if redis_password %}
sentinel auth-pass hostuk-redis {{ redis_password }}
{% endif %}
- name: Stop existing Redis containers
shell: |
docker stop redis redis-sentinel 2>/dev/null || true
docker rm redis redis-sentinel 2>/dev/null || true
changed_when: false
- name: Start Redis container
shell: |
docker run -d \
--name redis \
--restart unless-stopped \
--network host \
-v /opt/redis/data:/data \
-v /opt/redis/conf/redis.conf:/usr/local/etc/redis/redis.conf \
redis:{{ redis_version }}-alpine \
redis-server /usr/local/etc/redis/redis.conf
- name: Start Redis Sentinel container
shell: |
docker run -d \
--name redis-sentinel \
--restart unless-stopped \
--network host \
-v /opt/redis/conf/sentinel.conf:/usr/local/etc/redis/sentinel.conf \
redis:{{ redis_version }}-alpine \
redis-sentinel /usr/local/etc/redis/sentinel.conf
- name: Wait for Redis to be ready
shell: |
# authenticate when requirepass is set; otherwise PING returns NOAUTH
for i in $(seq 1 30); do
docker exec redis redis-cli {% if redis_password %}-a '{{ redis_password }}' --no-auth-warning {% endif %}ping 2>/dev/null | grep -q PONG && exit 0
sleep 1
done
exit 1
changed_when: false
- name: Check Redis info
shell: docker exec redis redis-cli {% if redis_password %}-a '{{ redis_password }}' --no-auth-warning {% endif %}info replication | head -10
register: redis_info
changed_when: false
- name: Display Redis info
debug:
var: redis_info.stdout_lines
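Application code should discover the current master through Sentinel rather than pinning de's IP. A minimal sketch with the go-redis failover client (the library choice is an assumption; the master name and port 26379 come from sentinel.conf above):

package main

import (
	"context"
	"fmt"

	"github.com/redis/go-redis/v9"
)

func main() {
	// Sentinel resolves the current master for "hostuk-redis" and follows failovers.
	rdb := redis.NewFailoverClient(&redis.FailoverOptions{
		MasterName: "hostuk-redis", // matches "sentinel monitor hostuk-redis ..."
		SentinelAddrs: []string{
			"116.202.82.115:26379", // de
			"88.99.195.41:26379",   // de2
		},
		// Password: os.Getenv("REDIS_PASSWORD"), // if requirepass is enabled
	})
	fmt.Println(rdb.Ping(context.Background()).Err())
}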