From 1b861494f15924c2fd0db598837c27f7f8253456 Mon Sep 17 00:00:00 2001 From: Snider Date: Fri, 6 Feb 2026 03:03:29 +0000 Subject: [PATCH] feat(prod): add production infrastructure management MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `core prod` command with full production infrastructure tooling: - `core prod status` — parallel SSH health checks across all hosts, Galera cluster state, Redis sentinel, Docker, LB health - `core prod setup` — Phase 1 foundation: Hetzner topology discovery, managed LB creation, CloudNS DNS record management - `core prod dns` — CloudNS record CRUD with idempotent EnsureRecord - `core prod lb` — Hetzner Cloud LB status and creation - `core prod ssh <host>` — SSH into hosts defined in infra.yaml New packages: - pkg/infra: config parsing, Hetzner Cloud/Robot API, CloudNS DNS API - infra.yaml: declarative production topology (hosts, LB, DNS, SSL, Galera, Redis, containers, S3, CDN, CI/CD, monitoring, backups) Docker: - Dockerfile.app (PHP 8.3-FPM, multi-stage) - Dockerfile.web (Nginx + security headers) - docker-compose.prod.yml (app, web, horizon, scheduler, mcp, redis, galera) Ansible playbooks (runnable via `core deploy ansible`): - galera-deploy.yml, redis-deploy.yml, galera-backup.yml - inventory.yml with all production hosts CI/CD: - .forgejo/workflows/deploy.yml for Forgejo Actions pipeline Co-Authored-By: Claude Opus 4.6 --- .forgejo/workflows/deploy.yml | 146 +++++++++++ docker/Dockerfile.app | 113 ++++++++ docker/Dockerfile.web | 19 ++ docker/docker-compose.prod.yml | 200 +++++++++++++++ docker/nginx/default.conf | 59 +++++ docker/nginx/security-headers.conf | 6 + docker/php/opcache.ini | 10 + docker/php/php-fpm.conf | 22 ++ infra.yaml | 268 ++++++++++++++++++++ internal/cmd/prod/cmd_commands.go | 15 ++ internal/cmd/prod/cmd_dns.go | 129 ++++++++++ internal/cmd/prod/cmd_lb.go | 113 +++++++++ internal/cmd/prod/cmd_prod.go | 35 +++ internal/cmd/prod/cmd_setup.go | 284 +++++++++++++++++++++ internal/cmd/prod/cmd_ssh.go | 64 +++++ internal/cmd/prod/cmd_status.go | 325 ++++++++++++++++++++++++ internal/variants/full.go | 2 + pkg/infra/cloudns.go | 272 ++++++++++++++++++++ pkg/infra/config.go | 300 +++++++++++++++++++++++ pkg/infra/config_test.go | 100 ++++++++ pkg/infra/hetzner.go | 381 +++++++++++++++++++++++++++++ playbooks/galera-backup.yml | 63 +++++ playbooks/galera-deploy.yml | 96 ++++++++ playbooks/inventory.yml | 36 +++ playbooks/redis-deploy.yml | 98 ++++++++ 25 files changed, 3156 insertions(+) create mode 100644 .forgejo/workflows/deploy.yml create mode 100644 docker/Dockerfile.app create mode 100644 docker/Dockerfile.web create mode 100644 docker/docker-compose.prod.yml create mode 100644 docker/nginx/default.conf create mode 100644 docker/nginx/security-headers.conf create mode 100644 docker/php/opcache.ini create mode 100644 docker/php/php-fpm.conf create mode 100644 infra.yaml create mode 100644 internal/cmd/prod/cmd_commands.go create mode 100644 internal/cmd/prod/cmd_dns.go create mode 100644 internal/cmd/prod/cmd_lb.go create mode 100644 internal/cmd/prod/cmd_prod.go create mode 100644 internal/cmd/prod/cmd_setup.go create mode 100644 internal/cmd/prod/cmd_ssh.go create mode 100644 internal/cmd/prod/cmd_status.go create mode 100644 pkg/infra/cloudns.go create mode 100644 pkg/infra/config.go create mode 100644 pkg/infra/config_test.go create mode 100644 pkg/infra/hetzner.go create mode 100644 playbooks/galera-backup.yml create mode 100644 playbooks/galera-deploy.yml create mode 100644
playbooks/inventory.yml create mode 100644 playbooks/redis-deploy.yml diff --git a/.forgejo/workflows/deploy.yml b/.forgejo/workflows/deploy.yml new file mode 100644 index 00000000..bc689cad --- /dev/null +++ b/.forgejo/workflows/deploy.yml @@ -0,0 +1,146 @@ +# Host UK Production Deployment Pipeline +# Runs on Forgejo Actions (gitea.snider.dev) +# Runner: build.de.host.uk.com +# +# Workflow: +# 1. composer install + test +# 2. npm ci + build +# 3. docker build + push +# 4. Coolify deploy webhook (rolling restart) + +name: Deploy + +on: + push: + branches: [main] + workflow_dispatch: + +env: + REGISTRY: gitea.snider.dev + IMAGE_APP: host-uk/app + IMAGE_WEB: host-uk/web + IMAGE_CORE: host-uk/core + +jobs: + test: + name: Test + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Setup PHP + uses: shivammathur/setup-php@v2 + with: + php-version: "8.3" + extensions: bcmath, gd, intl, mbstring, pdo_mysql, redis, zip + coverage: none + + - name: Install Composer dependencies + run: composer install --no-interaction --prefer-dist + + - name: Run tests + run: composer test + + - name: Check code style + run: ./vendor/bin/pint --test + + build-app: + name: Build App Image + needs: test + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: "22" + cache: "npm" + + - name: Login to registry + run: echo "${{ secrets.REGISTRY_TOKEN }}" | docker login ${{ env.REGISTRY }} -u ${{ secrets.REGISTRY_USER }} --password-stdin + + - name: Build and push app image + run: | + SHA=$(git rev-parse --short HEAD) + docker build \ + -f docker/Dockerfile.app \ + -t ${{ env.REGISTRY }}/${{ env.IMAGE_APP }}:${SHA} \ + -t ${{ env.REGISTRY }}/${{ env.IMAGE_APP }}:latest \ + . + docker push ${{ env.REGISTRY }}/${{ env.IMAGE_APP }}:${SHA} + docker push ${{ env.REGISTRY }}/${{ env.IMAGE_APP }}:latest + + build-web: + name: Build Web Image + needs: test + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Login to registry + run: echo "${{ secrets.REGISTRY_TOKEN }}" | docker login ${{ env.REGISTRY }} -u ${{ secrets.REGISTRY_USER }} --password-stdin + + - name: Build and push web image + run: | + SHA=$(git rev-parse --short HEAD) + docker build \ + -f docker/Dockerfile.web \ + -t ${{ env.REGISTRY }}/${{ env.IMAGE_WEB }}:${SHA} \ + -t ${{ env.REGISTRY }}/${{ env.IMAGE_WEB }}:latest \ + . + docker push ${{ env.REGISTRY }}/${{ env.IMAGE_WEB }}:${SHA} + docker push ${{ env.REGISTRY }}/${{ env.IMAGE_WEB }}:latest + + build-core: + name: Build Core Image + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: "1.25" + + - name: Build core binary + run: | + go build -ldflags '-s -w' -o bin/core . + + - name: Login to registry + run: echo "${{ secrets.REGISTRY_TOKEN }}" | docker login ${{ env.REGISTRY }} -u ${{ secrets.REGISTRY_USER }} --password-stdin + + - name: Build and push core image + run: | + SHA=$(git rev-parse --short HEAD) + cat > Dockerfile.core <<'EOF' + FROM alpine:3.20 + RUN apk add --no-cache ca-certificates + COPY bin/core /usr/local/bin/core + ENTRYPOINT ["core"] + EOF + docker build \ + -f Dockerfile.core \ + -t ${{ env.REGISTRY }}/${{ env.IMAGE_CORE }}:${SHA} \ + -t ${{ env.REGISTRY }}/${{ env.IMAGE_CORE }}:latest \ + . 
+ docker push ${{ env.REGISTRY }}/${{ env.IMAGE_CORE }}:${SHA} + docker push ${{ env.REGISTRY }}/${{ env.IMAGE_CORE }}:latest + + deploy: + name: Deploy to Production + needs: [build-app, build-web, build-core] + runs-on: ubuntu-latest + steps: + - name: Trigger Coolify deploy + run: | + curl -s -X POST \ + -H "Authorization: Bearer ${{ secrets.COOLIFY_TOKEN }}" \ + "${{ secrets.COOLIFY_URL }}/api/v1/deploy" \ + -H "Content-Type: application/json" \ + -d '{"uuid": "${{ secrets.COOLIFY_APP_UUID }}", "force": false}' + + - name: Wait for deployment + run: | + echo "Deployment triggered. Coolify will perform rolling restart." + echo "Monitor at: ${{ secrets.COOLIFY_URL }}" diff --git a/docker/Dockerfile.app b/docker/Dockerfile.app new file mode 100644 index 00000000..a75b3fe7 --- /dev/null +++ b/docker/Dockerfile.app @@ -0,0 +1,113 @@ +# Host UK — Laravel Application Container +# PHP 8.3-FPM with all extensions required by the federated monorepo +# +# Build (run from core/): docker build -f docker/Dockerfile.app -t host-uk/app:latest .. +# (the .. build context is the host-uk/ workspace root, one level above core/) + +FROM php:8.3-fpm-alpine AS base + +# System dependencies +RUN apk add --no-cache \ + git \ + curl \ + libpng-dev \ + libjpeg-turbo-dev \ + freetype-dev \ + libwebp-dev \ + libzip-dev \ + icu-dev \ + oniguruma-dev \ + libxml2-dev \ + linux-headers \ + $PHPIZE_DEPS + +# PHP extensions +RUN docker-php-ext-configure gd \ + --with-freetype \ + --with-jpeg \ + --with-webp \ + && docker-php-ext-install -j$(nproc) \ + bcmath \ + exif \ + gd \ + intl \ + mbstring \ + opcache \ + pcntl \ + pdo_mysql \ + soap \ + xml \ + zip + +# Redis extension +RUN pecl install redis && docker-php-ext-enable redis + +# php-fpm-healthcheck, used by the HEALTHCHECK below (reads the FPM status page at pm.status_path) +RUN apk add --no-cache fcgi \ + && curl -sSLo /usr/local/bin/php-fpm-healthcheck https://raw.githubusercontent.com/renatomefi/php-fpm-healthcheck/master/php-fpm-healthcheck \ + && chmod +x /usr/local/bin/php-fpm-healthcheck +ENV FCGI_STATUS_PATH=/fpm-status + +# Composer +COPY --from=composer:2 /usr/bin/composer /usr/bin/composer + +# PHP configuration +RUN mv "$PHP_INI_DIR/php.ini-production" "$PHP_INI_DIR/php.ini" +COPY docker/php/opcache.ini $PHP_INI_DIR/conf.d/opcache.ini +COPY docker/php/php-fpm.conf /usr/local/etc/php-fpm.d/zz-host-uk.conf + +# --- Build stage --- +FROM base AS build + +WORKDIR /app + +# Install dependencies first (cache layer) +COPY composer.json composer.lock ./ +RUN composer install \ + --no-dev \ + --no-scripts \ + --no-autoloader \ + --prefer-dist \ + --no-interaction + +# Copy application +COPY . . + +# Generate autoloader and run post-install +RUN composer dump-autoload --optimize --no-dev \ + && php artisan package:discover --ansi + +# Build frontend assets +RUN if [ -f package.json ]; then \ + apk add --no-cache nodejs npm && \ + npm ci --production=false && \ + npm run build && \ + rm -rf node_modules; \ + fi + +# --- Production stage --- +FROM base AS production + +WORKDIR /app + +# Copy built application +COPY --from=build /app /app + +# Create storage directories +RUN mkdir -p \ + storage/framework/cache/data \ + storage/framework/sessions \ + storage/framework/views \ + storage/logs \ + bootstrap/cache + +# Permissions +RUN chown -R www-data:www-data storage bootstrap/cache + +# Health check +HEALTHCHECK --interval=30s --timeout=3s --start-period=10s --retries=3 \ + CMD php-fpm-healthcheck || exit 1 + +USER www-data + +EXPOSE 9000 diff --git a/docker/Dockerfile.web b/docker/Dockerfile.web new file mode 100644 index 00000000..f57b472a --- /dev/null +++ b/docker/Dockerfile.web @@ -0,0 +1,19 @@ +# Host UK — Nginx Web Server +# Serves static files and proxies PHP to FPM container +# +# Build: docker build -f docker/Dockerfile.web -t host-uk/web:latest .
+ +FROM nginx:1.27-alpine + +# Copy nginx configuration +COPY docker/nginx/default.conf /etc/nginx/conf.d/default.conf +COPY docker/nginx/security-headers.conf /etc/nginx/snippets/security-headers.conf + +# Copy static assets from app build +# (In production, these are volume-mounted from the app container) +# COPY --from=host-uk/app:latest /app/public /app/public + +HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \ + CMD wget -qO- http://localhost/health || exit 1 + +EXPOSE 80 diff --git a/docker/docker-compose.prod.yml b/docker/docker-compose.prod.yml new file mode 100644 index 00000000..7f25fa74 --- /dev/null +++ b/docker/docker-compose.prod.yml @@ -0,0 +1,200 @@ +# Host UK Production Docker Compose +# Deployed to de.host.uk.com and de2.host.uk.com via Coolify +# +# Container topology per app server: +# app - PHP 8.3-FPM (all Laravel modules) +# web - Nginx (static files + FastCGI proxy) +# horizon - Laravel Horizon (queue worker) +# scheduler - Laravel scheduler +# mcp - Go MCP server +# redis - Redis 7 (local cache + sessions) +# galera - MariaDB 11 (Galera cluster node) + +services: + app: + image: ${REGISTRY:-gitea.snider.dev}/host-uk/app:${TAG:-latest} + restart: unless-stopped + volumes: + - app-storage:/app/storage + environment: + - APP_ENV=production + - APP_DEBUG=false + - APP_URL=${APP_URL:-https://host.uk.com} + - DB_HOST=galera + - DB_PORT=3306 + - DB_DATABASE=${DB_DATABASE:-hostuk} + - DB_USERNAME=${DB_USERNAME:-hostuk} + - DB_PASSWORD=${DB_PASSWORD} + - REDIS_HOST=redis + - REDIS_PORT=6379 + - CACHE_DRIVER=redis + - SESSION_DRIVER=redis + - QUEUE_CONNECTION=redis + depends_on: + redis: + condition: service_healthy + galera: + condition: service_healthy + healthcheck: + test: ["CMD-SHELL", "php-fpm-healthcheck || exit 1"] + interval: 30s + timeout: 3s + start_period: 10s + retries: 3 + networks: + - app-net + + web: + image: ${REGISTRY:-gitea.snider.dev}/host-uk/web:${TAG:-latest} + restart: unless-stopped + ports: + - "${WEB_PORT:-80}:80" + volumes: + - app-storage:/app/storage:ro + depends_on: + app: + condition: service_healthy + healthcheck: + test: ["CMD", "wget", "-qO-", "http://localhost/health"] + interval: 30s + timeout: 3s + start_period: 5s + retries: 3 + networks: + - app-net + + horizon: + image: ${REGISTRY:-gitea.snider.dev}/host-uk/app:${TAG:-latest} + restart: unless-stopped + command: php artisan horizon + volumes: + - app-storage:/app/storage + environment: + - APP_ENV=production + - DB_HOST=galera + - DB_PORT=3306 + - DB_DATABASE=${DB_DATABASE:-hostuk} + - DB_USERNAME=${DB_USERNAME:-hostuk} + - DB_PASSWORD=${DB_PASSWORD} + - REDIS_HOST=redis + - REDIS_PORT=6379 + depends_on: + app: + condition: service_healthy + healthcheck: + test: ["CMD-SHELL", "php artisan horizon:status | grep -q running"] + interval: 60s + timeout: 5s + start_period: 30s + retries: 3 + networks: + - app-net + + scheduler: + image: ${REGISTRY:-gitea.snider.dev}/host-uk/app:${TAG:-latest} + restart: unless-stopped + command: php artisan schedule:work + volumes: + - app-storage:/app/storage + environment: + - APP_ENV=production + - DB_HOST=galera + - DB_PORT=3306 + - DB_DATABASE=${DB_DATABASE:-hostuk} + - DB_USERNAME=${DB_USERNAME:-hostuk} + - DB_PASSWORD=${DB_PASSWORD} + - REDIS_HOST=redis + - REDIS_PORT=6379 + depends_on: + app: + condition: service_healthy + networks: + - app-net + + mcp: + image: ${REGISTRY:-gitea.snider.dev}/host-uk/core:${TAG:-latest} + restart: unless-stopped + command: core mcp serve + ports: + - "${MCP_PORT:-9001}:9000" + environment: + - 
MCP_ADDR=:9000 + healthcheck: + test: ["CMD-SHELL", "nc -z localhost 9000 || exit 1"] + interval: 30s + timeout: 3s + retries: 3 + networks: + - app-net + + redis: + image: redis:7-alpine + restart: unless-stopped + command: > + redis-server + --maxmemory 512mb + --maxmemory-policy allkeys-lru + --appendonly yes + --appendfsync everysec + volumes: + - redis-data:/data + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 3s + retries: 5 + networks: + - app-net + + galera: + image: mariadb:11 + restart: unless-stopped + environment: + - MARIADB_ROOT_PASSWORD=${DB_ROOT_PASSWORD} + - MARIADB_DATABASE=${DB_DATABASE:-hostuk} + - MARIADB_USER=${DB_USERNAME:-hostuk} + - MARIADB_PASSWORD=${DB_PASSWORD} + - WSREP_CLUSTER_NAME=hostuk-galera + - WSREP_CLUSTER_ADDRESS=${GALERA_CLUSTER_ADDRESS:-gcomm://} + - WSREP_NODE_ADDRESS=${GALERA_NODE_ADDRESS} + - WSREP_NODE_NAME=${GALERA_NODE_NAME} + - WSREP_SST_METHOD=mariabackup + command: > + --wsrep-on=ON + --wsrep-provider=/usr/lib/galera/libgalera_smm.so + --wsrep-cluster-name=hostuk-galera + --wsrep-cluster-address=${GALERA_CLUSTER_ADDRESS:-gcomm://} + --wsrep-node-address=${GALERA_NODE_ADDRESS} + --wsrep-node-name=${GALERA_NODE_NAME} + --wsrep-sst-method=mariabackup + --binlog-format=ROW + --default-storage-engine=InnoDB + --innodb-autoinc-lock-mode=2 + --innodb-buffer-pool-size=1G + --innodb-log-file-size=256M + --character-set-server=utf8mb4 + --collation-server=utf8mb4_unicode_ci + volumes: + - galera-data:/var/lib/mysql + ports: + - "${GALERA_PORT:-3306}:3306" + - "4567:4567" + - "4568:4568" + - "4444:4444" + healthcheck: + test: ["CMD-SHELL", "mariadb -u root -p${DB_ROOT_PASSWORD} -e 'SHOW STATUS LIKE \"wsrep_ready\"' | grep -q ON"] + interval: 30s + timeout: 10s + start_period: 60s + retries: 5 + networks: + - app-net + +volumes: + app-storage: + redis-data: + galera-data: + +networks: + app-net: + driver: bridge diff --git a/docker/nginx/default.conf b/docker/nginx/default.conf new file mode 100644 index 00000000..b05018e4 --- /dev/null +++ b/docker/nginx/default.conf @@ -0,0 +1,59 @@ +# Host UK Nginx Configuration +# Proxies PHP to the app (FPM) container, serves static files directly + +server { + listen 80; + server_name _; + + root /app/public; + index index.php; + + charset utf-8; + + # Security headers + include /etc/nginx/snippets/security-headers.conf; + + # Health check endpoint (no logging) + location = /health { + access_log off; + try_files $uri /index.php?$query_string; + } + + # Static file caching + location ~* \.(css|js|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot|webp|avif)$ { + expires 1y; + add_header Cache-Control "public, immutable"; + access_log off; + try_files $uri =404; + } + + # Laravel application + location / { + try_files $uri $uri/ /index.php?$query_string; + } + + # PHP-FPM upstream + location ~ \.php$ { + fastcgi_pass app:9000; + fastcgi_param SCRIPT_FILENAME $realpath_root$fastcgi_script_name; + include fastcgi_params; + + fastcgi_hide_header X-Powered-By; + fastcgi_buffer_size 32k; + fastcgi_buffers 16 16k; + fastcgi_read_timeout 300; + + # Pass the client IP forwarded by the LB (X-Forwarded-For header) + fastcgi_param REMOTE_ADDR $http_x_forwarded_for; + } + + # Block dotfiles (except .well-known) + location ~ /\.(?!well-known) { + deny all; + } + + # Block access to sensitive files + location ~* \.(env|log|yaml|yml|toml|lock|bak|sql)$ { + deny all; + } +} diff --git a/docker/nginx/security-headers.conf b/docker/nginx/security-headers.conf new file mode 100644 index 00000000..3917d7a2 --- /dev/null +++
b/docker/nginx/security-headers.conf @@ -0,0 +1,6 @@ +# Security headers for Host UK +add_header X-Frame-Options "SAMEORIGIN" always; +add_header X-Content-Type-Options "nosniff" always; +add_header X-XSS-Protection "1; mode=block" always; +add_header Referrer-Policy "strict-origin-when-cross-origin" always; +add_header Permissions-Policy "camera=(), microphone=(), geolocation=(), payment=()" always; diff --git a/docker/php/opcache.ini b/docker/php/opcache.ini new file mode 100644 index 00000000..61a65c18 --- /dev/null +++ b/docker/php/opcache.ini @@ -0,0 +1,10 @@ +; OPcache configuration for production +opcache.enable=1 +opcache.memory_consumption=256 +opcache.interned_strings_buffer=16 +opcache.max_accelerated_files=20000 +opcache.validate_timestamps=0 +opcache.save_comments=1 +; opcache.fast_shutdown was removed in PHP 7.2 and is not needed on 8.3 +opcache.jit_buffer_size=128M +opcache.jit=1255 diff --git a/docker/php/php-fpm.conf b/docker/php/php-fpm.conf new file mode 100644 index 00000000..c19e21c9 --- /dev/null +++ b/docker/php/php-fpm.conf @@ -0,0 +1,22 @@ +; Host UK PHP-FPM pool configuration +[www] +pm = dynamic +pm.max_children = 50 +pm.start_servers = 10 +pm.min_spare_servers = 5 +pm.max_spare_servers = 20 +pm.max_requests = 1000 +pm.process_idle_timeout = 10s + +; Status page for health checks +pm.status_path = /fpm-status +ping.path = /fpm-ping +ping.response = pong + +; Logging +access.log = /proc/self/fd/2 +slowlog = /proc/self/fd/2 +request_slowlog_timeout = 5s + +; Security +security.limit_extensions = .php diff --git a/infra.yaml b/infra.yaml new file mode 100644 index 00000000..8afc7b2d --- /dev/null +++ b/infra.yaml @@ -0,0 +1,268 @@ +# Infrastructure Configuration — Host UK Production +# This file is the source of truth for production topology. +# Used by: core prod status, core prod setup, core deploy ansible + +# --- Hosts --- +hosts: + noc: + fqdn: noc.host.uk.com + ip: 77.42.42.205 + private_ip: 10.0.0.4 + type: hcloud + role: bastion + ssh: + user: root + key: ~/.ssh/hostuk + port: 22 + services: + - coolify + + de: + fqdn: de.host.uk.com + ip: 116.202.82.115 + type: hrobot + role: app + ssh: + user: root + key: ~/.ssh/hostuk + port: 22 + services: + - traefik + - app + - web + - horizon + - scheduler + - mcp + - redis + - galera + + de2: + fqdn: de2.host.uk.com + ip: 88.99.195.41 + type: hrobot + role: app + ssh: + user: root + key: ~/.ssh/hostuk + port: 22 + services: + - traefik + - app + - web + - horizon + - scheduler + - mcp + - redis + - galera + + build: + fqdn: build.de.host.uk.com + ip: 46.224.93.62 + private_ip: 10.0.0.5 + type: hcloud + role: builder + ssh: + user: root + key: ~/.ssh/hostuk + port: 22 + services: + - forgejo-runner + +# --- Load Balancer --- +load_balancer: + name: hermes + fqdn: hermes.lb.host.uk.com + provider: hetzner + type: lb11 + location: fsn1 + algorithm: round_robin + backends: + - host: de + port: 80 + - host: de2 + port: 80 + health_check: + protocol: http + path: /health + interval: 15 + listeners: + - frontend: 443 + backend: 80 + protocol: https + proxy_protocol: true + ssl: + certificate: "*.host.uk.com" + san: + - host.uk.com + +# --- Private Network --- +network: + cidr: 10.0.0.0/16 + name: host-uk-internal + +# --- DNS --- +dns: + provider: cloudns + nameservers: + - ns1.lthn.io + - ns2.lthn.io + - ns3.lthn.io + - ns4.lthn.io + zones: + host.uk.com: + records: + - name: "@" + type: A + value: "{{.lb_ip}}" + ttl: 300 + - name: "*" + type: CNAME + value: hermes.lb.host.uk.com + ttl: 300 + - name: hermes.lb + type: A + value: "{{.lb_ip}}" + ttl: 300 + - name: noc +
type: A + value: 77.42.42.205 + ttl: 300 + - name: de + type: A + value: 116.202.82.115 + ttl: 300 + - name: de2 + type: A + value: 88.99.195.41 + ttl: 300 + - name: build.de + type: A + value: 46.224.93.62 + ttl: 300 + +# --- SSL --- +ssl: + wildcard: + domains: + - "*.host.uk.com" + - host.uk.com + method: dns-01 + dns_provider: cloudns + termination: load_balancer + +# --- Database --- +database: + engine: mariadb + version: "11" + cluster: galera + nodes: + - host: de + port: 3306 + - host: de2 + port: 3306 + sst_method: mariabackup + backup: + schedule: "0 3 * * *" + destination: s3 + bucket: hostuk + prefix: backup/galera/ + +# --- Cache --- +cache: + engine: redis + version: "7" + sentinel: true + nodes: + - host: de + port: 6379 + - host: de2 + port: 6379 + +# --- Containers (per app server) --- +containers: + app: + image: host-uk/app:latest + port: 9000 + runtime: php-fpm + replicas: 1 + + web: + image: host-uk/web:latest + port: 80 + runtime: nginx + depends_on: [app] + + horizon: + image: host-uk/app:latest + command: php artisan horizon + replicas: 1 + + scheduler: + image: host-uk/app:latest + command: php artisan schedule:work + replicas: 1 + + mcp: + image: host-uk/core:latest + port: 9000 + command: core mcp serve + replicas: 1 + +# --- Object Storage --- +s3: + endpoint: fsn1.your-objectstorage.com + buckets: + hostuk: + purpose: infra + paths: + - backup/galera/ + - backup/coolify/ + - backup/certs/ + host-uk: + purpose: media + paths: + - uploads/ + - assets/ + +# --- CDN --- +cdn: + provider: bunnycdn + origin: hermes.lb.host.uk.com + zones: + - "*.host.uk.com" + +# --- CI/CD --- +cicd: + provider: forgejo + url: https://gitea.snider.dev + runner: build.de + registry: gitea.snider.dev + deploy_hook: coolify + +# --- Monitoring --- +monitoring: + health_endpoints: + - url: https://host.uk.com/health + interval: 60 + - url: https://bio.host.uk.com/health + interval: 60 + alerts: + galera_cluster_size: 2 + redis_sentinel_quorum: 2 + +# --- Backups --- +backups: + daily: + - name: galera + type: mysqldump + destination: s3://hostuk/backup/galera/ + - name: coolify + type: tar + destination: s3://hostuk/backup/coolify/ + - name: certs + type: tar + destination: s3://hostuk/backup/certs/ + weekly: + - name: snapshot + type: hcloud-snapshot + hosts: [noc, build] diff --git a/internal/cmd/prod/cmd_commands.go b/internal/cmd/prod/cmd_commands.go new file mode 100644 index 00000000..e6e78a13 --- /dev/null +++ b/internal/cmd/prod/cmd_commands.go @@ -0,0 +1,15 @@ +package prod + +import ( + "github.com/host-uk/core/pkg/cli" + "github.com/spf13/cobra" +) + +func init() { + cli.RegisterCommands(AddProdCommands) +} + +// AddProdCommands registers the 'prod' command and all subcommands. +func AddProdCommands(root *cobra.Command) { + root.AddCommand(Cmd) +} diff --git a/internal/cmd/prod/cmd_dns.go b/internal/cmd/prod/cmd_dns.go new file mode 100644 index 00000000..1ce9364a --- /dev/null +++ b/internal/cmd/prod/cmd_dns.go @@ -0,0 +1,129 @@ +package prod + +import ( + "context" + "fmt" + "os" + "time" + + "github.com/host-uk/core/pkg/cli" + "github.com/host-uk/core/pkg/infra" + "github.com/spf13/cobra" +) + +var dnsCmd = &cobra.Command{ + Use: "dns", + Short: "Manage DNS records via CloudNS", + Long: `View and manage DNS records for host.uk.com via CloudNS API. 
+ +Requires: + CLOUDNS_AUTH_ID CloudNS auth ID + CLOUDNS_AUTH_PASSWORD CloudNS auth password`, +} + +var dnsListCmd = &cobra.Command{ + Use: "list [zone]", + Short: "List DNS records", + Args: cobra.MaximumNArgs(1), + RunE: runDNSList, +} + +var dnsSetCmd = &cobra.Command{ + Use: "set <host> <type> <value>", + Short: "Create or update a DNS record", + Long: `Create or update a DNS record. Example: + core prod dns set hermes.lb A 1.2.3.4 + core prod dns set "*.host.uk.com" CNAME hermes.lb.host.uk.com`, + Args: cobra.ExactArgs(3), + RunE: runDNSSet, +} + +var ( + dnsZone string + dnsTTL int +) + +func init() { + dnsCmd.PersistentFlags().StringVar(&dnsZone, "zone", "host.uk.com", "DNS zone") + + dnsSetCmd.Flags().IntVar(&dnsTTL, "ttl", 300, "Record TTL in seconds") + + dnsCmd.AddCommand(dnsListCmd) + dnsCmd.AddCommand(dnsSetCmd) +} + +func getDNSClient() (*infra.CloudNSClient, error) { + authID := os.Getenv("CLOUDNS_AUTH_ID") + authPass := os.Getenv("CLOUDNS_AUTH_PASSWORD") + if authID == "" || authPass == "" { + return nil, fmt.Errorf("CLOUDNS_AUTH_ID and CLOUDNS_AUTH_PASSWORD required") + } + return infra.NewCloudNSClient(authID, authPass), nil +} + +func runDNSList(cmd *cobra.Command, args []string) error { + dns, err := getDNSClient() + if err != nil { + return err + } + + zone := dnsZone + if len(args) > 0 { + zone = args[0] + } + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + records, err := dns.ListRecords(ctx, zone) + if err != nil { + return fmt.Errorf("list records: %w", err) + } + + cli.Print("%s DNS records for %s\n\n", cli.BoldStyle.Render("▶"), cli.TitleStyle.Render(zone)) + + if len(records) == 0 { + cli.Print(" No records found\n") + return nil + } + + for id, r := range records { + cli.Print(" %s %-6s %-30s %s TTL:%s\n", + cli.DimStyle.Render(id), + cli.BoldStyle.Render(r.Type), + r.Host, + r.Record, + r.TTL) + } + + return nil +} + +func runDNSSet(cmd *cobra.Command, args []string) error { + dns, err := getDNSClient() + if err != nil { + return err + } + + host := args[0] + recordType := args[1] + value := args[2] + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + changed, err := dns.EnsureRecord(ctx, dnsZone, host, recordType, value, dnsTTL) + if err != nil { + return fmt.Errorf("set record: %w", err) + } + + if changed { + cli.Print("%s %s %s %s -> %s\n", + cli.SuccessStyle.Render("✓"), + recordType, host, dnsZone, value) + } else { + cli.Print("%s Record already correct\n", cli.DimStyle.Render("·")) + } + + return nil +} diff --git a/internal/cmd/prod/cmd_lb.go b/internal/cmd/prod/cmd_lb.go new file mode 100644 index 00000000..59882957 --- /dev/null +++ b/internal/cmd/prod/cmd_lb.go @@ -0,0 +1,113 @@ +package prod + +import ( + "context" + "fmt" + "os" + "time" + + "github.com/host-uk/core/pkg/cli" + "github.com/host-uk/core/pkg/infra" + "github.com/spf13/cobra" +) + +var lbCmd = &cobra.Command{ + Use: "lb", + Short: "Manage Hetzner load balancer", + Long: `View and manage the Hetzner Cloud managed load balancer.
+ +Requires: HCLOUD_TOKEN`, +} + +var lbStatusCmd = &cobra.Command{ + Use: "status", + Short: "Show load balancer status and target health", + RunE: runLBStatus, +} + +var lbCreateCmd = &cobra.Command{ + Use: "create", + Short: "Create load balancer from infra.yaml", + RunE: runLBCreate, +} + +func init() { + lbCmd.AddCommand(lbStatusCmd) + lbCmd.AddCommand(lbCreateCmd) +} + +func getHCloudClient() (*infra.HCloudClient, error) { + token := os.Getenv("HCLOUD_TOKEN") + if token == "" { + return nil, fmt.Errorf("HCLOUD_TOKEN environment variable required") + } + return infra.NewHCloudClient(token), nil +} + +func runLBStatus(cmd *cobra.Command, args []string) error { + hc, err := getHCloudClient() + if err != nil { + return err + } + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + lbs, err := hc.ListLoadBalancers(ctx) + if err != nil { + return fmt.Errorf("list load balancers: %w", err) + } + + if len(lbs) == 0 { + cli.Print("No load balancers found\n") + return nil + } + + for _, lb := range lbs { + cli.Print("%s %s\n", cli.BoldStyle.Render("▶"), cli.TitleStyle.Render(lb.Name)) + cli.Print(" ID: %d\n", lb.ID) + cli.Print(" IP: %s\n", lb.PublicNet.IPv4.IP) + cli.Print(" Algorithm: %s\n", lb.Algorithm.Type) + cli.Print(" Location: %s\n", lb.Location.Name) + + if len(lb.Services) > 0 { + cli.Print("\n Services:\n") + for _, s := range lb.Services { + cli.Print(" %s :%d -> :%d proxy_protocol=%v\n", + s.Protocol, s.ListenPort, s.DestinationPort, s.Proxyprotocol) + } + } + + if len(lb.Targets) > 0 { + cli.Print("\n Targets:\n") + for _, t := range lb.Targets { + ip := "" + if t.IP != nil { + ip = t.IP.IP + } + for _, hs := range t.HealthStatus { + icon := cli.SuccessStyle.Render("●") + if hs.Status != "healthy" { + icon = cli.ErrorStyle.Render("○") + } + cli.Print(" %s %s :%d %s\n", icon, ip, hs.ListenPort, hs.Status) + } + } + } + fmt.Println() + } + + return nil +} + +func runLBCreate(cmd *cobra.Command, args []string) error { + cfg, _, err := loadConfig() + if err != nil { + return err + } + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) + defer cancel() + + return stepLoadBalancer(ctx, cfg) +} diff --git a/internal/cmd/prod/cmd_prod.go b/internal/cmd/prod/cmd_prod.go new file mode 100644 index 00000000..6489654d --- /dev/null +++ b/internal/cmd/prod/cmd_prod.go @@ -0,0 +1,35 @@ +package prod + +import ( + "github.com/spf13/cobra" +) + +var ( + infraFile string +) + +// Cmd is the root prod command. +var Cmd = &cobra.Command{ + Use: "prod", + Short: "Production infrastructure management", + Long: `Manage the Host UK production infrastructure. 
+ +Commands: + status Show infrastructure health and connectivity + setup Phase 1: discover topology, create LB, configure DNS + dns Manage DNS records via CloudNS + lb Manage Hetzner load balancer + ssh SSH into a production host + +Configuration is read from infra.yaml in the project root.`, +} + +func init() { + Cmd.PersistentFlags().StringVar(&infraFile, "config", "", "Path to infra.yaml (auto-discovered if not set)") + + Cmd.AddCommand(statusCmd) + Cmd.AddCommand(setupCmd) + Cmd.AddCommand(dnsCmd) + Cmd.AddCommand(lbCmd) + Cmd.AddCommand(sshCmd) +} diff --git a/internal/cmd/prod/cmd_setup.go b/internal/cmd/prod/cmd_setup.go new file mode 100644 index 00000000..a93455fb --- /dev/null +++ b/internal/cmd/prod/cmd_setup.go @@ -0,0 +1,284 @@ +package prod + +import ( + "context" + "fmt" + "os" + "time" + + "github.com/host-uk/core/pkg/cli" + "github.com/host-uk/core/pkg/infra" + "github.com/spf13/cobra" +) + +var setupCmd = &cobra.Command{ + Use: "setup", + Short: "Phase 1: discover topology, create LB, configure DNS", + Long: `Run the Phase 1 foundation setup: + + 1. Discover Hetzner topology (Cloud + Robot servers) + 2. Create Hetzner managed load balancer + 3. Configure DNS records via CloudNS + 4. Verify connectivity to all hosts + +Required environment variables: + HCLOUD_TOKEN Hetzner Cloud API token + HETZNER_ROBOT_USER Hetzner Robot username + HETZNER_ROBOT_PASS Hetzner Robot password + CLOUDNS_AUTH_ID CloudNS auth ID + CLOUDNS_AUTH_PASSWORD CloudNS auth password`, + RunE: runSetup, +} + +var ( + setupDryRun bool + setupStep string +) + +func init() { + setupCmd.Flags().BoolVar(&setupDryRun, "dry-run", false, "Show what would be done without making changes") + setupCmd.Flags().StringVar(&setupStep, "step", "", "Run a specific step only (discover, lb, dns)") +} + +func runSetup(cmd *cobra.Command, args []string) error { + cfg, cfgPath, err := loadConfig() + if err != nil { + return err + } + + cli.Print("%s Production setup from %s\n\n", + cli.BoldStyle.Render("▶"), + cli.DimStyle.Render(cfgPath)) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + + steps := []struct { + name string + fn func(context.Context, *infra.Config) error + }{ + {"discover", stepDiscover}, + {"lb", stepLoadBalancer}, + {"dns", stepDNS}, + } + + for _, step := range steps { + if setupStep != "" && setupStep != step.name { + continue + } + + cli.Print("\n%s Step: %s\n", cli.BoldStyle.Render("━━"), cli.TitleStyle.Render(step.name)) + + if err := step.fn(ctx, cfg); err != nil { + cli.Print(" %s %s: %s\n", cli.ErrorStyle.Render("✗"), step.name, err) + return fmt.Errorf("step %s failed: %w", step.name, err) + } + + cli.Print(" %s %s complete\n", cli.SuccessStyle.Render("✓"), step.name) + } + + cli.Print("\n%s Setup complete\n", cli.SuccessStyle.Render("✓")) + return nil +} + +func stepDiscover(ctx context.Context, cfg *infra.Config) error { + // Discover HCloud servers + hcloudToken := os.Getenv("HCLOUD_TOKEN") + if hcloudToken != "" { + cli.Print(" Discovering Hetzner Cloud servers...\n") + + hc := infra.NewHCloudClient(hcloudToken) + servers, err := hc.ListServers(ctx) + if err != nil { + return fmt.Errorf("list HCloud servers: %w", err) + } + + for _, s := range servers { + cli.Print(" %s %s %s %s %s\n", + cli.SuccessStyle.Render("●"), + cli.BoldStyle.Render(s.Name), + s.PublicNet.IPv4.IP, + s.ServerType.Name, + cli.DimStyle.Render(s.Datacenter.Name)) + } + } else { + cli.Print(" %s HCLOUD_TOKEN not set — skipping Cloud discovery\n", + cli.WarningStyle.Render("⚠")) + 
} + + // Discover Robot servers + robotUser := os.Getenv("HETZNER_ROBOT_USER") + robotPass := os.Getenv("HETZNER_ROBOT_PASS") + if robotUser != "" && robotPass != "" { + cli.Print(" Discovering Hetzner Robot servers...\n") + + hr := infra.NewHRobotClient(robotUser, robotPass) + servers, err := hr.ListServers(ctx) + if err != nil { + return fmt.Errorf("list Robot servers: %w", err) + } + + for _, s := range servers { + status := cli.SuccessStyle.Render("●") + if s.Status != "ready" { + status = cli.WarningStyle.Render("○") + } + cli.Print(" %s %s %s %s %s\n", + status, + cli.BoldStyle.Render(s.ServerName), + s.ServerIP, + s.Product, + cli.DimStyle.Render(s.Datacenter)) + } + } else { + cli.Print(" %s HETZNER_ROBOT_USER/PASS not set — skipping Robot discovery\n", + cli.WarningStyle.Render("⚠")) + } + + return nil +} + +func stepLoadBalancer(ctx context.Context, cfg *infra.Config) error { + hcloudToken := os.Getenv("HCLOUD_TOKEN") + if hcloudToken == "" { + return fmt.Errorf("HCLOUD_TOKEN required for load balancer management") + } + + hc := infra.NewHCloudClient(hcloudToken) + + // Check if LB already exists + lbs, err := hc.ListLoadBalancers(ctx) + if err != nil { + return fmt.Errorf("list load balancers: %w", err) + } + + for _, lb := range lbs { + if lb.Name == cfg.LoadBalancer.Name { + cli.Print(" Load balancer '%s' already exists (ID: %d, IP: %s)\n", + lb.Name, lb.ID, lb.PublicNet.IPv4.IP) + return nil + } + } + + if setupDryRun { + cli.Print(" [dry-run] Would create load balancer '%s' (%s) in %s\n", + cfg.LoadBalancer.Name, cfg.LoadBalancer.Type, cfg.LoadBalancer.Location) + for _, b := range cfg.LoadBalancer.Backends { + if host, ok := cfg.Hosts[b.Host]; ok { + cli.Print(" [dry-run] Backend: %s (%s:%d)\n", b.Host, host.IP, b.Port) + } + } + return nil + } + + // Build targets from config + targets := make([]infra.HCloudLBCreateTarget, 0, len(cfg.LoadBalancer.Backends)) + for _, b := range cfg.LoadBalancer.Backends { + host, ok := cfg.Hosts[b.Host] + if !ok { + return fmt.Errorf("backend host '%s' not found in config", b.Host) + } + targets = append(targets, infra.HCloudLBCreateTarget{ + Type: "ip", + IP: &infra.HCloudLBTargetIP{IP: host.IP}, + }) + } + + // Build services + services := make([]infra.HCloudLBService, 0, len(cfg.LoadBalancer.Listeners)) + for _, l := range cfg.LoadBalancer.Listeners { + svc := infra.HCloudLBService{ + Protocol: l.Protocol, + ListenPort: l.Frontend, + DestinationPort: l.Backend, + Proxyprotocol: l.ProxyProtocol, + HealthCheck: &infra.HCloudLBHealthCheck{ + Protocol: cfg.LoadBalancer.Health.Protocol, + Port: l.Backend, + Interval: cfg.LoadBalancer.Health.Interval, + Timeout: 10, + Retries: 3, + HTTP: &infra.HCloudLBHCHTTP{ + Path: cfg.LoadBalancer.Health.Path, + StatusCode: "2??", + }, + }, + } + services = append(services, svc) + } + + req := infra.HCloudLBCreateRequest{ + Name: cfg.LoadBalancer.Name, + LoadBalancerType: cfg.LoadBalancer.Type, + Location: cfg.LoadBalancer.Location, + Algorithm: infra.HCloudLBAlgorithm{Type: cfg.LoadBalancer.Algorithm}, + Services: services, + Targets: targets, + Labels: map[string]string{ + "project": "host-uk", + "managed": "core-cli", + }, + } + + cli.Print(" Creating load balancer '%s'...\n", cfg.LoadBalancer.Name) + + lb, err := hc.CreateLoadBalancer(ctx, req) + if err != nil { + return fmt.Errorf("create load balancer: %w", err) + } + + cli.Print(" Created: %s (ID: %d, IP: %s)\n", + cli.BoldStyle.Render(lb.Name), lb.ID, lb.PublicNet.IPv4.IP) + + return nil +} + +func stepDNS(ctx context.Context, cfg *infra.Config) 
error { + authID := os.Getenv("CLOUDNS_AUTH_ID") + authPass := os.Getenv("CLOUDNS_AUTH_PASSWORD") + if authID == "" || authPass == "" { + return fmt.Errorf("CLOUDNS_AUTH_ID and CLOUDNS_AUTH_PASSWORD required") + } + + dns := infra.NewCloudNSClient(authID, authPass) + + for zoneName, zone := range cfg.DNS.Zones { + cli.Print(" Zone: %s\n", cli.BoldStyle.Render(zoneName)) + + for _, rec := range zone.Records { + value := rec.Value + // Skip templated values (need LB IP first) + if value == "{{.lb_ip}}" { + cli.Print(" %s %s %s %s — %s\n", + cli.WarningStyle.Render("⚠"), + rec.Name, rec.Type, value, + cli.DimStyle.Render("needs LB IP (run setup --step=lb first)")) + continue + } + + if setupDryRun { + cli.Print(" [dry-run] %s %s -> %s (TTL: %d)\n", + rec.Type, rec.Name, value, rec.TTL) + continue + } + + changed, err := dns.EnsureRecord(ctx, zoneName, rec.Name, rec.Type, value, rec.TTL) + if err != nil { + cli.Print(" %s %s %s: %s\n", cli.ErrorStyle.Render("✗"), rec.Type, rec.Name, err) + continue + } + + if changed { + cli.Print(" %s %s %s -> %s\n", + cli.SuccessStyle.Render("✓"), + rec.Type, rec.Name, value) + } else { + cli.Print(" %s %s %s (no change)\n", + cli.DimStyle.Render("·"), + rec.Type, rec.Name) + } + } + } + + return nil +} diff --git a/internal/cmd/prod/cmd_ssh.go b/internal/cmd/prod/cmd_ssh.go new file mode 100644 index 00000000..f39e22c6 --- /dev/null +++ b/internal/cmd/prod/cmd_ssh.go @@ -0,0 +1,64 @@ +package prod + +import ( + "fmt" + "os" + "os/exec" + "syscall" + + "github.com/host-uk/core/pkg/cli" + "github.com/spf13/cobra" +) + +var sshCmd = &cobra.Command{ + Use: "ssh <host>", + Short: "SSH into a production host", + Long: `Open an SSH session to a production host defined in infra.yaml. + +Examples: + core prod ssh noc + core prod ssh de + core prod ssh de2 + core prod ssh build`, + Args: cobra.ExactArgs(1), + RunE: runSSH, +} + +func runSSH(cmd *cobra.Command, args []string) error { + cfg, _, err := loadConfig() + if err != nil { + return err + } + + name := args[0] + host, ok := cfg.Hosts[name] + if !ok { + // List available hosts + cli.Print("Unknown host '%s'. Available:\n", name) + for n, h := range cfg.Hosts { + cli.Print(" %s %s (%s)\n", cli.BoldStyle.Render(n), h.IP, h.Role) + } + return fmt.Errorf("host '%s' not found in infra.yaml", name) + } + + sshArgs := []string{ + "ssh", + "-i", host.SSH.Key, + "-p", fmt.Sprintf("%d", host.SSH.Port), + "-o", "StrictHostKeyChecking=accept-new", + fmt.Sprintf("%s@%s", host.SSH.User, host.IP), + } + + cli.Print("%s %s@%s (%s)\n", + cli.BoldStyle.Render("▶"), + host.SSH.User, host.FQDN, + cli.DimStyle.Render(host.IP)) + + sshPath, err := exec.LookPath("ssh") + if err != nil { + return fmt.Errorf("ssh not found: %w", err) + } + + // Replace current process with SSH + return syscall.Exec(sshPath, sshArgs, os.Environ()) +} diff --git a/internal/cmd/prod/cmd_status.go b/internal/cmd/prod/cmd_status.go new file mode 100644 index 00000000..8a7ee3a5 --- /dev/null +++ b/internal/cmd/prod/cmd_status.go @@ -0,0 +1,325 @@ +package prod + +import ( + "context" + "fmt" + "os" + "strings" + "sync" + "time" + + "github.com/host-uk/core/pkg/ansible" + "github.com/host-uk/core/pkg/cli" + "github.com/host-uk/core/pkg/infra" + "github.com/spf13/cobra" +) + +var statusCmd = &cobra.Command{ + Use: "status", + Short: "Show production infrastructure health", + Long: `Check connectivity, services, and cluster health across all production hosts.
+ +Tests: + - SSH connectivity to all hosts + - Docker daemon status + - Coolify controller (noc) + - Galera cluster state (de, de2) + - Redis Sentinel status (de, de2) + - Load balancer health (if HCLOUD_TOKEN set)`, + RunE: runStatus, +} + +type hostStatus struct { + Name string + Host *infra.Host + Connected bool + ConnTime time.Duration + OS string + Docker string + Services map[string]string + Error error +} + +func runStatus(cmd *cobra.Command, args []string) error { + cfg, cfgPath, err := loadConfig() + if err != nil { + return err + } + + cli.Print("%s Infrastructure status from %s\n\n", + cli.BoldStyle.Render("▶"), + cli.DimStyle.Render(cfgPath)) + + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + + // Check all hosts in parallel + var ( + wg sync.WaitGroup + mu sync.Mutex + statuses []hostStatus + ) + + for name, host := range cfg.Hosts { + wg.Add(1) + go func(name string, host *infra.Host) { + defer wg.Done() + s := checkHost(ctx, name, host) + mu.Lock() + statuses = append(statuses, s) + mu.Unlock() + }(name, host) + } + + wg.Wait() + + // Print results in consistent order + order := []string{"noc", "de", "de2", "build"} + for _, name := range order { + for _, s := range statuses { + if s.Name == name { + printHostStatus(s) + break + } + } + } + + // Check LB if token available + if token := os.Getenv("HCLOUD_TOKEN"); token != "" { + fmt.Println() + checkLoadBalancer(ctx, token) + } else { + fmt.Println() + cli.Print("%s Load balancer: %s\n", + cli.DimStyle.Render(" ○"), + cli.DimStyle.Render("HCLOUD_TOKEN not set (skipped)")) + } + + return nil +} + +func checkHost(ctx context.Context, name string, host *infra.Host) hostStatus { + s := hostStatus{ + Name: name, + Host: host, + Services: make(map[string]string), + } + + sshCfg := ansible.SSHConfig{ + Host: host.IP, + Port: host.SSH.Port, + User: host.SSH.User, + KeyFile: host.SSH.Key, + Timeout: 15 * time.Second, + } + + client, err := ansible.NewSSHClient(sshCfg) + if err != nil { + s.Error = fmt.Errorf("create SSH client: %w", err) + return s + } + defer func() { _ = client.Close() }() + + start := time.Now() + if err := client.Connect(ctx); err != nil { + s.Error = fmt.Errorf("SSH connect: %w", err) + return s + } + s.Connected = true + s.ConnTime = time.Since(start) + + // OS info + stdout, _, _, _ := client.Run(ctx, "cat /etc/os-release 2>/dev/null | grep PRETTY_NAME | cut -d'\"' -f2") + s.OS = strings.TrimSpace(stdout) + + // Docker + stdout, _, _, err = client.Run(ctx, "docker --version 2>/dev/null | head -1") + if err == nil && stdout != "" { + s.Docker = strings.TrimSpace(stdout) + } + + // Check each expected service + for _, svc := range host.Services { + status := checkService(ctx, client, svc) + s.Services[svc] = status + } + + return s +} + +func checkService(ctx context.Context, client *ansible.SSHClient, service string) string { + switch service { + case "coolify": + stdout, _, _, _ := client.Run(ctx, "docker ps --format '{{.Names}}' 2>/dev/null | grep -c coolify") + if strings.TrimSpace(stdout) != "0" && strings.TrimSpace(stdout) != "" { + return "running" + } + return "not running" + + case "traefik": + stdout, _, _, _ := client.Run(ctx, "docker ps --format '{{.Names}}' 2>/dev/null | grep -c traefik") + if strings.TrimSpace(stdout) != "0" && strings.TrimSpace(stdout) != "" { + return "running" + } + return "not running" + + case "galera": + // Check Galera cluster state + stdout, _, _, _ := client.Run(ctx, + "docker exec $(docker ps -q --filter name=mariadb 
2>/dev/null || echo none) "+ + "mariadb -u root -e \"SHOW STATUS LIKE 'wsrep_cluster_size'\" --skip-column-names 2>/dev/null | awk '{print $2}'") + size := strings.TrimSpace(stdout) + if size != "" && size != "0" { + return fmt.Sprintf("cluster_size=%s", size) + } + // Try non-Docker + stdout, _, _, _ = client.Run(ctx, + "mariadb -u root -e \"SHOW STATUS LIKE 'wsrep_cluster_size'\" --skip-column-names 2>/dev/null | awk '{print $2}'") + size = strings.TrimSpace(stdout) + if size != "" && size != "0" { + return fmt.Sprintf("cluster_size=%s", size) + } + return "not running" + + case "redis": + stdout, _, _, _ := client.Run(ctx, + "docker exec $(docker ps -q --filter name=redis 2>/dev/null || echo none) "+ + "redis-cli ping 2>/dev/null") + if strings.TrimSpace(stdout) == "PONG" { + return "running" + } + stdout, _, _, _ = client.Run(ctx, "redis-cli ping 2>/dev/null") + if strings.TrimSpace(stdout) == "PONG" { + return "running" + } + return "not running" + + case "forgejo-runner": + stdout, _, _, _ := client.Run(ctx, "systemctl is-active forgejo-runner 2>/dev/null || docker ps --format '{{.Names}}' 2>/dev/null | grep -c runner") + val := strings.TrimSpace(stdout) + if val == "active" || (val != "0" && val != "") { + return "running" + } + return "not running" + + default: + // Generic docker container check + stdout, _, _, _ := client.Run(ctx, + fmt.Sprintf("docker ps --format '{{.Names}}' 2>/dev/null | grep -c %s", service)) + if strings.TrimSpace(stdout) != "0" && strings.TrimSpace(stdout) != "" { + return "running" + } + return "not running" + } +} + +func printHostStatus(s hostStatus) { + // Host header + roleStyle := cli.DimStyle + switch s.Host.Role { + case "app": + roleStyle = cli.SuccessStyle + case "bastion": + roleStyle = cli.WarningStyle + case "builder": + roleStyle = cli.InfoStyle + } + + cli.Print(" %s %s %s %s\n", + cli.BoldStyle.Render(s.Name), + cli.DimStyle.Render(s.Host.IP), + roleStyle.Render(s.Host.Role), + cli.DimStyle.Render(s.Host.FQDN)) + + if s.Error != nil { + cli.Print(" %s %s\n", cli.ErrorStyle.Render("✗"), s.Error) + return + } + + if !s.Connected { + cli.Print(" %s SSH unreachable\n", cli.ErrorStyle.Render("✗")) + return + } + + // Connection info + cli.Print(" %s SSH %s", + cli.SuccessStyle.Render("✓"), + cli.DimStyle.Render(s.ConnTime.Round(time.Millisecond).String())) + if s.OS != "" { + cli.Print(" %s", cli.DimStyle.Render(s.OS)) + } + fmt.Println() + + if s.Docker != "" { + cli.Print(" %s %s\n", cli.SuccessStyle.Render("✓"), cli.DimStyle.Render(s.Docker)) + } + + // Services + for _, svc := range s.Host.Services { + status, ok := s.Services[svc] + if !ok { + continue + } + + icon := cli.SuccessStyle.Render("●") + style := cli.SuccessStyle + if status == "not running" { + icon = cli.ErrorStyle.Render("○") + style = cli.ErrorStyle + } + + cli.Print(" %s %s %s\n", icon, svc, style.Render(status)) + } + + fmt.Println() +} + +func checkLoadBalancer(ctx context.Context, token string) { + hc := infra.NewHCloudClient(token) + lbs, err := hc.ListLoadBalancers(ctx) + if err != nil { + cli.Print(" %s Load balancer: %s\n", cli.ErrorStyle.Render("✗"), err) + return + } + + if len(lbs) == 0 { + cli.Print(" %s No load balancers found\n", cli.DimStyle.Render("○")) + return + } + + for _, lb := range lbs { + cli.Print(" %s LB: %s IP: %s Targets: %d\n", + cli.SuccessStyle.Render("●"), + cli.BoldStyle.Render(lb.Name), + lb.PublicNet.IPv4.IP, + len(lb.Targets)) + + for _, t := range lb.Targets { + for _, hs := range t.HealthStatus { + icon := cli.SuccessStyle.Render("●") + if 
hs.Status != "healthy" { + icon = cli.ErrorStyle.Render("○") + } + ip := "" + if t.IP != nil { + ip = t.IP.IP + } + cli.Print(" %s :%d %s %s\n", icon, hs.ListenPort, hs.Status, cli.DimStyle.Render(ip)) + } + } + } +} + +func loadConfig() (*infra.Config, string, error) { + if infraFile != "" { + cfg, err := infra.Load(infraFile) + return cfg, infraFile, err + } + + cwd, err := os.Getwd() + if err != nil { + return nil, "", err + } + + return infra.Discover(cwd) +} diff --git a/internal/variants/full.go b/internal/variants/full.go index f80e34f7..fd0de167 100644 --- a/internal/variants/full.go +++ b/internal/variants/full.go @@ -22,6 +22,7 @@ // - monitor: Security monitoring aggregation // - gitea: Gitea instance management (repos, issues, PRs, mirrors) // - unifi: UniFi network management (sites, devices, clients) +// - prod: Production infrastructure management package variants @@ -45,6 +46,7 @@ import ( _ "github.com/host-uk/core/internal/cmd/php" _ "github.com/host-uk/core/internal/cmd/pkgcmd" _ "github.com/host-uk/core/internal/cmd/plugin" + _ "github.com/host-uk/core/internal/cmd/prod" _ "github.com/host-uk/core/internal/cmd/qa" _ "github.com/host-uk/core/internal/cmd/sdk" _ "github.com/host-uk/core/internal/cmd/security" diff --git a/pkg/infra/cloudns.go b/pkg/infra/cloudns.go new file mode 100644 index 00000000..dd419fe4 --- /dev/null +++ b/pkg/infra/cloudns.go @@ -0,0 +1,272 @@ +package infra + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "strconv" + "time" +) + +const cloudnsBaseURL = "https://api.cloudns.net" + +// CloudNSClient is an HTTP client for the CloudNS DNS API. +type CloudNSClient struct { + authID string + password string + client *http.Client +} + +// NewCloudNSClient creates a new CloudNS API client. +// Uses sub-auth-user (auth-id) authentication. +func NewCloudNSClient(authID, password string) *CloudNSClient { + return &CloudNSClient{ + authID: authID, + password: password, + client: &http.Client{ + Timeout: 30 * time.Second, + }, + } +} + +// CloudNSZone represents a DNS zone. +type CloudNSZone struct { + Name string `json:"name"` + Type string `json:"type"` + Zone string `json:"zone"` + Status string `json:"status"` +} + +// CloudNSRecord represents a DNS record. +type CloudNSRecord struct { + ID string `json:"id"` + Type string `json:"type"` + Host string `json:"host"` + Record string `json:"record"` + TTL string `json:"ttl"` + Priority string `json:"priority,omitempty"` + Status int `json:"status"` +} + +// ListZones returns all DNS zones. +func (c *CloudNSClient) ListZones(ctx context.Context) ([]CloudNSZone, error) { + params := c.authParams() + params.Set("page", "1") + params.Set("rows-per-page", "100") + params.Set("search", "") + + data, err := c.get(ctx, "/dns/list-zones.json", params) + if err != nil { + return nil, err + } + + var zones []CloudNSZone + if err := json.Unmarshal(data, &zones); err != nil { + // CloudNS returns an empty object {} for no results instead of [] + return nil, nil + } + return zones, nil +} + +// ListRecords returns all DNS records for a zone. 
+func (c *CloudNSClient) ListRecords(ctx context.Context, domain string) (map[string]CloudNSRecord, error) { + params := c.authParams() + params.Set("domain-name", domain) + + data, err := c.get(ctx, "/dns/records.json", params) + if err != nil { + return nil, err + } + + var records map[string]CloudNSRecord + if err := json.Unmarshal(data, &records); err != nil { + return nil, fmt.Errorf("parse records: %w", err) + } + return records, nil +} + +// CreateRecord creates a DNS record. Returns the record ID. +func (c *CloudNSClient) CreateRecord(ctx context.Context, domain, host, recordType, value string, ttl int) (string, error) { + params := c.authParams() + params.Set("domain-name", domain) + params.Set("host", host) + params.Set("record-type", recordType) + params.Set("record", value) + params.Set("ttl", strconv.Itoa(ttl)) + + data, err := c.post(ctx, "/dns/add-record.json", params) + if err != nil { + return "", err + } + + var result struct { + Status string `json:"status"` + StatusDescription string `json:"statusDescription"` + Data struct { + ID int `json:"id"` + } `json:"data"` + } + if err := json.Unmarshal(data, &result); err != nil { + return "", fmt.Errorf("parse response: %w", err) + } + + if result.Status != "Success" { + return "", fmt.Errorf("cloudns: %s", result.StatusDescription) + } + + return strconv.Itoa(result.Data.ID), nil +} + +// UpdateRecord updates an existing DNS record. +func (c *CloudNSClient) UpdateRecord(ctx context.Context, domain, recordID, host, recordType, value string, ttl int) error { + params := c.authParams() + params.Set("domain-name", domain) + params.Set("record-id", recordID) + params.Set("host", host) + params.Set("record-type", recordType) + params.Set("record", value) + params.Set("ttl", strconv.Itoa(ttl)) + + data, err := c.post(ctx, "/dns/mod-record.json", params) + if err != nil { + return err + } + + var result struct { + Status string `json:"status"` + StatusDescription string `json:"statusDescription"` + } + if err := json.Unmarshal(data, &result); err != nil { + return fmt.Errorf("parse response: %w", err) + } + + if result.Status != "Success" { + return fmt.Errorf("cloudns: %s", result.StatusDescription) + } + + return nil +} + +// DeleteRecord deletes a DNS record by ID. +func (c *CloudNSClient) DeleteRecord(ctx context.Context, domain, recordID string) error { + params := c.authParams() + params.Set("domain-name", domain) + params.Set("record-id", recordID) + + data, err := c.post(ctx, "/dns/delete-record.json", params) + if err != nil { + return err + } + + var result struct { + Status string `json:"status"` + StatusDescription string `json:"statusDescription"` + } + if err := json.Unmarshal(data, &result); err != nil { + return fmt.Errorf("parse response: %w", err) + } + + if result.Status != "Success" { + return fmt.Errorf("cloudns: %s", result.StatusDescription) + } + + return nil +} + +// EnsureRecord creates or updates a DNS record to match the desired state. +// Returns true if a change was made. 
+func (c *CloudNSClient) EnsureRecord(ctx context.Context, domain, host, recordType, value string, ttl int) (bool, error) { + records, err := c.ListRecords(ctx, domain) + if err != nil { + return false, fmt.Errorf("list records: %w", err) + } + + // Check if record already exists + for id, r := range records { + if r.Host == host && r.Type == recordType { + if r.Record == value { + return false, nil // Already correct + } + // Update existing record + if err := c.UpdateRecord(ctx, domain, id, host, recordType, value, ttl); err != nil { + return false, fmt.Errorf("update record: %w", err) + } + return true, nil + } + } + + // Create new record + if _, err := c.CreateRecord(ctx, domain, host, recordType, value, ttl); err != nil { + return false, fmt.Errorf("create record: %w", err) + } + return true, nil +} + +// SetACMEChallenge creates a DNS-01 ACME challenge TXT record. +func (c *CloudNSClient) SetACMEChallenge(ctx context.Context, domain, value string) (string, error) { + return c.CreateRecord(ctx, domain, "_acme-challenge", "TXT", value, 60) +} + +// ClearACMEChallenge removes the DNS-01 ACME challenge TXT record. +func (c *CloudNSClient) ClearACMEChallenge(ctx context.Context, domain string) error { + records, err := c.ListRecords(ctx, domain) + if err != nil { + return err + } + + for id, r := range records { + if r.Host == "_acme-challenge" && r.Type == "TXT" { + if err := c.DeleteRecord(ctx, domain, id); err != nil { + return err + } + } + } + return nil +} + +func (c *CloudNSClient) authParams() url.Values { + params := url.Values{} + params.Set("auth-id", c.authID) + params.Set("auth-password", c.password) + return params +} + +func (c *CloudNSClient) get(ctx context.Context, path string, params url.Values) ([]byte, error) { + u := cloudnsBaseURL + path + "?" + params.Encode() + req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) + if err != nil { + return nil, err + } + return c.doRaw(req) +} + +func (c *CloudNSClient) post(ctx context.Context, path string, params url.Values) ([]byte, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodPost, cloudnsBaseURL+path, nil) + if err != nil { + return nil, err + } + req.URL.RawQuery = params.Encode() + return c.doRaw(req) +} + +func (c *CloudNSClient) doRaw(req *http.Request) ([]byte, error) { + resp, err := c.client.Do(req) + if err != nil { + return nil, fmt.Errorf("cloudns API: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + data, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("read response: %w", err) + } + + if resp.StatusCode >= 400 { + return nil, fmt.Errorf("cloudns API %d: %s", resp.StatusCode, string(data)) + } + + return data, nil +} diff --git a/pkg/infra/config.go b/pkg/infra/config.go new file mode 100644 index 00000000..aaffbba0 --- /dev/null +++ b/pkg/infra/config.go @@ -0,0 +1,300 @@ +// Package infra provides infrastructure configuration and API clients +// for managing the Host UK production environment. +package infra + +import ( + "fmt" + "os" + "path/filepath" + + "gopkg.in/yaml.v3" +) + +// Config is the top-level infrastructure configuration parsed from infra.yaml. 
+type Config struct { + Hosts map[string]*Host `yaml:"hosts"` + LoadBalancer LoadBalancer `yaml:"load_balancer"` + Network Network `yaml:"network"` + DNS DNS `yaml:"dns"` + SSL SSL `yaml:"ssl"` + Database Database `yaml:"database"` + Cache Cache `yaml:"cache"` + Containers map[string]*Container `yaml:"containers"` + S3 S3Config `yaml:"s3"` + CDN CDN `yaml:"cdn"` + CICD CICD `yaml:"cicd"` + Monitoring Monitoring `yaml:"monitoring"` + Backups Backups `yaml:"backups"` +} + +// Host represents a server in the infrastructure. +type Host struct { + FQDN string `yaml:"fqdn"` + IP string `yaml:"ip"` + PrivateIP string `yaml:"private_ip,omitempty"` + Type string `yaml:"type"` // hcloud, hrobot + Role string `yaml:"role"` // bastion, app, builder + SSH SSHConf `yaml:"ssh"` + Services []string `yaml:"services"` +} + +// SSHConf holds SSH connection details for a host. +type SSHConf struct { + User string `yaml:"user"` + Key string `yaml:"key"` + Port int `yaml:"port"` +} + +// LoadBalancer represents a Hetzner managed load balancer. +type LoadBalancer struct { + Name string `yaml:"name"` + FQDN string `yaml:"fqdn"` + Provider string `yaml:"provider"` + Type string `yaml:"type"` + Location string `yaml:"location"` + Algorithm string `yaml:"algorithm"` + Backends []Backend `yaml:"backends"` + Health HealthCheck `yaml:"health_check"` + Listeners []Listener `yaml:"listeners"` + SSL LBCert `yaml:"ssl"` +} + +// Backend is a load balancer backend target. +type Backend struct { + Host string `yaml:"host"` + Port int `yaml:"port"` +} + +// HealthCheck configures load balancer health checking. +type HealthCheck struct { + Protocol string `yaml:"protocol"` + Path string `yaml:"path"` + Interval int `yaml:"interval"` +} + +// Listener maps a frontend port to a backend port. +type Listener struct { + Frontend int `yaml:"frontend"` + Backend int `yaml:"backend"` + Protocol string `yaml:"protocol"` + ProxyProtocol bool `yaml:"proxy_protocol"` +} + +// LBCert holds the SSL certificate configuration for the load balancer. +type LBCert struct { + Certificate string `yaml:"certificate"` + SAN []string `yaml:"san"` +} + +// Network describes the private network. +type Network struct { + CIDR string `yaml:"cidr"` + Name string `yaml:"name"` +} + +// DNS holds DNS provider configuration and zone records. +type DNS struct { + Provider string `yaml:"provider"` + Nameservers []string `yaml:"nameservers"` + Zones map[string]*Zone `yaml:"zones"` +} + +// Zone is a DNS zone with its records. +type Zone struct { + Records []DNSRecord `yaml:"records"` +} + +// DNSRecord is a single DNS record. +type DNSRecord struct { + Name string `yaml:"name"` + Type string `yaml:"type"` + Value string `yaml:"value"` + TTL int `yaml:"ttl"` +} + +// SSL holds SSL certificate configuration. +type SSL struct { + Wildcard WildcardCert `yaml:"wildcard"` +} + +// WildcardCert describes a wildcard SSL certificate. +type WildcardCert struct { + Domains []string `yaml:"domains"` + Method string `yaml:"method"` + DNSProvider string `yaml:"dns_provider"` + Termination string `yaml:"termination"` +} + +// Database describes the database cluster. +type Database struct { + Engine string `yaml:"engine"` + Version string `yaml:"version"` + Cluster string `yaml:"cluster"` + Nodes []DBNode `yaml:"nodes"` + SSTMethod string `yaml:"sst_method"` + Backup BackupConfig `yaml:"backup"` +} + +// DBNode is a database cluster node. +type DBNode struct { + Host string `yaml:"host"` + Port int `yaml:"port"` +} + +// BackupConfig describes automated backup settings. 
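+//
+// Example (bucket and prefix match the galera-backup playbook; the schedule
+// value is illustrative):
+//
+//	backup:
+//	  schedule: "0 3 * * *"
+//	  destination: s3
+//	  bucket: hostuk
+//	  prefix: backup/galera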
+type BackupConfig struct { + Schedule string `yaml:"schedule"` + Destination string `yaml:"destination"` + Bucket string `yaml:"bucket"` + Prefix string `yaml:"prefix"` +} + +// Cache describes the cache/session cluster. +type Cache struct { + Engine string `yaml:"engine"` + Version string `yaml:"version"` + Sentinel bool `yaml:"sentinel"` + Nodes []CacheNode `yaml:"nodes"` +} + +// CacheNode is a cache cluster node. +type CacheNode struct { + Host string `yaml:"host"` + Port int `yaml:"port"` +} + +// Container describes a container deployment. +type Container struct { + Image string `yaml:"image"` + Port int `yaml:"port,omitempty"` + Runtime string `yaml:"runtime,omitempty"` + Command string `yaml:"command,omitempty"` + Replicas int `yaml:"replicas,omitempty"` + DependsOn []string `yaml:"depends_on,omitempty"` +} + +// S3Config describes object storage. +type S3Config struct { + Endpoint string `yaml:"endpoint"` + Buckets map[string]*S3Bucket `yaml:"buckets"` +} + +// S3Bucket is an S3 bucket configuration. +type S3Bucket struct { + Purpose string `yaml:"purpose"` + Paths []string `yaml:"paths"` +} + +// CDN describes CDN configuration. +type CDN struct { + Provider string `yaml:"provider"` + Origin string `yaml:"origin"` + Zones []string `yaml:"zones"` +} + +// CICD describes CI/CD configuration. +type CICD struct { + Provider string `yaml:"provider"` + URL string `yaml:"url"` + Runner string `yaml:"runner"` + Registry string `yaml:"registry"` + DeployHook string `yaml:"deploy_hook"` +} + +// Monitoring describes monitoring configuration. +type Monitoring struct { + HealthEndpoints []HealthEndpoint `yaml:"health_endpoints"` + Alerts map[string]int `yaml:"alerts"` +} + +// HealthEndpoint is a URL to monitor. +type HealthEndpoint struct { + URL string `yaml:"url"` + Interval int `yaml:"interval"` +} + +// Backups describes backup schedules. +type Backups struct { + Daily []BackupJob `yaml:"daily"` + Weekly []BackupJob `yaml:"weekly"` +} + +// BackupJob is a scheduled backup task. +type BackupJob struct { + Name string `yaml:"name"` + Type string `yaml:"type"` + Destination string `yaml:"destination,omitempty"` + Hosts []string `yaml:"hosts,omitempty"` +} + +// Load reads and parses an infra.yaml file. +func Load(path string) (*Config, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("read infra config: %w", err) + } + + var cfg Config + if err := yaml.Unmarshal(data, &cfg); err != nil { + return nil, fmt.Errorf("parse infra config: %w", err) + } + + // Expand SSH key paths + for _, h := range cfg.Hosts { + if h.SSH.Key != "" { + h.SSH.Key = expandPath(h.SSH.Key) + } + if h.SSH.Port == 0 { + h.SSH.Port = 22 + } + } + + return &cfg, nil +} + +// Discover searches for infra.yaml in the given directory and parent directories. +func Discover(startDir string) (*Config, string, error) { + dir := startDir + for { + path := filepath.Join(dir, "infra.yaml") + if _, err := os.Stat(path); err == nil { + cfg, err := Load(path) + return cfg, path, err + } + + parent := filepath.Dir(dir) + if parent == dir { + break + } + dir = parent + } + return nil, "", fmt.Errorf("infra.yaml not found (searched from %s)", startDir) +} + +// HostsByRole returns all hosts matching the given role. +func (c *Config) HostsByRole(role string) map[string]*Host { + result := make(map[string]*Host) + for name, h := range c.Hosts { + if h.Role == role { + result[name] = h + } + } + return result +} + +// AppServers returns hosts with role "app". 
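+//
+// Example (sketch):
+//
+//	cfg, _, err := infra.Discover(".")
+//	if err != nil {
+//		return err
+//	}
+//	for name, h := range cfg.AppServers() {
+//		fmt.Printf("%s -> %s\n", name, h.IP)
+//	}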
+func (c *Config) AppServers() map[string]*Host {
+	return c.HostsByRole("app")
+}
+
+// expandPath expands a leading "~" or "~/" to the user's home directory.
+// Other forms such as "~user/path" are returned unchanged.
+func expandPath(path string) string {
+	if path == "~" {
+		if home, err := os.UserHomeDir(); err == nil {
+			return home
+		}
+		return path
+	}
+	if len(path) > 1 && path[0] == '~' && path[1] == '/' {
+		home, err := os.UserHomeDir()
+		if err != nil {
+			return path
+		}
+		return filepath.Join(home, path[2:])
+	}
+	return path
+}
diff --git a/pkg/infra/config_test.go b/pkg/infra/config_test.go
new file mode 100644
index 00000000..1ec8b595
--- /dev/null
+++ b/pkg/infra/config_test.go
@@ -0,0 +1,100 @@
+package infra
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+func TestLoad_Good(t *testing.T) {
+	// Walk up from the test's working directory to find infra.yaml.
+	dir, err := os.Getwd()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	cfg, path, err := Discover(dir)
+	if err != nil {
+		t.Skipf("infra.yaml not found from %s: %v", dir, err)
+	}
+
+	t.Logf("Loaded %s", path)
+
+	if len(cfg.Hosts) == 0 {
+		t.Error("expected at least one host")
+	}
+
+	// Check required hosts exist
+	for _, name := range []string{"noc", "de", "de2", "build"} {
+		if _, ok := cfg.Hosts[name]; !ok {
+			t.Errorf("expected host %q in config", name)
+		}
+	}
+
+	// Check de host details; bail out rather than dereferencing nil below.
+	de := cfg.Hosts["de"]
+	if de == nil {
+		t.Fatal("host de missing from config")
+	}
+	if de.IP != "116.202.82.115" {
+		t.Errorf("de IP = %q, want 116.202.82.115", de.IP)
+	}
+	if de.Role != "app" {
+		t.Errorf("de role = %q, want app", de.Role)
+	}
+
+	// Check LB config
+	if cfg.LoadBalancer.Name != "hermes" {
+		t.Errorf("LB name = %q, want hermes", cfg.LoadBalancer.Name)
+	}
+	if cfg.LoadBalancer.Type != "lb11" {
+		t.Errorf("LB type = %q, want lb11", cfg.LoadBalancer.Type)
+	}
+	if len(cfg.LoadBalancer.Backends) != 2 {
+		t.Errorf("LB backends = %d, want 2", len(cfg.LoadBalancer.Backends))
+	}
+
+	// Check app servers helper
+	apps := cfg.AppServers()
+	if len(apps) != 2 {
+		t.Errorf("AppServers() = %d, want 2", len(apps))
+	}
+}
+
+func TestLoad_Bad(t *testing.T) {
+	_, err := Load("/nonexistent/infra.yaml")
+	if err == nil {
+		t.Error("expected error for nonexistent file")
+	}
+}
+
+func TestLoad_Ugly(t *testing.T) {
+	// Invalid YAML
+	tmp := filepath.Join(t.TempDir(), "infra.yaml")
+	if err := os.WriteFile(tmp, []byte("{{invalid yaml"), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	_, err := Load(tmp)
+	if err == nil {
+		t.Error("expected error for invalid YAML")
+	}
+}
+
+func TestExpandPath(t *testing.T) {
+	home, _ := os.UserHomeDir()
+
+	tests := []struct {
+		input string
+		want  string
+	}{
+		{"~/.ssh/id_rsa", filepath.Join(home, ".ssh/id_rsa")},
+		{"/absolute/path", "/absolute/path"},
+		{"relative/path", "relative/path"},
+	}
+
+	for _, tt := range tests {
+		got := expandPath(tt.input)
+		if got != tt.want {
+			t.Errorf("expandPath(%q) = %q, want %q", tt.input, got, tt.want)
+		}
+	}
+}
diff --git a/pkg/infra/hetzner.go b/pkg/infra/hetzner.go
new file mode 100644
index 00000000..93ab8192
--- /dev/null
+++ b/pkg/infra/hetzner.go
@@ -0,0 +1,381 @@
+package infra
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+	"time"
+)
+
+const (
+	hcloudBaseURL = "https://api.hetzner.cloud/v1"
+	hrobotBaseURL = "https://robot-ws.your-server.de"
+)
+
+// HCloudClient is an HTTP client for the Hetzner Cloud API.
+type HCloudClient struct {
+	token  string
+	client *http.Client
+}
+
+// NewHCloudClient creates a new Hetzner Cloud API client.
+func NewHCloudClient(token string) *HCloudClient {
+	return &HCloudClient{
+		token: token,
+		client: &http.Client{
+			Timeout: 30 * time.Second,
+		},
+	}
+}
+
+// HCloudServer represents a Hetzner Cloud server.
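+// Only the response fields used by core prod are decoded; the API returns
+// many more.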
+type HCloudServer struct {
+	ID         int                `json:"id"`
+	Name       string             `json:"name"`
+	Status     string             `json:"status"`
+	PublicNet  HCloudPublicNet    `json:"public_net"`
+	PrivateNet []HCloudPrivateNet `json:"private_net"`
+	ServerType HCloudServerType   `json:"server_type"`
+	Datacenter HCloudDatacenter   `json:"datacenter"`
+	Labels     map[string]string  `json:"labels"`
+}
+
+// HCloudPublicNet holds public network info.
+type HCloudPublicNet struct {
+	IPv4 HCloudIPv4 `json:"ipv4"`
+}
+
+// HCloudIPv4 holds an IPv4 address.
+type HCloudIPv4 struct {
+	IP string `json:"ip"`
+}
+
+// HCloudPrivateNet holds private network info.
+type HCloudPrivateNet struct {
+	IP      string `json:"ip"`
+	Network int    `json:"network"`
+}
+
+// HCloudServerType holds server type info.
+type HCloudServerType struct {
+	Name        string  `json:"name"`
+	Description string  `json:"description"`
+	Cores       int     `json:"cores"`
+	Memory      float64 `json:"memory"`
+	Disk        int     `json:"disk"`
+}
+
+// HCloudDatacenter holds datacenter info.
+type HCloudDatacenter struct {
+	Name        string `json:"name"`
+	Description string `json:"description"`
+}
+
+// HCloudLoadBalancer represents a Hetzner Cloud load balancer.
+type HCloudLoadBalancer struct {
+	ID        int               `json:"id"`
+	Name      string            `json:"name"`
+	PublicNet HCloudLBPublicNet `json:"public_net"`
+	Algorithm HCloudLBAlgorithm `json:"algorithm"`
+	Services  []HCloudLBService `json:"services"`
+	Targets   []HCloudLBTarget  `json:"targets"`
+	Location  HCloudDatacenter  `json:"location"`
+	Labels    map[string]string `json:"labels"`
+}
+
+// HCloudLBPublicNet holds LB public network info.
+type HCloudLBPublicNet struct {
+	Enabled bool       `json:"enabled"`
+	IPv4    HCloudIPv4 `json:"ipv4"`
+}
+
+// HCloudLBAlgorithm holds the LB algorithm.
+type HCloudLBAlgorithm struct {
+	Type string `json:"type"`
+}
+
+// HCloudLBService describes an LB listener.
+type HCloudLBService struct {
+	Protocol        string               `json:"protocol"`
+	ListenPort      int                  `json:"listen_port"`
+	DestinationPort int                  `json:"destination_port"`
+	Proxyprotocol   bool                 `json:"proxyprotocol"`
+	HTTP            *HCloudLBHTTP        `json:"http,omitempty"`
+	HealthCheck     *HCloudLBHealthCheck `json:"health_check,omitempty"`
+}
+
+// HCloudLBHTTP holds HTTP-specific LB options.
+type HCloudLBHTTP struct {
+	RedirectHTTP bool `json:"redirect_http"`
+}
+
+// HCloudLBHealthCheck holds LB health check config.
+type HCloudLBHealthCheck struct {
+	Protocol string          `json:"protocol"`
+	Port     int             `json:"port"`
+	Interval int             `json:"interval"`
+	Timeout  int             `json:"timeout"`
+	Retries  int             `json:"retries"`
+	HTTP     *HCloudLBHCHTTP `json:"http,omitempty"`
+}
+
+// HCloudLBHCHTTP holds HTTP health check options.
+type HCloudLBHCHTTP struct {
+	Path string `json:"path"`
+	// The API models status_codes as a list of patterns (e.g. "2??"), so a
+	// plain string would fail to unmarshal when reading an LB back.
+	StatusCodes []string `json:"status_codes"`
+}
+
+// HCloudLBTarget is a load balancer backend target.
+type HCloudLBTarget struct {
+	Type         string                 `json:"type"`
+	IP           *HCloudLBTargetIP      `json:"ip,omitempty"`
+	Server       *HCloudLBTargetServer  `json:"server,omitempty"`
+	HealthStatus []HCloudLBHealthStatus `json:"health_status"`
+}
+
+// HCloudLBTargetIP is an IP-based LB target.
+type HCloudLBTargetIP struct {
+	IP string `json:"ip"`
+}
+
+// HCloudLBTargetServer is a server-based LB target.
+type HCloudLBTargetServer struct {
+	ID int `json:"id"`
+}
+
+// HCloudLBHealthStatus holds target health info.
+type HCloudLBHealthStatus struct {
+	ListenPort int    `json:"listen_port"`
+	Status     string `json:"status"`
+}
+
+// HCloudLBCreateRequest holds load balancer creation params.
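+// It maps to POST /v1/load_balancers. Per the Hetzner Cloud API, name,
+// load_balancer_type and a location (or network zone) are required; the
+// remaining fields are optional.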
+type HCloudLBCreateRequest struct { + Name string `json:"name"` + LoadBalancerType string `json:"load_balancer_type"` + Location string `json:"location"` + Algorithm HCloudLBAlgorithm `json:"algorithm"` + Services []HCloudLBService `json:"services"` + Targets []HCloudLBCreateTarget `json:"targets"` + Labels map[string]string `json:"labels"` +} + +// HCloudLBCreateTarget is a target for LB creation. +type HCloudLBCreateTarget struct { + Type string `json:"type"` + IP *HCloudLBTargetIP `json:"ip,omitempty"` +} + +// ListServers returns all Hetzner Cloud servers. +func (c *HCloudClient) ListServers(ctx context.Context) ([]HCloudServer, error) { + var result struct { + Servers []HCloudServer `json:"servers"` + } + if err := c.get(ctx, "/servers", &result); err != nil { + return nil, err + } + return result.Servers, nil +} + +// ListLoadBalancers returns all load balancers. +func (c *HCloudClient) ListLoadBalancers(ctx context.Context) ([]HCloudLoadBalancer, error) { + var result struct { + LoadBalancers []HCloudLoadBalancer `json:"load_balancers"` + } + if err := c.get(ctx, "/load_balancers", &result); err != nil { + return nil, err + } + return result.LoadBalancers, nil +} + +// GetLoadBalancer returns a load balancer by ID. +func (c *HCloudClient) GetLoadBalancer(ctx context.Context, id int) (*HCloudLoadBalancer, error) { + var result struct { + LoadBalancer HCloudLoadBalancer `json:"load_balancer"` + } + if err := c.get(ctx, fmt.Sprintf("/load_balancers/%d", id), &result); err != nil { + return nil, err + } + return &result.LoadBalancer, nil +} + +// CreateLoadBalancer creates a new load balancer. +func (c *HCloudClient) CreateLoadBalancer(ctx context.Context, req HCloudLBCreateRequest) (*HCloudLoadBalancer, error) { + body, err := json.Marshal(req) + if err != nil { + return nil, fmt.Errorf("marshal request: %w", err) + } + + var result struct { + LoadBalancer HCloudLoadBalancer `json:"load_balancer"` + } + if err := c.post(ctx, "/load_balancers", body, &result); err != nil { + return nil, err + } + return &result.LoadBalancer, nil +} + +// DeleteLoadBalancer deletes a load balancer by ID. +func (c *HCloudClient) DeleteLoadBalancer(ctx context.Context, id int) error { + return c.delete(ctx, fmt.Sprintf("/load_balancers/%d", id)) +} + +// CreateSnapshot creates a server snapshot. 
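+// The create_image action is asynchronous: the call returns once the action
+// is queued, not when the image is ready.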
+func (c *HCloudClient) CreateSnapshot(ctx context.Context, serverID int, description string) error { + body, _ := json.Marshal(map[string]string{ + "description": description, + "type": "snapshot", + }) + return c.post(ctx, fmt.Sprintf("/servers/%d/actions/create_image", serverID), body, nil) +} + +func (c *HCloudClient) get(ctx context.Context, path string, result any) error { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, hcloudBaseURL+path, nil) + if err != nil { + return err + } + return c.do(req, result) +} + +func (c *HCloudClient) post(ctx context.Context, path string, body []byte, result any) error { + req, err := http.NewRequestWithContext(ctx, http.MethodPost, hcloudBaseURL+path, strings.NewReader(string(body))) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/json") + return c.do(req, result) +} + +func (c *HCloudClient) delete(ctx context.Context, path string) error { + req, err := http.NewRequestWithContext(ctx, http.MethodDelete, hcloudBaseURL+path, nil) + if err != nil { + return err + } + return c.do(req, nil) +} + +func (c *HCloudClient) do(req *http.Request, result any) error { + req.Header.Set("Authorization", "Bearer "+c.token) + + resp, err := c.client.Do(req) + if err != nil { + return fmt.Errorf("hcloud API: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + data, err := io.ReadAll(resp.Body) + if err != nil { + return fmt.Errorf("read response: %w", err) + } + + if resp.StatusCode >= 400 { + var apiErr struct { + Error struct { + Code string `json:"code"` + Message string `json:"message"` + } `json:"error"` + } + if json.Unmarshal(data, &apiErr) == nil && apiErr.Error.Message != "" { + return fmt.Errorf("hcloud API %d: %s — %s", resp.StatusCode, apiErr.Error.Code, apiErr.Error.Message) + } + return fmt.Errorf("hcloud API %d: %s", resp.StatusCode, string(data)) + } + + if result != nil { + if err := json.Unmarshal(data, result); err != nil { + return fmt.Errorf("decode response: %w", err) + } + } + return nil +} + +// --- Hetzner Robot API --- + +// HRobotClient is an HTTP client for the Hetzner Robot API. +type HRobotClient struct { + user string + password string + client *http.Client +} + +// NewHRobotClient creates a new Hetzner Robot API client. +func NewHRobotClient(user, password string) *HRobotClient { + return &HRobotClient{ + user: user, + password: password, + client: &http.Client{ + Timeout: 30 * time.Second, + }, + } +} + +// HRobotServer represents a Hetzner Robot dedicated server. +type HRobotServer struct { + ServerIP string `json:"server_ip"` + ServerName string `json:"server_name"` + Product string `json:"product"` + Datacenter string `json:"dc"` + Status string `json:"status"` + Cancelled bool `json:"cancelled"` + PaidUntil string `json:"paid_until"` +} + +// ListServers returns all Robot dedicated servers. +func (c *HRobotClient) ListServers(ctx context.Context) ([]HRobotServer, error) { + var raw []struct { + Server HRobotServer `json:"server"` + } + if err := c.get(ctx, "/server", &raw); err != nil { + return nil, err + } + + servers := make([]HRobotServer, len(raw)) + for i, s := range raw { + servers[i] = s.Server + } + return servers, nil +} + +// GetServer returns a Robot server by IP. 
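+// The Robot webservice authenticates with dedicated Basic Auth credentials
+// (a Robot webservice user), not the Cloud API token.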
+func (c *HRobotClient) GetServer(ctx context.Context, ip string) (*HRobotServer, error) {
+	var raw struct {
+		Server HRobotServer `json:"server"`
+	}
+	if err := c.get(ctx, "/server/"+ip, &raw); err != nil {
+		return nil, err
+	}
+	return &raw.Server, nil
+}
+
+func (c *HRobotClient) get(ctx context.Context, path string, result any) error {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, hrobotBaseURL+path, nil)
+	if err != nil {
+		return err
+	}
+	req.SetBasicAuth(c.user, c.password)
+
+	resp, err := c.client.Do(req)
+	if err != nil {
+		return fmt.Errorf("hrobot API: %w", err)
+	}
+	defer func() { _ = resp.Body.Close() }()
+
+	data, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return fmt.Errorf("read response: %w", err)
+	}
+
+	if resp.StatusCode >= 400 {
+		return fmt.Errorf("hrobot API %d: %s", resp.StatusCode, string(data))
+	}
+
+	if result != nil {
+		if err := json.Unmarshal(data, result); err != nil {
+			return fmt.Errorf("decode response: %w", err)
+		}
+	}
+	return nil
+}
diff --git a/playbooks/galera-backup.yml b/playbooks/galera-backup.yml
new file mode 100644
index 00000000..0109d5a9
--- /dev/null
+++ b/playbooks/galera-backup.yml
@@ -0,0 +1,63 @@
+# Galera Database Backup
+# Dumps the database and uploads to Hetzner S3
+#
+# Usage:
+#   core deploy ansible playbooks/galera-backup.yml -i playbooks/inventory.yml -l de
+---
+- name: Backup Galera Database to S3
+  hosts: app_servers
+  become: true
+  vars:
+    db_root_password: "{{ lookup('env', 'DB_ROOT_PASSWORD') }}"
+    s3_endpoint: "{{ lookup('env', 'HETZNER_S3_ENDPOINT') | default('fsn1.your-objectstorage.com', true) }}"
+    s3_bucket: "{{ lookup('env', 'HETZNER_S3_BUCKET') | default('hostuk', true) }}"
+    s3_access_key: "{{ lookup('env', 'HETZNER_S3_ACCESS_KEY') }}"
+    s3_secret_key: "{{ lookup('env', 'HETZNER_S3_SECRET_KEY') }}"
+    backup_prefix: backup/galera
+    backup_retain_days: 30
+
+  tasks:
+    - name: Create backup directory
+      file:
+        path: /opt/backup
+        state: directory
+        mode: "0700"
+
+    - name: Dump database
+      shell: |
+        set -o pipefail
+        TIMESTAMP=$(date +%Y%m%d-%H%M%S)
+        DUMP_FILE="/opt/backup/hostuk-${TIMESTAMP}-{{ galera_node_name }}.sql.gz"
+        docker exec galera mariadb-dump \
+          -u root -p{{ db_root_password }} \
+          --all-databases \
+          --single-transaction \
+          --routines \
+          --triggers \
+          --events \
+          | gzip > "${DUMP_FILE}"
+        echo "${DUMP_FILE}"
+      args:
+        # pipefail requires bash; without it a failed dump piped through gzip
+        # would still report success and upload a truncated file.
+        executable: /bin/bash
+      register: dump_result
+
+    - name: Install s3cmd if missing
+      shell: |
+        which s3cmd 2>/dev/null || pip3 install s3cmd
+      changed_when: false
+
+    - name: Upload to S3
+      shell: |
+        s3cmd put {{ dump_result.stdout | trim }} \
+          s3://{{ s3_bucket }}/{{ backup_prefix }}/$(basename {{ dump_result.stdout | trim }}) \
+          --host={{ s3_endpoint }} \
+          --host-bucket='%(bucket)s.{{ s3_endpoint }}' \
+          --access_key={{ s3_access_key }} \
+          --secret_key={{ s3_secret_key }}
+      when: s3_access_key != ""
+
+    - name: Clean old local backups
+      shell: |
+        find /opt/backup -name "hostuk-*.sql.gz" -mtime +{{ backup_retain_days }} -delete
+      changed_when: false
+
+    - name: Show backup result
+      debug:
+        msg: "Backup completed: {{ dump_result.stdout | trim }}"
diff --git a/playbooks/galera-deploy.yml b/playbooks/galera-deploy.yml
new file mode 100644
index 00000000..58594fb4
--- /dev/null
+++ b/playbooks/galera-deploy.yml
@@ -0,0 +1,96 @@
+# MariaDB Galera Cluster Deployment
+# Deploys a 2-node Galera cluster on de + de2
+#
+# Usage:
+#   core deploy ansible playbooks/galera-deploy.yml -i playbooks/inventory.yml
+#   core deploy ansible playbooks/galera-deploy.yml -i playbooks/inventory.yml -l de  # Single node
+#
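+# Note: a two-node Galera cluster cannot keep quorum when one node fails;
+# the usual safeguard is a Galera arbitrator (garbd) on a third host
+# (e.g. noc).
+#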
+# First-time bootstrap:
+#   Set galera_bootstrap=true for the first node:
+#   core deploy ansible playbooks/galera-deploy.yml -i playbooks/inventory.yml -l de -e galera_bootstrap=true
+#   Once de2 has joined, re-run de WITHOUT the flag so a container restart
+#   rejoins the existing cluster instead of bootstrapping a new one.
+---
+- name: Deploy MariaDB Galera Cluster
+  hosts: app_servers
+  become: true
+  vars:
+    mariadb_version: "11"
+    galera_cluster_address: "gcomm://116.202.82.115,88.99.195.41"
+    galera_bootstrap: false
+    db_root_password: "{{ lookup('env', 'DB_ROOT_PASSWORD') }}"
+    db_password: "{{ lookup('env', 'DB_PASSWORD') }}"
+
+  tasks:
+    - name: Create MariaDB data directory
+      file:
+        path: /opt/galera/data
+        state: directory
+        mode: "0755"
+
+    - name: Create MariaDB config directory
+      file:
+        path: /opt/galera/conf.d
+        state: directory
+        mode: "0755"
+
+    - name: Write Galera configuration
+      copy:
+        dest: /opt/galera/conf.d/galera.cnf
+        content: |
+          [mysqld]
+          wsrep_on=ON
+          wsrep_provider=/usr/lib/galera/libgalera_smm.so
+          wsrep_cluster_name={{ galera_cluster_name }}
+          wsrep_cluster_address={{ 'gcomm://' if galera_bootstrap | bool else galera_cluster_address }}
+          wsrep_node_address={{ galera_node_address }}
+          wsrep_node_name={{ galera_node_name }}
+          wsrep_sst_method={{ galera_sst_method }}
+          binlog_format=ROW
+          default_storage_engine=InnoDB
+          innodb_autoinc_lock_mode=2
+          innodb_buffer_pool_size=1G
+          innodb_log_file_size=256M
+          character_set_server=utf8mb4
+          collation_server=utf8mb4_unicode_ci
+
+    - name: Stop existing MariaDB container
+      shell: docker stop galera 2>/dev/null || true
+      changed_when: false
+
+    - name: Remove existing MariaDB container
+      shell: docker rm galera 2>/dev/null || true
+      changed_when: false
+
+    - name: Start MariaDB Galera container
+      shell: |
+        docker run -d \
+          --name galera \
+          --restart unless-stopped \
+          --network host \
+          -v /opt/galera/data:/var/lib/mysql \
+          -v /opt/galera/conf.d:/etc/mysql/conf.d \
+          -e MARIADB_ROOT_PASSWORD={{ db_root_password }} \
+          -e MARIADB_DATABASE={{ db_name }} \
+          -e MARIADB_USER={{ db_user }} \
+          -e MARIADB_PASSWORD={{ db_password }} \
+          mariadb:{{ mariadb_version }}
+
+    - name: Wait for MariaDB to be ready
+      shell: |
+        for i in $(seq 1 60); do
+          docker exec galera mariadb -u root -p{{ db_root_password }} -e "SELECT 1" 2>/dev/null && exit 0
+          sleep 2
+        done
+        exit 1
+      changed_when: false
+
+    - name: Check Galera cluster status
+      shell: |
+        docker exec galera mariadb -u root -p{{ db_root_password }} \
+          -e "SHOW STATUS WHERE Variable_name IN ('wsrep_cluster_size','wsrep_ready','wsrep_cluster_status')" \
+          --skip-column-names
+      register: galera_status
+      changed_when: false
+
+    - name: Display cluster status
+      debug:
+        var: galera_status.stdout_lines
diff --git a/playbooks/inventory.yml b/playbooks/inventory.yml
new file mode 100644
index 00000000..3e24226e
--- /dev/null
+++ b/playbooks/inventory.yml
@@ -0,0 +1,36 @@
+# Ansible inventory for Host UK production
+# Used by: core deploy ansible -i playbooks/inventory.yml
+all:
+  vars:
+    ansible_user: root
+    ansible_ssh_private_key_file: ~/.ssh/hostuk
+
+  children:
+    bastion:
+      hosts:
+        noc:
+          ansible_host: 77.42.42.205
+          private_ip: 10.0.0.4
+
+    app_servers:
+      hosts:
+        de:
+          ansible_host: 116.202.82.115
+          galera_node_name: de
+          galera_node_address: 116.202.82.115
+        de2:
+          ansible_host: 88.99.195.41
+          galera_node_name: de2
+          galera_node_address: 88.99.195.41
+      vars:
+        galera_cluster_name: hostuk-galera
+        galera_sst_method: mariabackup
+        db_name: hostuk
+        db_user: hostuk
+        redis_maxmemory: 512mb
+
+    builders:
+      hosts:
+        build:
+          ansible_host: 46.224.93.62
+          private_ip: 10.0.0.5
diff --git a/playbooks/redis-deploy.yml b/playbooks/redis-deploy.yml
new file mode 100644
index 00000000..ed3b86e7
--- /dev/null
+++ b/playbooks/redis-deploy.yml
@@ -0,0 +1,98 @@
+# Redis Sentinel Deployment
+# Deploys Redis with Sentinel on de + de2 (de is the initial master)
+#
+# Note: with sentinels on only two hosts, a failover cannot be authorised
+# while the master host is down (it requires a majority of all sentinels);
+# a third sentinel on another host (e.g. noc) is recommended.
+#
+# Usage:
+#   core deploy ansible playbooks/redis-deploy.yml -i playbooks/inventory.yml
+---
+- name: Deploy Redis with Sentinel
+  hosts: app_servers
+  become: true
+  vars:
+    redis_version: "7"
+    redis_password: "{{ lookup('env', 'REDIS_PASSWORD') | default('', true) }}"
+    # Initial master (de); must match the sentinel monitor address below.
+    redis_master_ip: "116.202.82.115"
+
+  tasks:
+    - name: Create Redis data directory
+      file:
+        path: /opt/redis/data
+        state: directory
+        mode: "0755"
+
+    - name: Create Redis config directory
+      file:
+        path: /opt/redis/conf
+        state: directory
+        mode: "0755"
+
+    - name: Write Redis configuration
+      copy:
+        dest: /opt/redis/conf/redis.conf
+        content: |
+          maxmemory {{ redis_maxmemory }}
+          maxmemory-policy allkeys-lru
+          appendonly yes
+          appendfsync everysec
+          tcp-keepalive 300
+          timeout 0
+          {% if redis_password %}
+          requirepass {{ redis_password }}
+          masterauth {{ redis_password }}
+          {% endif %}
+          {% if ansible_host != redis_master_ip %}
+          replicaof {{ redis_master_ip }} 6379
+          {% endif %}
+
+    - name: Write Sentinel configuration
+      copy:
+        dest: /opt/redis/conf/sentinel.conf
+        content: |
+          port 26379
+          sentinel monitor hostuk-redis {{ redis_master_ip }} 6379 2
+          sentinel down-after-milliseconds hostuk-redis 5000
+          sentinel failover-timeout hostuk-redis 60000
+          sentinel parallel-syncs hostuk-redis 1
+          {% if redis_password %}
+          sentinel auth-pass hostuk-redis {{ redis_password }}
+          {% endif %}
+
+    - name: Stop existing Redis containers
+      shell: |
+        docker stop redis redis-sentinel 2>/dev/null || true
+        docker rm redis redis-sentinel 2>/dev/null || true
+      changed_when: false
+
+    - name: Start Redis container
+      shell: |
+        docker run -d \
+          --name redis \
+          --restart unless-stopped \
+          --network host \
+          -v /opt/redis/data:/data \
+          -v /opt/redis/conf/redis.conf:/usr/local/etc/redis/redis.conf \
+          redis:{{ redis_version }}-alpine \
+          redis-server /usr/local/etc/redis/redis.conf
+
+    # Sentinel rewrites sentinel.conf at runtime (config rewrite), which fails
+    # on a single-file bind mount, so mount the conf directory instead.
+    - name: Start Redis Sentinel container
+      shell: |
+        docker run -d \
+          --name redis-sentinel \
+          --restart unless-stopped \
+          --network host \
+          -v /opt/redis/conf:/usr/local/etc/redis \
+          redis:{{ redis_version }}-alpine \
+          redis-sentinel /usr/local/etc/redis/sentinel.conf
+
+    # redis-cli honours REDISCLI_AUTH, so the checks below also work when
+    # requirepass is set.
+    - name: Wait for Redis to be ready
+      shell: |
+        for i in $(seq 1 30); do
+          docker exec {% if redis_password %}-e REDISCLI_AUTH={{ redis_password }} {% endif %}redis redis-cli ping 2>/dev/null | grep -q PONG && exit 0
+          sleep 1
+        done
+        exit 1
+      changed_when: false
+
+    - name: Check Redis info
+      shell: docker exec {% if redis_password %}-e REDISCLI_AUTH={{ redis_password }} {% endif %}redis redis-cli info replication | head -10
+      register: redis_info
+      changed_when: false
+
+    - name: Display Redis info
+      debug:
+        var: redis_info.stdout_lines