php-agentic/Services/AgentDetection.php

<?php

declare(strict_types=1);

namespace Core\Mod\Agentic\Services;

use Core\Mod\Agentic\Models\AgentApiKey;
use Core\Mod\Agentic\Support\AgentIdentity;
use Illuminate\Http\Request;

/**
 * Service for detecting AI agents from HTTP requests.
 *
 * Identifies AI agent providers (Anthropic, OpenAI, Google, etc.) from:
 * - User-Agent string patterns
 * - MCP token headers
 * - Absence of typical browser indicators
 *
 * Part of the Trees for Agents system for rewarding AI agent referrals.
 */
class AgentDetection
{
    /**
     * User-Agent patterns for known AI providers.
     *
     * For each provider, `patterns` are regexes that identify the provider and
     * `model_patterns` maps a model identifier to the regex that detects it.
     *
     * @var array<string, array{patterns: list<string>, model_patterns: array<string, string>}>
     */
    protected const PROVIDER_PATTERNS = [
        'anthropic' => [
            'patterns' => [
                '/claude[\s\-_]?code/i',
                '/\banthropic\b/i',
                '/\banthropic[\s\-_]?api\b/i',
                '/\bclaude\b.*\bai\b/i',
                '/\bclaude\b.*\bassistant\b/i',
            ],
            'model_patterns' => [
                'claude-opus' => '/claude[\s\-_]?opus/i',
                'claude-sonnet' => '/claude[\s\-_]?sonnet/i',
                'claude-haiku' => '/claude[\s\-_]?haiku/i',
            ],
        ],
        'openai' => [
            'patterns' => [
                '/\bChatGPT\b/i',
                '/\bOpenAI\b/i',
                '/\bGPT[\s\-_]?4\b/i',
                '/\bGPT[\s\-_]?3\.?5\b/i',
                '/\bo1[\s\-_]?preview\b/i',
                '/\bo1[\s\-_]?mini\b/i',
            ],
            'model_patterns' => [
                'gpt-4' => '/\bGPT[\s\-_]?4/i',
                'gpt-3.5' => '/\bGPT[\s\-_]?3\.?5/i',
                'o1' => '/\bo1[\s\-_]?(preview|mini)?\b/i',
            ],
        ],
        'google' => [
            'patterns' => [
                '/\bGoogle[\s\-_]?AI\b/i',
                '/\bGemini\b/i',
                '/\bBard\b/i',
                '/\bPaLM\b/i',
            ],
            'model_patterns' => [
                'gemini-pro' => '/gemini[\s\-_]?(1\.5[\s\-_]?)?pro/i',
                'gemini-ultra' => '/gemini[\s\-_]?(1\.5[\s\-_]?)?ultra/i',
                'gemini-flash' => '/gemini[\s\-_]?(1\.5[\s\-_]?)?flash/i',
            ],
        ],
        'meta' => [
            'patterns' => [
                '/\bMeta[\s\-_]?AI\b/i',
                '/\bLLaMA\b/i',
                '/\bLlama[\s\-_]?[23]\b/i',
            ],
            'model_patterns' => [
                'llama-3' => '/llama[\s\-_]?3/i',
                'llama-2' => '/llama[\s\-_]?2/i',
            ],
        ],
        'mistral' => [
            'patterns' => [
                '/\bMistral\b/i',
                '/\bMixtral\b/i',
            ],
            'model_patterns' => [
                'mistral-large' => '/mistral[\s\-_]?large/i',
                'mistral-medium' => '/mistral[\s\-_]?medium/i',
                'mixtral' => '/mixtral/i',
            ],
        ],
    ];

    /**
     * Patterns that indicate a typical web browser.
     * If none of these are present, it might be programmatic access.
     *
     * @var list<string>
     */
    protected const BROWSER_INDICATORS = [
        '/\bMozilla\b/i',
        '/\bChrome\b/i',
        '/\bSafari\b/i',
        '/\bFirefox\b/i',
        '/\bEdge\b/i',
        '/\bOpera\b/i',
        '/\bMSIE\b/i',
        '/\bTrident\b/i',
    ];

    /**
     * Known bot patterns that are NOT AI agents.
     * These should return notAnAgent, not unknown.
     *
     * @var list<string>
     */
    protected const NON_AGENT_BOTS = [
        '/\bGooglebot\b/i',
        '/\bBingbot\b/i',
        '/\bYandexBot\b/i',
        '/\bDuckDuckBot\b/i',
        '/\bBaiduspider\b/i',
        '/\bfacebookexternalhit\b/i',
        '/\bTwitterbot\b/i',
        '/\bLinkedInBot\b/i',
        '/\bSlackbot\b/i',
        '/\bDiscordBot\b/i',
        '/\bTelegramBot\b/i',
        '/\bWhatsApp\//i',
        '/\bApplebot\b/i',
        '/\bSEMrushBot\b/i',
        '/\bAhrefsBot\b/i',
        '/\bcurl\b/i',
        '/\bwget\b/i',
        '/\bpython-requests\b/i',
        '/\bgo-http-client\b/i',
        '/\bPostman\b/i',
        '/\bInsomnia\b/i',
        '/\baxios\b/i',
        '/\bnode-fetch\b/i',
        '/\bUptimeRobot\b/i',
        '/\bPingdom\b/i',
        '/\bDatadog\b/i',
        '/\bNewRelic\b/i',
    ];

    /**
     * Provider names accepted by isValidProvider() / getValidProviders().
     *
     * Single source of truth so the two public methods cannot drift apart.
     *
     * @var list<string>
     */
    protected const VALID_PROVIDERS = [
        'anthropic',
        'openai',
        'google',
        'meta',
        'mistral',
        'local',
        'unknown',
    ];

    /**
     * Key-name keywords that are also fragments of ordinary words
     * (e.g. "meta" in "metadata") and therefore must match as a whole word.
     *
     * @var list<string>
     */
    protected const WHOLE_WORD_KEYWORDS = ['meta', 'bard', 'palm'];

    /**
     * The MCP token header name.
     */
    protected const MCP_TOKEN_HEADER = 'X-MCP-Token';

    /**
     * Identify an agent from an HTTP request.
     *
     * A non-blank MCP token header takes priority; otherwise the User-Agent
     * string is inspected.
     */
    public function identify(Request $request): AgentIdentity
    {
        $mcpToken = $request->header(self::MCP_TOKEN_HEADER);

        // Guard the type explicitly: under strict_types a non-string value
        // would otherwise raise a TypeError in identifyFromMcpToken().
        if (is_string($mcpToken) && trim($mcpToken) !== '') {
            return $this->identifyFromMcpToken(trim($mcpToken));
        }

        return $this->identifyFromUserAgent($request->userAgent());
    }

    /**
     * Identify an agent from a User-Agent string.
     *
     * Order of checks: known AI providers, then known non-agent bots, then
     * browser indicators. Anything left over is reported as an unknown agent.
     */
    public function identifyFromUserAgent(?string $userAgent): AgentIdentity
    {
        if ($userAgent === null || trim($userAgent) === '') {
            // Empty User-Agent is suspicious but not definitive
            return AgentIdentity::unknownAgent();
        }

        // Check for known AI providers first (highest confidence)
        foreach (self::PROVIDER_PATTERNS as $provider => $config) {
            if ($this->matchesAny($config['patterns'], $userAgent)) {
                $model = $this->detectModel($userAgent, $config['model_patterns']);

                return $this->createProviderIdentity($provider, $model, AgentIdentity::CONFIDENCE_HIGH);
            }
        }

        // Search engines, monitoring tools, HTTP clients and ordinary browsers
        // are explicitly not AI agents.
        if ($this->matchesAny(self::NON_AGENT_BOTS, $userAgent) || $this->looksLikeBrowser($userAgent)) {
            return AgentIdentity::notAnAgent();
        }

        // No browser indicators and not a known bot — might be an unknown agent
        return AgentIdentity::unknownAgent();
    }

    /**
     * Identify an agent from an MCP token.
     *
     * Supports two token formats:
     * - Opaque: "ak_xxxx..." (registered AgentApiKey, looked up in database)
     * - Structured: "provider:model:secret" (e.g., "anthropic:claude-opus:abc123")
     *
     * Any other token yields an "unknown" identity with medium confidence:
     * a token being present suggests an agent, but the provider cannot be named.
     */
    public function identifyFromMcpToken(string $token): AgentIdentity
    {
        // AgentApiKey tokens start with the "ak_" prefix
        if (str_starts_with($token, 'ak_')) {
            return $this->identifyFromAgentApiKey($token);
        }

        // NOTE(review): the secret segment is never read below, so the
        // provider/model of a structured token is self-asserted by the caller.
        $parts = explode(':', $token, 3);

        if (count($parts) >= 2) {
            $provider = strtolower(trim($parts[0]));
            $model = trim($parts[1]);

            // Validate provider is in our known list
            if ($this->isValidProvider($provider)) {
                return $this->createProviderIdentity(
                    $provider,
                    // "anthropic::secret" carries no model; report null rather than ''
                    $model !== '' ? $model : null,
                    AgentIdentity::CONFIDENCE_HIGH,
                );
            }
        }

        return new AgentIdentity('unknown', null, AgentIdentity::CONFIDENCE_MEDIUM);
    }

    /**
     * Identify an agent from a registered AgentApiKey token.
     *
     * Looks up the token via AgentApiKey::findByKey() and derives the
     * provider/model from the key's name.
     */
    protected function identifyFromAgentApiKey(string $token): AgentIdentity
    {
        $apiKey = AgentApiKey::findByKey($token);

        // Missing key, or a key that is no longer active: still likely an
        // agent, but one we cannot vouch for.
        if ($apiKey === null || ! $apiKey->isActive()) {
            return AgentIdentity::unknownAgent();
        }

        // Key names often follow pattern: "Claude Opus Agent" or "GPT-4 Integration".
        // The cast guards against a null name (presumably possible if the
        // attribute is nullable — TODO confirm against the model).
        $keyName = (string) ($apiKey->name ?? '');

        $provider = $this->extractProviderFromKeyName($keyName);

        if ($provider !== null) {
            return $this->createProviderIdentity(
                $provider,
                $this->extractModelFromKeyName($keyName),
                AgentIdentity::CONFIDENCE_HIGH,
            );
        }

        // Valid key but cannot determine provider — return unknown with high confidence
        // (we know it's a registered agent, just not which provider)
        return new AgentIdentity('unknown', null, AgentIdentity::CONFIDENCE_HIGH);
    }

    /**
     * Extract provider from an API key name.
     *
     * Attempts to identify provider from common naming patterns:
     * - "Claude Agent", "Anthropic Integration" => anthropic
     * - "GPT-4 Agent", "OpenAI Integration" => openai
     * - "Gemini Agent", "Google AI" => google
     *
     * Providers are tried in declaration order and the first hit wins.
     * Returns null when no keyword matches.
     */
    protected function extractProviderFromKeyName(string $name): ?string
    {
        $nameLower = strtolower($name);

        $providerKeywords = [
            'anthropic' => ['anthropic', 'claude'],
            'openai' => ['openai', 'gpt', 'chatgpt', 'o1-'],
            'google' => ['google', 'gemini', 'bard', 'palm'],
            'meta' => ['meta', 'llama'],
            'mistral' => ['mistral', 'mixtral'],
        ];

        foreach ($providerKeywords as $provider => $keywords) {
            foreach ($keywords as $keyword) {
                if ($this->nameContainsKeyword($nameLower, $keyword)) {
                    return $provider;
                }
            }
        }

        return null;
    }

    /**
     * Check whether a lower-cased key name contains a provider keyword.
     *
     * Distinctive keywords match as plain substrings. Keywords listed in
     * WHOLE_WORD_KEYWORDS must not be embedded in a longer alphabetic word,
     * so "Metadata Sync" is not attributed to Meta; an optional "ai" suffix
     * is tolerated so "MetaAI" / "meta-ai" still match.
     */
    protected function nameContainsKeyword(string $nameLower, string $keyword): bool
    {
        if (! in_array($keyword, self::WHOLE_WORD_KEYWORDS, true)) {
            return str_contains($nameLower, $keyword);
        }

        $pattern = '/(?<![a-z])' . preg_quote($keyword, '/') . '(?:[\s\-_]?ai)?(?![a-z])/';

        return preg_match($pattern, $nameLower) === 1;
    }

    /**
     * Extract model from an API key name.
     *
     * Attempts to identify specific model from naming patterns:
     * - "Claude Opus Agent" => claude-opus
     * - "GPT-4 Integration" => gpt-4
     *
     * Models are tried in declaration order and the first hit wins.
     * Returns null when no keyword matches.
     */
    protected function extractModelFromKeyName(string $name): ?string
    {
        // Trailing pad lets the space-terminated "o1 " keyword also match a
        // name that ends in "o1" (e.g. "OpenAI o1"). No other keyword ends
        // in a space, so the pad cannot create additional matches.
        $haystack = strtolower($name) . ' ';

        $modelKeywords = [
            'claude-opus' => ['opus'],
            'claude-sonnet' => ['sonnet'],
            'claude-haiku' => ['haiku'],
            'gpt-4' => ['gpt-4', 'gpt4'],
            'gpt-3.5' => ['gpt-3.5', 'gpt3.5', 'turbo'],
            'o1' => ['o1-preview', 'o1-mini', 'o1 '],
            'gemini-pro' => ['gemini pro', 'gemini-pro'],
            'gemini-flash' => ['gemini flash', 'gemini-flash'],
            'llama-3' => ['llama 3', 'llama-3', 'llama3'],
        ];

        foreach ($modelKeywords as $model => $keywords) {
            foreach ($keywords as $keyword) {
                if (str_contains($haystack, $keyword)) {
                    return $model;
                }
            }
        }

        return null;
    }

    /**
     * Check if the User-Agent looks like a normal web browser.
     */
    protected function looksLikeBrowser(?string $userAgent): bool
    {
        if ($userAgent === null || $userAgent === '') {
            return false;
        }

        return $this->matchesAny(self::BROWSER_INDICATORS, $userAgent);
    }

    /**
     * Detect the model from User-Agent patterns.
     *
     * Returns the key of the first pattern that matches, or null if none do.
     *
     * @param  array<string, string>  $modelPatterns  model identifier => regex
     */
    protected function detectModel(string $userAgent, array $modelPatterns): ?string
    {
        foreach ($modelPatterns as $model => $pattern) {
            if (preg_match($pattern, $userAgent) === 1) {
                return $model;
            }
        }

        return null;
    }

    /**
     * Check whether any regex in the list matches the subject.
     *
     * @param  iterable<string>  $patterns
     */
    protected function matchesAny(iterable $patterns, string $subject): bool
    {
        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $subject) === 1) {
                return true;
            }
        }

        return false;
    }

    /**
     * Create an identity for a known provider.
     *
     * Uses the provider's named factory on AgentIdentity; any other provider
     * string falls back to the plain constructor.
     */
    protected function createProviderIdentity(string $provider, ?string $model, string $confidence): AgentIdentity
    {
        return match ($provider) {
            'anthropic' => AgentIdentity::anthropic($model, $confidence),
            'openai' => AgentIdentity::openai($model, $confidence),
            'google' => AgentIdentity::google($model, $confidence),
            'meta' => AgentIdentity::meta($model, $confidence),
            'mistral' => AgentIdentity::mistral($model, $confidence),
            'local' => AgentIdentity::local($model, $confidence),
            default => new AgentIdentity($provider, $model, $confidence),
        };
    }

    /**
     * Check if a provider name is valid.
     *
     * The comparison is strict and case-sensitive; callers are expected to
     * pass a lower-cased name.
     */
    public function isValidProvider(string $provider): bool
    {
        return in_array($provider, self::VALID_PROVIDERS, true);
    }

    /**
     * Get the list of valid providers.
     *
     * @return string[]
     */
    public function getValidProviders(): array
    {
        return self::VALID_PROVIDERS;
    }

    /**
     * Check if a request appears to be from an AI agent.
     */
    public function isAgent(Request $request): bool
    {
        return $this->identify($request)->isAgent();
    }

    /**
     * Check if a User-Agent appears to be from an AI agent.
     */
    public function isAgentUserAgent(?string $userAgent): bool
    {
        return $this->identifyFromUserAgent($userAgent)->isAgent();
    }
}