2026-02-23 12:04:18 +00:00
2 changed files with 181 additions and 49 deletions
--- a/Services/AgentDetection.php
+++ b/Services/AgentDetection.php
@ -17,106 +17,221 @@ use Illuminate\Http\Request;
 * - Absence of typical browser indicators
 *
 * Part of the Trees for Agents system for rewarding AI agent referrals.
+ *
+ * Detection priority (highest to lowest):
+ * 1. MCP token header (X-MCP-Token) — registered agents with explicit identity
+ * 2. User-Agent provider patterns — matches known AI client strings
+ * 3. Non-agent bot patterns — rules out search crawlers and monitoring tools
+ * 4. Browser indicators — rules out real browser traffic
+ * 5. Unknown agent fallback — programmatic access with no identifying UA
+ *
+ * Usage:
+ * ```php
+ * $detection = app(AgentDetection::class);
+ *
+ * // From a full HTTP request (checks MCP token first, then User-Agent)
+ * $identity = $detection->identify($request);
+ *
+ * // From a User-Agent string directly
+ * $identity = $detection->identifyFromUserAgent('claude-code/1.0 anthropic-api');
+ *
+ * // Quick boolean check
+ * if ($detection->isAgent($request)) {
+ *     // credit the referral tree
+ * }
+ *
+ * // Inspect the result
+ * echo $identity->provider;    // e.g. "anthropic"
+ * echo $identity->model;       // e.g. "claude-sonnet" or null
+ * echo $identity->confidence;  // e.g. "high"
+ * var_dump($identity->isAgent()); // bool(true) / bool(false) — echo would print "1" or ""
+ * ```
 */
 class AgentDetection
 {
    /**
     * User-Agent patterns for known AI providers.
     *
-     * @var array<string, array{pattern: string, model_pattern: ?string}>
+     * Each entry maps a provider key to an array of detection patterns and optional
+     * model-specific sub-patterns. Patterns are tested in order; the first match wins.
+     *
+     * Provider patterns (case-insensitive):
+     *
+     * - anthropic:
+     *   Examples: "claude-code/1.0", "Anthropic-API/2.0 claude-sonnet",
+     *             "Claude AI Assistant/1.0", "claude code (agentic)"
+     *
+     * - openai:
+     *   Examples: "ChatGPT-User/1.0", "OpenAI/1.0 python-httpx/0.26",
+     *             "GPT-4-turbo/2024-04", "o1-preview/2024-09", "o1-mini/1.0"
+     *
+     * - google:
+     *   Examples: "Google-AI/1.0", "Gemini/1.5-pro", "Google Bard/0.1",
+     *             "PaLM API/1.0 google-generativeai/0.3"
+     *
+     * - meta:
+     *   Examples: "Meta AI/1.0", "LLaMA/2.0 meta-ai", "Llama-3/2024-04",
+     *             "Llama-2-chat/70B"
+     *
+     * - mistral:
+     *   Examples: "Mistral/0.1.0 mistralai-python/0.1", "Mixtral-8x7B/1.0",
+     *             "Mistral-Large/latest"
+     *
+     * Model patterns narrow the detection to a specific model variant within a provider
+     * when the User-Agent includes version/model information.
+     *
+     * @var array<string, array{patterns: string[], model_patterns: array<string, string>}>
     */
    protected const PROVIDER_PATTERNS = [
        'anthropic' => [
            'patterns' => [
-                '/claude[\s\-_]?code/i',
-                '/\banthopic\b/i',
-                '/\banthropic[\s\-_]?api\b/i',
-                '/\bclaude\b.*\bai\b/i',
-                '/\bclaude\b.*\bassistant\b/i',
+                '/claude[\s\-_]?code/i',       // e.g. "claude-code/1.0", "claude code"
+                '/\banthopic\b/i',              // e.g. "Anthopic/1.0" — matches only the misspelling "anthopic"; a correctly spelled "Anthropic/1.0" does NOT match this pattern
+                '/\banthropic[\s\-_]?api\b/i',  // e.g. "Anthropic-API/2.0"
+                '/\bclaude\b.*\bai\b/i',        // e.g. "Claude AI Assistant/1.0"
+                '/\bclaude\b.*\bassistant\b/i', // e.g. "Claude-Assistant/2.1"
            ],
            'model_patterns' => [
-                'claude-opus' => '/claude[\s\-_]?opus/i',
-                'claude-sonnet' => '/claude[\s\-_]?sonnet/i',
-                'claude-haiku' => '/claude[\s\-_]?haiku/i',
+                'claude-opus'   => '/claude[\s\-_]?opus/i',   // e.g. "claude-opus-4-5"
+                'claude-sonnet' => '/claude[\s\-_]?sonnet/i', // e.g. "claude-sonnet-4-6"
+                'claude-haiku'  => '/claude[\s\-_]?haiku/i',  // e.g. "claude-haiku-4-5"
            ],
        ],
        'openai' => [
            'patterns' => [
-                '/\bChatGPT\b/i',
-                '/\bOpenAI\b/i',
-                '/\bGPT[\s\-_]?4\b/i',
-                '/\bGPT[\s\-_]?3\.?5\b/i',
-                '/\bo1[\s\-_]?preview\b/i',
-                '/\bo1[\s\-_]?mini\b/i',
+                '/\bChatGPT\b/i',           // e.g. "ChatGPT-User/1.0"
+                '/\bOpenAI\b/i',            // e.g. "OpenAI/1.0 python-httpx/0.26"
+                '/\bGPT[\s\-_]?4\b/i',      // e.g. "GPT-4-turbo/2024-04"
+                '/\bGPT[\s\-_]?3\.?5\b/i',  // e.g. "GPT-3.5-turbo/1.0"
+                '/\bo1[\s\-_]?preview\b/i', // e.g. "o1-preview/2024-09"
+                '/\bo1[\s\-_]?mini\b/i',    // e.g. "o1-mini/1.0"
            ],
            'model_patterns' => [
-                'gpt-4' => '/\bGPT[\s\-_]?4/i',
-                'gpt-3.5' => '/\bGPT[\s\-_]?3\.?5/i',
-                'o1' => '/\bo1[\s\-_]?(preview|mini)?\b/i',
+                'gpt-4'   => '/\bGPT[\s\-_]?4/i',           // e.g. "GPT-4o", "GPT-4-turbo"
+                'gpt-3.5' => '/\bGPT[\s\-_]?3\.?5/i',       // e.g. "GPT-3.5-turbo"
+                'o1'      => '/\bo1[\s\-_]?(preview|mini)?\b/i', // e.g. "o1", "o1-preview", "o1-mini"
            ],
        ],
        'google' => [
            'patterns' => [
-                '/\bGoogle[\s\-_]?AI\b/i',
-                '/\bGemini\b/i',
-                '/\bBard\b/i',
-                '/\bPaLM\b/i',
+                '/\bGoogle[\s\-_]?AI\b/i', // e.g. "Google-AI/1.0"
+                '/\bGemini\b/i',           // e.g. "Gemini/1.5-pro", "gemini-flash"
+                '/\bBard\b/i',             // e.g. "Google Bard/0.1" (legacy)
+                '/\bPaLM\b/i',             // e.g. "PaLM API/1.0" (legacy)
            ],
            'model_patterns' => [
-                'gemini-pro' => '/gemini[\s\-_]?(1\.5[\s\-_]?)?pro/i',
-                'gemini-ultra' => '/gemini[\s\-_]?(1\.5[\s\-_]?)?ultra/i',
-                'gemini-flash' => '/gemini[\s\-_]?(1\.5[\s\-_]?)?flash/i',
+                'gemini-pro'   => '/gemini[\s\-_]?(1\.5[\s\-_]?)?pro/i',   // e.g. "gemini-1.5-pro"
+                'gemini-ultra' => '/gemini[\s\-_]?(1\.5[\s\-_]?)?ultra/i', // e.g. "gemini-ultra"
+                'gemini-flash' => '/gemini[\s\-_]?(1\.5[\s\-_]?)?flash/i', // e.g. "gemini-1.5-flash"
            ],
        ],
        'meta' => [
            'patterns' => [
-                '/\bMeta[\s\-_]?AI\b/i',
-                '/\bLLaMA\b/i',
-                '/\bLlama[\s\-_]?[23]\b/i',
+                '/\bMeta[\s\-_]?AI\b/i',    // e.g. "Meta AI/1.0"
+                '/\bLLaMA\b/i',             // e.g. "LLaMA/2.0 meta-ai"
+                '/\bLlama[\s\-_]?[23]\b/i', // e.g. "Llama-3/2024-04", "Llama-2-chat"
            ],
            'model_patterns' => [
-                'llama-3' => '/llama[\s\-_]?3/i',
-                'llama-2' => '/llama[\s\-_]?2/i',
+                'llama-3' => '/llama[\s\-_]?3/i', // e.g. "Llama-3-8B", "llama3-70b"
+                'llama-2' => '/llama[\s\-_]?2/i', // e.g. "Llama-2-chat/70B"
            ],
        ],
        'mistral' => [
            'patterns' => [
-                '/\bMistral\b/i',
-                '/\bMixtral\b/i',
+                '/\bMistral\b/i', // e.g. "Mistral/0.1.0 mistralai-python/0.1"
+                '/\bMixtral\b/i', // e.g. "Mixtral-8x7B/1.0"
            ],
            'model_patterns' => [
-                'mistral-large' => '/mistral[\s\-_]?large/i',
-                'mistral-medium' => '/mistral[\s\-_]?medium/i',
-                'mixtral' => '/mixtral/i',
+                'mistral-large'  => '/mistral[\s\-_]?large/i',  // e.g. "mistral-large-latest"
+                'mistral-medium' => '/mistral[\s\-_]?medium/i', // e.g. "mistral-medium"
+                'mixtral'        => '/mixtral/i',               // e.g. "Mixtral-8x7B-Instruct"
            ],
        ],
    ];

    /**
     * Patterns that indicate a typical web browser.
-     * If none of these are present, it might be programmatic access.
+     *
+     * If none of these tokens appear in a User-Agent string, the request is likely
+     * programmatic (a script, CLI tool, or potential agent). The patterns cover all
+     * major browser families and legacy rendering engine identifiers.
+     *
+     * Examples of matching User-Agents:
+     * - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0"
+     * - "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2) ... Safari/537.36"
+     * - "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0"
+     * - "Mozilla/5.0 ... Edg/120.0"        — Microsoft Edge (Chromium)
+     * - "Opera/9.80 (Windows NT 6.1) Presto/2.12.388 Version/12.18" — Opera (classic, Presto engine)
+     * - "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)" — Internet Explorer
+     * - "Mozilla/5.0 ... Trident/7.0; rv:11.0" — IE 11 (Trident engine)
     */
    protected const BROWSER_INDICATORS = [
-        '/\bMozilla\b/i',
-        '/\bChrome\b/i',
-        '/\bSafari\b/i',
-        '/\bFirefox\b/i',
-        '/\bEdge\b/i',
-        '/\bOpera\b/i',
-        '/\bMSIE\b/i',
-        '/\bTrident\b/i',
+        '/\bMozilla\b/i', // All Gecko/WebKit/Blink browsers include "Mozilla/5.0"
+        '/\bChrome\b/i',  // Chrome, Chromium, and most Chromium-based browsers
+        '/\bSafari\b/i',  // Safari and WebKit-based browsers
+        '/\bFirefox\b/i', // Mozilla Firefox
+        '/\bEdge\b/i',    // Legacy Microsoft Edge ("Edge/"); Chromium Edge sends "Edg/", which this pattern does not match (its UA still matches Mozilla)
+        '/\bOpera\b/i',   // Classic Opera ("Opera/"); modern Opera sends "OPR/", which this pattern does not match (its UA still matches Mozilla)
+        '/\bMSIE\b/i',    // Internet Explorer 10 and earlier (e.g. "MSIE 10.0"); IE 11's default UA omits the MSIE token
+        '/\bTrident\b/i', // IE 11 Trident rendering engine token
    ];

    /**
     * Known bot patterns that are NOT AI agents.
-     * These should return notAnAgent, not unknown.
+     *
+     * These should resolve to `AgentIdentity::notAnAgent()` rather than
+     * `AgentIdentity::unknownAgent()`, because we can positively identify them
+     * as a specific non-AI automated client (crawler, monitoring, HTTP library, etc.).
+     *
+     * Categories and example User-Agents:
+     *
+     * Search engine crawlers:
+     * - "Googlebot/2.1 (+http://www.google.com/bot.html)"
+     * - "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"
+     * - "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"
+     * - "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)"
+     * - "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
+     * - "Applebot/0.1 (+http://www.apple.com/go/applebot)"
+     *
+     * Social media / link-preview bots:
+     * - "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)"
+     * - "Twitterbot/1.0"
+     * - "LinkedInBot/1.0 (compatible; Mozilla/5.0; Apache-HttpClient/4.5)"
+     * - "Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)"
+     * - "DiscordBot (https://discordapp.com) 1.0"
+     * - "TelegramBot (like TwitterBot)"
+     * - "WhatsApp/2.23.20 A"
+     *
+     * SEO / analytics crawlers:
+     * - "Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot.html)"
+     * - "Mozilla/5.0 (compatible; AhrefsBot/7.0; +http://ahrefs.com/robot/)"
+     *
+     * Generic HTTP clients (scripts, developer tools):
+     * - "curl/7.88.1"
+     * - "Wget/1.21.4"
+     * - "python-requests/2.31.0"
+     * - "Go-http-client/2.0"
+     * - "PostmanRuntime/7.35.0"
+     * - "insomnia/2023.5.8"
+     * - "axios/1.6.0"
+     * - "node-fetch/2.6.11"
+     *
+     * Uptime / monitoring services:
+     * - "UptimeRobot/2.0 (+http://www.uptimerobot.com/)"
+     * - "Pingdom.com_bot_version_1.4 (http://www.pingdom.com/)"
+     * - "Datadog Agent/7.45.0"
+     * - "NewRelicPinger/v1 AccountId=12345"
     */
    protected const NON_AGENT_BOTS = [
+        // Search engine crawlers
        '/\bGooglebot\b/i',
        '/\bBingbot\b/i',
        '/\bYandexBot\b/i',
        '/\bDuckDuckBot\b/i',
        '/\bBaiduspider\b/i',
+        '/\bApplebot\b/i',
+
+        // Social media / link-preview bots
        '/\bfacebookexternalhit\b/i',
        '/\bTwitterbot\b/i',
        '/\bLinkedInBot\b/i',
@ -124,9 +239,12 @@ class AgentDetection
        '/\bDiscordBot\b/i',
        '/\bTelegramBot\b/i',
        '/\bWhatsApp\//i',
-        '/\bApplebot\b/i',
+
+        // SEO / analytics crawlers
        '/\bSEMrushBot\b/i',
        '/\bAhrefsBot\b/i',
+
+        // Generic HTTP clients
        '/\bcurl\b/i',
        '/\bwget\b/i',
        '/\bpython-requests\b/i',
@ -135,6 +253,8 @@ class AgentDetection
        '/\bInsomnia\b/i',
        '/\baxios\b/i',
        '/\bnode-fetch\b/i',
+
+        // Uptime / monitoring services
        '/\bUptimeRobot\b/i',
        '/\bPingdom\b/i',
        '/\bDatadog\b/i',
@ -142,7 +262,19 @@ class AgentDetection
    ];

    /**
-     * The MCP token header name.
+     * The MCP token header used to identify registered AI agents.
+     *
+     * Agents send this header to bypass User-Agent heuristics and declare their
+     * identity explicitly. Two token formats are supported:
+     *
+     * - Opaque AgentApiKey token (prefix "ak_"):
+     *   Looked up in the database. Grants highest confidence when the key is active.
+     *   Example: `X-MCP-Token: ak_a1b2c3d4e5f6...`
+     *
+     * - Structured provider:model:secret token:
+     *   Encodes provider and model directly in the token value.
+     *   Example: `X-MCP-Token: anthropic:claude-sonnet:mysecret`
+     *   Example: `X-MCP-Token: openai:gpt-4:xyz789`
     */
    protected const MCP_TOKEN_HEADER = 'X-MCP-Token';

--- a/TODO.md
+++ b/TODO.md
@ -169,10 +169,10 @@ Production-quality task list for the AI agent orchestration package.

 ### Documentation Gaps

- [ ] **DOC-001: Add PHPDoc to AgentDetection patterns**
+- [x] **DOC-001: Add PHPDoc to AgentDetection patterns** (FIXED 2026-02-23)
  - Location: `Services/AgentDetection.php`
  - Issue: User-Agent patterns undocumented
-  - Fix: Document each pattern with agent examples
+  - Fix: Added PHPDoc with real UA examples to all pattern constants, class-level usage examples, and MCP_TOKEN_HEADER docs

 - [ ] **DOC-002: Document MCP tool dependency system**
  - Location: `Mcp/Tools/Agent/` directory