diff --git a/Services/AgentDetection.php b/Services/AgentDetection.php index f85c183..026f412 100644 --- a/Services/AgentDetection.php +++ b/Services/AgentDetection.php @@ -17,106 +17,221 @@ use Illuminate\Http\Request; * - Absence of typical browser indicators * * Part of the Trees for Agents system for rewarding AI agent referrals. + * + * Detection priority (highest to lowest): + * 1. MCP token header (X-MCP-Token) — registered agents with explicit identity + * 2. User-Agent provider patterns — matches known AI client strings + * 3. Non-agent bot patterns — rules out search crawlers and monitoring tools + * 4. Browser indicators — rules out real browser traffic + * 5. Unknown agent fallback — programmatic access with no identifying UA + * + * Usage: + * ```php + * $detection = app(AgentDetection::class); + * + * // From a full HTTP request (checks MCP token first, then User-Agent) + * $identity = $detection->identify($request); + * + * // From a User-Agent string directly + * $identity = $detection->identifyFromUserAgent('claude-code/1.0 anthropic-api'); + * + * // Quick boolean check + * if ($detection->isAgent($request)) { + * // credit the referral tree + * } + * + * // Inspect the result + * echo $identity->provider; // e.g. "anthropic" + * echo $identity->model; // e.g. "claude-sonnet" or null + * echo $identity->confidence; // e.g. "high" + * echo $identity->isAgent(); // true / false + * ``` */ class AgentDetection { /** * User-Agent patterns for known AI providers. * - * @var array + * Each entry maps a provider key to an array of detection patterns and optional + * model-specific sub-patterns. Patterns are tested in order; the first match wins. 
+ * + * Provider patterns (case-insensitive): + * + * - anthropic: + * Examples: "claude-code/1.0", "Anthropic-API/2.0 claude-sonnet", + * "Claude AI Assistant/1.0", "claude code (agentic)" + * + * - openai: + * Examples: "ChatGPT-User/1.0", "OpenAI/1.0 python-httpx/0.26", + * "GPT-4-turbo/2024-04", "o1-preview/2024-09", "o1-mini/1.0" + * + * - google: + * Examples: "Google-AI/1.0", "Gemini/1.5-pro", "Google Bard/0.1", + * "PaLM API/1.0 google-generativeai/0.3" + * + * - meta: + * Examples: "Meta AI/1.0", "LLaMA/2.0 meta-ai", "Llama-3/2024-04", + * "Llama-2-chat/70B" + * + * - mistral: + * Examples: "Mistral/0.1.0 mistralai-python/0.1", "Mixtral-8x7B/1.0", + * "Mistral-Large/latest" + * + * Model patterns narrow the detection to a specific model variant within a provider + * when the User-Agent includes version/model information. + * + * @var array<string, array{patterns: string[], model_patterns: array<string, string>}> */ protected const PROVIDER_PATTERNS = [ 'anthropic' => [ 'patterns' => [ - '/claude[\s\-_]?code/i', - '/\banthopic\b/i', - '/\banthropic[\s\-_]?api\b/i', - '/\bclaude\b.*\bai\b/i', - '/\bclaude\b.*\bassistant\b/i', + '/claude[\s\-_]?code/i', // e.g. "claude-code/1.0", "claude code" + '/\banthopic\b/i', // matches only the misspelling "anthopic" (typo tolerance); does NOT match "Anthropic/1.0" + '/\banthropic[\s\-_]?api\b/i', // e.g. "Anthropic-API/2.0" + '/\bclaude\b.*\bai\b/i', // e.g. "Claude AI Assistant/1.0" + '/\bclaude\b.*\bassistant\b/i', // e.g. "Claude-Assistant/2.1" ], 'model_patterns' => [ - 'claude-opus' => '/claude[\s\-_]?opus/i', - 'claude-sonnet' => '/claude[\s\-_]?sonnet/i', - 'claude-haiku' => '/claude[\s\-_]?haiku/i', + 'claude-opus' => '/claude[\s\-_]?opus/i', // e.g. "claude-opus-4-5" + 'claude-sonnet' => '/claude[\s\-_]?sonnet/i', // e.g. "claude-sonnet-4-6" + 'claude-haiku' => '/claude[\s\-_]?haiku/i', // e.g. 
"claude-haiku-4-5" ], ], 'openai' => [ 'patterns' => [ - '/\bChatGPT\b/i', - '/\bOpenAI\b/i', - '/\bGPT[\s\-_]?4\b/i', - '/\bGPT[\s\-_]?3\.?5\b/i', - '/\bo1[\s\-_]?preview\b/i', - '/\bo1[\s\-_]?mini\b/i', + '/\bChatGPT\b/i', // e.g. "ChatGPT-User/1.0" + '/\bOpenAI\b/i', // e.g. "OpenAI/1.0 python-httpx/0.26" + '/\bGPT[\s\-_]?4\b/i', // e.g. "GPT-4-turbo/2024-04" + '/\bGPT[\s\-_]?3\.?5\b/i', // e.g. "GPT-3.5-turbo/1.0" + '/\bo1[\s\-_]?preview\b/i', // e.g. "o1-preview/2024-09" + '/\bo1[\s\-_]?mini\b/i', // e.g. "o1-mini/1.0" ], 'model_patterns' => [ - 'gpt-4' => '/\bGPT[\s\-_]?4/i', - 'gpt-3.5' => '/\bGPT[\s\-_]?3\.?5/i', - 'o1' => '/\bo1[\s\-_]?(preview|mini)?\b/i', + 'gpt-4' => '/\bGPT[\s\-_]?4/i', // e.g. "GPT-4o", "GPT-4-turbo" + 'gpt-3.5' => '/\bGPT[\s\-_]?3\.?5/i', // e.g. "GPT-3.5-turbo" + 'o1' => '/\bo1[\s\-_]?(preview|mini)?\b/i', // e.g. "o1", "o1-preview", "o1-mini" ], ], 'google' => [ 'patterns' => [ - '/\bGoogle[\s\-_]?AI\b/i', - '/\bGemini\b/i', - '/\bBard\b/i', - '/\bPaLM\b/i', + '/\bGoogle[\s\-_]?AI\b/i', // e.g. "Google-AI/1.0" + '/\bGemini\b/i', // e.g. "Gemini/1.5-pro", "gemini-flash" + '/\bBard\b/i', // e.g. "Google Bard/0.1" (legacy) + '/\bPaLM\b/i', // e.g. "PaLM API/1.0" (legacy) ], 'model_patterns' => [ - 'gemini-pro' => '/gemini[\s\-_]?(1\.5[\s\-_]?)?pro/i', - 'gemini-ultra' => '/gemini[\s\-_]?(1\.5[\s\-_]?)?ultra/i', - 'gemini-flash' => '/gemini[\s\-_]?(1\.5[\s\-_]?)?flash/i', + 'gemini-pro' => '/gemini[\s\-_]?(1\.5[\s\-_]?)?pro/i', // e.g. "gemini-1.5-pro" + 'gemini-ultra' => '/gemini[\s\-_]?(1\.5[\s\-_]?)?ultra/i', // e.g. "gemini-ultra" + 'gemini-flash' => '/gemini[\s\-_]?(1\.5[\s\-_]?)?flash/i', // e.g. "gemini-1.5-flash" ], ], 'meta' => [ 'patterns' => [ - '/\bMeta[\s\-_]?AI\b/i', - '/\bLLaMA\b/i', - '/\bLlama[\s\-_]?[23]\b/i', + '/\bMeta[\s\-_]?AI\b/i', // e.g. "Meta AI/1.0" + '/\bLLaMA\b/i', // e.g. "LLaMA/2.0 meta-ai" + '/\bLlama[\s\-_]?[23]\b/i', // e.g. 
"Llama-3/2024-04", "Llama-2-chat" ], 'model_patterns' => [ - 'llama-3' => '/llama[\s\-_]?3/i', - 'llama-2' => '/llama[\s\-_]?2/i', + 'llama-3' => '/llama[\s\-_]?3/i', // e.g. "Llama-3-8B", "llama3-70b" + 'llama-2' => '/llama[\s\-_]?2/i', // e.g. "Llama-2-chat/70B" ], ], 'mistral' => [ 'patterns' => [ - '/\bMistral\b/i', - '/\bMixtral\b/i', + '/\bMistral\b/i', // e.g. "Mistral/0.1.0 mistralai-python/0.1" + '/\bMixtral\b/i', // e.g. "Mixtral-8x7B/1.0" ], 'model_patterns' => [ - 'mistral-large' => '/mistral[\s\-_]?large/i', - 'mistral-medium' => '/mistral[\s\-_]?medium/i', - 'mixtral' => '/mixtral/i', + 'mistral-large' => '/mistral[\s\-_]?large/i', // e.g. "mistral-large-latest" + 'mistral-medium' => '/mistral[\s\-_]?medium/i', // e.g. "mistral-medium" + 'mixtral' => '/mixtral/i', // e.g. "Mixtral-8x7B-Instruct" ], ], ]; /** * Patterns that indicate a typical web browser. - * If none of these are present, it might be programmatic access. + * + * If none of these tokens appear in a User-Agent string, the request is likely + * programmatic (a script, CLI tool, or potential agent). The patterns cover all + * major browser families and legacy rendering engine identifiers. + * + * Examples of matching User-Agents: + * - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0" + * - "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2) ... Safari/537.36" + * - "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0" + * - "Mozilla/5.0 ... Edg/120.0" — Microsoft Edge (Chromium) + * - "Opera/9.80 ... OPR/106.0" — Opera + * - "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)" — Internet Explorer + * - "Mozilla/5.0 ... 
Trident/7.0; rv:11.0" — IE 11 (Trident engine) */ protected const BROWSER_INDICATORS = [ - '/\bMozilla\b/i', - '/\bChrome\b/i', - '/\bSafari\b/i', - '/\bFirefox\b/i', - '/\bEdge\b/i', - '/\bOpera\b/i', - '/\bMSIE\b/i', - '/\bTrident\b/i', + '/\bMozilla\b/i', // All Gecko/WebKit/Blink browsers include "Mozilla/5.0" + '/\bChrome\b/i', // Chrome, Chromium, and most Chromium-based browsers + '/\bSafari\b/i', // Safari and WebKit-based browsers + '/\bFirefox\b/i', // Mozilla Firefox + '/\bEdge\b/i', // Microsoft Edge legacy "Edge/" token only (Chromium "Edg/" is not matched by this pattern) + '/\bOpera\b/i', // Opera classic "Opera/" token only (modern "OPR/" is not matched by this pattern) + '/\bMSIE\b/i', // Internet Explorer (e.g. "MSIE 8.0") + '/\bTrident\b/i', // IE 11 Trident rendering engine token ]; /** * Known bot patterns that are NOT AI agents. - * These should return notAnAgent, not unknown. + * + * These should resolve to `AgentIdentity::notAnAgent()` rather than + * `AgentIdentity::unknownAgent()`, because we can positively identify them + * as a specific non-AI automated client (crawler, monitoring, HTTP library, etc.). 
+ * + * Categories and example User-Agents: + * + * Search engine crawlers: + * - "Googlebot/2.1 (+http://www.google.com/bot.html)" + * - "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)" + * - "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)" + * - "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)" + * - "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" + * - "Applebot/0.1 (+http://www.apple.com/go/applebot)" + * + * Social media / link-preview bots: + * - "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)" + * - "Twitterbot/1.0" + * - "LinkedInBot/1.0 (compatible; Mozilla/5.0; Apache-HttpClient/4.5)" + * - "Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)" + * - "DiscordBot (https://discordapp.com) 1.0" + * - "TelegramBot (like TwitterBot)" + * - "WhatsApp/2.23.20 A" + * + * SEO / analytics crawlers: + * - "Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot.html)" + * - "Mozilla/5.0 (compatible; AhrefsBot/7.0; +http://ahrefs.com/robot/)" + * + * Generic HTTP clients (scripts, developer tools): + * - "curl/7.88.1" + * - "Wget/1.21.4" + * - "python-requests/2.31.0" + * - "Go-http-client/2.0" + * - "PostmanRuntime/7.35.0" + * - "insomnia/2023.5.8" + * - "axios/1.6.0" + * - "node-fetch/2.6.11" + * + * Uptime / monitoring services: + * - "UptimeRobot/2.0 (+http://www.uptimerobot.com/)" + * - "Pingdom.com_bot_version_1.4 (http://www.pingdom.com/)" + * - "Datadog Agent/7.45.0" + * - "NewRelicPinger/v1 AccountId=12345" */ protected const NON_AGENT_BOTS = [ + // Search engine crawlers '/\bGooglebot\b/i', '/\bBingbot\b/i', '/\bYandexBot\b/i', '/\bDuckDuckBot\b/i', '/\bBaiduspider\b/i', + '/\bApplebot\b/i', + + // Social media / link-preview bots '/\bfacebookexternalhit\b/i', '/\bTwitterbot\b/i', '/\bLinkedInBot\b/i', @@ -124,9 +239,12 @@ class AgentDetection '/\bDiscordBot\b/i', '/\bTelegramBot\b/i', '/\bWhatsApp\//i', - 
'/\bApplebot\b/i', + + // SEO / analytics crawlers '/\bSEMrushBot\b/i', '/\bAhrefsBot\b/i', + + // Generic HTTP clients '/\bcurl\b/i', '/\bwget\b/i', '/\bpython-requests\b/i', @@ -135,6 +253,8 @@ class AgentDetection '/\bInsomnia\b/i', '/\baxios\b/i', '/\bnode-fetch\b/i', + + // Uptime / monitoring services '/\bUptimeRobot\b/i', '/\bPingdom\b/i', '/\bDatadog\b/i', @@ -142,7 +262,19 @@ class AgentDetection ]; /** - * The MCP token header name. + * The MCP token header used to identify registered AI agents. + * + * Agents send this header to bypass User-Agent heuristics and declare their + * identity explicitly. Two token formats are supported: + * + * - Opaque AgentApiKey token (prefix "ak_"): + * Looked up in the database. Grants highest confidence when the key is active. + * Example: `X-MCP-Token: ak_a1b2c3d4e5f6...` + * + * - Structured provider:model:secret token: + * Encodes provider and model directly in the token value. + * Example: `X-MCP-Token: anthropic:claude-sonnet:mysecret` + * Example: `X-MCP-Token: openai:gpt-4:xyz789` */ protected const MCP_TOKEN_HEADER = 'X-MCP-Token'; diff --git a/TODO.md b/TODO.md index 6118fae..c572634 100644 --- a/TODO.md +++ b/TODO.md @@ -169,10 +169,10 @@ Production-quality task list for the AI agent orchestration package. ### Documentation Gaps -- [ ] **DOC-001: Add PHPDoc to AgentDetection patterns** +- [x] **DOC-001: Add PHPDoc to AgentDetection patterns** (FIXED 2026-02-23) - Location: `Services/AgentDetection.php` - Issue: User-Agent patterns undocumented - - Fix: Document each pattern with agent examples + - Fix: Added PHPDoc with real UA examples to all pattern constants, class-level usage examples, and MCP_TOKEN_HEADER docs - [ ] **DOC-002: Document MCP tool dependency system** - Location: `Mcp/Tools/Agent/` directory