feat(brain): add wiki source type — ingest Forge repo wikis via API
Fetches wiki pages from all core/* repos on Forge, parses into sections, and stores as type:service with repo/lang tags. Gives the PHP orchestrator contextual knowledge about the Go services it coordinates. 71+ pages across 22+ repos, ~770 vectorised sections. Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
parent
01826bc5e9
commit
b0ed221cfa
1 changed file with 240 additions and 6 deletions
|
|
@ -6,6 +6,7 @@ namespace Core\Mod\Agentic\Console\Commands;
|
|||
|
||||
use Core\Mod\Agentic\Services\BrainService;
|
||||
use Illuminate\Console\Command;
|
||||
use Illuminate\Support\Facades\Http;
|
||||
use Symfony\Component\Finder\Finder;
|
||||
|
||||
/**
|
||||
|
|
@ -20,7 +21,7 @@ class BrainIngestCommand extends Command
|
|||
protected $signature = 'brain:ingest
|
||||
{--workspace= : Workspace ID to import into (required)}
|
||||
{--agent=virgil : Agent ID to attribute memories to}
|
||||
{--source=all : Source type: memory, plans, claude-md, tasks, docs, all}
|
||||
{--source=all : Source type: memory, plans, claude-md, tasks, docs, wiki, all}
|
||||
{--code-path= : Root code directory (default: ~/Code)}
|
||||
{--dry-run : Preview what would be imported without storing}
|
||||
{--fresh : Clear the Qdrant collection before ingesting}';
|
||||
|
|
@ -44,12 +45,16 @@ class BrainIngestCommand extends Command
|
|||
$isDryRun = (bool) $this->option('dry-run');
|
||||
|
||||
$sources = $source === 'all'
|
||||
? ['memory', 'plans', 'claude-md', 'tasks', 'docs']
|
||||
? ['memory', 'plans', 'claude-md', 'tasks', 'docs', 'wiki']
|
||||
: [strtolower($source)];
|
||||
|
||||
// Gather all files first
|
||||
// Separate file-based and API-based sources
|
||||
$fileSources = array_filter($sources, fn ($s) => $s !== 'wiki');
|
||||
$apiSources = array_filter($sources, fn ($s) => $s === 'wiki');
|
||||
|
||||
// Gather file-based sources
|
||||
$filesBySource = [];
|
||||
foreach ($sources as $src) {
|
||||
foreach ($fileSources as $src) {
|
||||
$files = match ($src) {
|
||||
'memory' => $this->discoverMemoryFiles(),
|
||||
'plans' => $this->discoverPlanFiles($codePath),
|
||||
|
|
@ -62,9 +67,16 @@ class BrainIngestCommand extends Command
|
|||
$this->info(sprintf(' [%s] %d file(s)', $src, count($files)));
|
||||
}
|
||||
|
||||
$totalFiles = array_sum(array_map('count', $filesBySource));
|
||||
// Discover wiki pages from Forge API
|
||||
$wikiPages = [];
|
||||
if (in_array('wiki', $apiSources, true)) {
|
||||
$wikiPages = $this->discoverWikiPages();
|
||||
$this->info(sprintf(' [wiki] %d page(s) across %d repo(s)', count($wikiPages), count(array_unique(array_column($wikiPages, 'repo')))));
|
||||
}
|
||||
|
||||
$totalFiles = array_sum(array_map('count', $filesBySource)) + count($wikiPages);
|
||||
$this->newLine();
|
||||
$this->info("Total: {$totalFiles} file(s) to process.");
|
||||
$this->info("Total: {$totalFiles} item(s) to process.");
|
||||
|
||||
if ($totalFiles === 0) {
|
||||
return self::SUCCESS;
|
||||
|
|
@ -87,6 +99,12 @@ class BrainIngestCommand extends Command
|
|||
}
|
||||
}
|
||||
|
||||
if (! empty($wikiPages)) {
|
||||
$this->newLine();
|
||||
$this->comment('--- wiki ---');
|
||||
$this->processWikiPages($brain, $wikiPages, (int) $workspaceId, $this->option('agent') ?? 'virgil', $isDryRun);
|
||||
}
|
||||
|
||||
$this->newLine();
|
||||
$prefix = $isDryRun ? '[DRY RUN] ' : '';
|
||||
$this->info("{$prefix}Done. Imported: {$this->stats['imported']}, Skipped: {$this->stats['skipped']}, Errors: {$this->stats['errors']}");
|
||||
|
|
@ -280,6 +298,222 @@ class BrainIngestCommand extends Command
|
|||
return $files;
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Wiki (Forge API)
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
/**
 * Discover wiki pages from all repos in the Forge org.
 *
 * Fetches the org's repository list (paginated, 50 per page), then each
 * repo's wiki page index, then each page's full content. Repos without a
 * wiki (404 on the index) and pages with empty content are skipped.
 * Returns an empty array — with a warning — when no API token is configured.
 *
 * @return array<array{repo: string, title: string, content: string}>
 */
private function discoverWikiPages(): array
{
    $baseUrl = config('upstream.gitea.url', 'https://forge.lthn.ai');
    $token = config('upstream.gitea.token');
    $org = config('upstream.gitea.org', 'core');

    if (! $token) {
        $this->warn('No Forge token — skipping wiki source.');

        return [];
    }

    $headers = ['Authorization' => 'token ' . $token];

    // Page through the org's repositories; Gitea caps list endpoints per page.
    $repos = [];
    $page = 1;

    do {
        $response = Http::withHeaders($headers)
            ->timeout(15)
            ->get("{$baseUrl}/api/v1/orgs/{$org}/repos", ['page' => $page, 'limit' => 50]);

        if (! $response->successful()) {
            $this->warn('Failed to fetch repos: ' . $response->status());
            break;
        }

        $batch = $response->json();
        if (empty($batch)) {
            break;
        }

        foreach ($batch as $r) {
            $repos[] = $r['name'];
        }
        $page++;
    } while (count($batch) === 50);

    // Fetch the wiki page index, then each page's content, for every repo.
    // NOTE(review): the /wiki/pages index may itself be paginated on very
    // large wikis — confirm against the Gitea API before relying on this.
    $pages = [];

    foreach ($repos as $repo) {
        $response = Http::withHeaders($headers)
            ->timeout(10)
            ->get("{$baseUrl}/api/v1/repos/{$org}/{$repo}/wiki/pages");

        // A 404 just means the repo has no wiki; successful() already
        // excludes it, so no separate status check is needed.
        if (! $response->successful()) {
            continue;
        }

        $wikiList = $response->json();

        if (empty($wikiList)) {
            continue;
        }

        foreach ($wikiList as $wiki) {
            $title = $wiki['title'] ?? 'Untitled';

            // Encode the title: wiki page names routinely contain spaces and
            // other characters that are unsafe in a raw URL path segment.
            $pageResponse = Http::withHeaders($headers)
                ->timeout(10)
                ->get("{$baseUrl}/api/v1/repos/{$org}/{$repo}/wiki/page/" . rawurlencode($title));

            if (! $pageResponse->successful()) {
                continue;
            }

            // Gitea returns page bodies base64-encoded; strict-decode and
            // fall back to empty on any failure.
            $content = $pageResponse->json('content_base64');
            $content = $content ? (base64_decode($content, true) ?: '') : '';

            if (trim($content) === '') {
                continue;
            }

            $pages[] = [
                'repo' => $repo,
                'title' => $title,
                'content' => $content,
            ];
        }
    }

    return $pages;
}
|
||||
|
||||
/**
 * Process wiki pages into contextual memories.
 *
 * Each page is split into markdown sections; every non-empty section is
 * stored via BrainService as a type:service memory tagged with its source,
 * repo, detected language, and humanised page title, so the PHP orchestrator
 * can reason about the Go services it coordinates. In dry-run mode the
 * sections are listed instead of stored.
 *
 * @param array<array{repo: string, title: string, content: string}> $pages
 */
private function processWikiPages(BrainService $brain, array $pages, int $workspaceId, string $agentId, bool $isDryRun): void
{
    foreach ($pages as $page) {
        $sections = $this->parseMarkdownFromString($page['content'], $page['title']);
        $repo = $page['repo'];

        // Detect language from the repo naming convention (php-*, go-*, or the bare 'go' repo).
        $lang = str_starts_with($repo, 'php-') ? 'php' : (str_starts_with($repo, 'go-') || $repo === 'go' ? 'go' : 'mixed');

        foreach ($sections as $section) {
            if (trim($section['content']) === '') {
                $this->stats['skipped']++;

                continue;
            }

            $tags = [
                'source:wiki',
                'repo:' . $repo,
                'lang:' . $lang,
                str_replace(['-', '_'], ' ', $page['title']),
            ];

            if ($isDryRun) {
                $this->line(sprintf(
                    ' %s/%s :: %s — %d chars [%s]',
                    $repo,
                    $page['title'],
                    $section['heading'],
                    // mb_strlen so the "chars" label reports characters, not bytes.
                    mb_strlen($section['content']),
                    implode(', ', $tags),
                ));
                $this->stats['imported']++;

                continue;
            }

            try {
                // Prefix with repo context so embeddings understand the service.
                $text = "[{$repo}] {$section['heading']}\n\n{$section['content']}";

                // Use mb_* consistently: the original checked byte length but
                // truncated by character count, so multibyte text over 3800
                // bytes yet under 3800 chars got an ellipsis appended without
                // actually being shortened.
                if (mb_strlen($text) > 3800) {
                    $text = mb_substr($text, 0, 3800) . '…';
                }

                $brain->remember([
                    'workspace_id' => $workspaceId,
                    'agent_id' => $agentId,
                    'type' => 'service',
                    'content' => $text,
                    'tags' => $tags,
                    'project' => $repo,
                    'confidence' => 0.8,
                ]);
                $this->stats['imported']++;
            } catch (\Throwable $e) {
                $this->warn(' Error: ' . $repo . '/' . $page['title'] . ' :: ' . $section['heading'] . ' — ' . $e->getMessage());
                $this->stats['errors']++;
            }
        }
    }
}
|
||||
|
||||
/**
 * Parse markdown sections from a string (not a file).
 *
 * Splits on H1–H3 headings; the text under each heading becomes one section.
 * Text appearing before the first heading is not captured — unless the
 * document has no headings at all, in which case the entire trimmed content
 * becomes a single section under the fallback heading.
 *
 * @return array<array{heading: string, content: string}>
 */
private function parseMarkdownFromString(string $content, string $fallbackHeading): array
{
    if (trim($content) === '') {
        return [];
    }

    $sections = [];
    $heading = '';
    $buffer = [];

    // Flush the buffered body under the current heading, if both are non-empty.
    $flush = static function () use (&$sections, &$heading, &$buffer): void {
        if ($heading === '' || $buffer === []) {
            return;
        }
        $body = trim(implode("\n", $buffer));
        if ($body !== '') {
            $sections[] = ['heading' => $heading, 'content' => $body];
        }
    };

    foreach (explode("\n", $content) as $line) {
        if (preg_match('/^#{1,3}\s+(.+)$/', $line, $m) === 1) {
            // New heading: close out the previous section and start fresh.
            $flush();
            $heading = trim($m[1]);
            $buffer = [];
        } else {
            $buffer[] = $line;
        }
    }

    // Close out the final section.
    $flush();

    // Heading-less documents become one section under the fallback heading.
    if ($sections === [] && trim($content) !== '') {
        $sections = [['heading' => $fallbackHeading, 'content' => trim($content)]];
    }

    return $sections;
}
|
||||
|
||||
/** @return array<string> */
|
||||
private function findMd(string $dir): array
|
||||
{
|
||||
|
|
|
|||
Reference in a new issue