memories: add rollout_summary_file header to raw memories and tune prompts (#12221)

## Summary
- Add `rollout_summary_file: <generated>.md` to each thread header in
`raw_memories.md` so Phase 2 can reliably reference the canonical
rollout summary filename.
- Update the memory prompts/templates (`stage_one_system`,
`consolidation`, `read_path`) for the new task-oriented raw-memory /
MEMORY.md schema and stronger consolidation guidance.

## Details
- `codex-rs/core/src/memories/storage.rs`
- Writes the generated `rollout_summary_file` path into the per-thread
metadata header when rebuilding `raw_memories.md`.
- `codex-rs/core/src/memories/tests.rs`
- Verifies the canonical `rollout_summary_file` header is present and
ordered after `updated_at`/`cwd` in `raw_memories.md`.
- Verifies task-structured raw-memory content is preserved while the
canonical header is added.
- `codex-rs/core/templates/memories/*.md`
- Updates the stage-1 raw-memory format to task-grouped sections
(`task`, `task_group`, `task_outcome`).
- Updates Phase 2 consolidation guidance around recency (`updated_at`),
task-oriented `MEMORY.md` blocks, and richer evidence-backed
consolidation.
- Tweaks the quick memory pass wording to emphasize topics/workflows in
addition to keywords.

## Testing
- `cargo test -p codex-core memories`
This commit is contained in:
zuxin-oai 2026-02-20 01:13:35 -08:00 committed by GitHub
parent 18bd6d2d71
commit e747a8eb74
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 298 additions and 153 deletions

View file

@ -82,11 +82,11 @@ async fn rebuild_raw_memories_file(
)
.map_err(raw_memories_format_error)?;
writeln!(body, "cwd: {}", memory.cwd.display()).map_err(raw_memories_format_error)?;
writeln!(body).map_err(raw_memories_format_error)?;
let rollout_summary_file = format!("{}.md", rollout_summary_file_stem(memory));
let raw_memory =
replace_rollout_summary_file_in_raw_memory(&memory.raw_memory, &rollout_summary_file);
body.push_str(raw_memory.trim());
writeln!(body, "rollout_summary_file: {rollout_summary_file}")
.map_err(raw_memories_format_error)?;
writeln!(body).map_err(raw_memories_format_error)?;
body.push_str(memory.raw_memory.trim());
body.push_str("\n\n");
}
@ -161,26 +161,6 @@ fn rollout_summary_format_error(err: std::fmt::Error) -> std::io::Error {
std::io::Error::other(format!("format rollout summary: {err}"))
}
/// Returns a copy of `raw_memory` in which every line that begins with
/// `rollout_summary_file: ` is replaced by a line naming `rollout_summary_file`.
///
/// The text is split on `\n` and re-joined with `\n`, so all other lines
/// (including empty ones and a trailing empty segment) are carried over as-is.
fn replace_rollout_summary_file_in_raw_memory(
    raw_memory: &str,
    rollout_summary_file: &str,
) -> String {
    const ROLLOUT_SUMMARY_PREFIX: &str = "rollout_summary_file: ";
    let canonical_line = format!("{ROLLOUT_SUMMARY_PREFIX}{rollout_summary_file}");

    let mut rewritten = String::with_capacity(raw_memory.len());
    for (index, line) in raw_memory.split('\n').enumerate() {
        // Re-insert the separator consumed by `split` between segments.
        if index > 0 {
            rewritten.push('\n');
        }
        if line.starts_with(ROLLOUT_SUMMARY_PREFIX) {
            rewritten.push_str(&canonical_line);
        } else {
            rewritten.push_str(line);
        }
    }
    rewritten
}
pub(crate) fn rollout_summary_file_stem(memory: &Stage1Output) -> String {
rollout_summary_file_stem_from_parts(
memory.thread_id,
@ -270,7 +250,6 @@ pub(super) fn rollout_summary_file_stem_from_parts(
#[cfg(test)]
mod tests {
use super::replace_rollout_summary_file_in_raw_memory;
use super::rollout_summary_file_stem;
use super::rollout_summary_file_stem_from_parts;
use chrono::TimeZone;
@ -339,71 +318,4 @@ mod tests {
assert_eq!(rollout_summary_file_stem(&memory), FIXED_PREFIX);
}
// An existing `rollout_summary_file:` value in the front matter is overwritten
// with the supplied filename; every other line is expected to come back as-is.
#[test]
fn replace_rollout_summary_file_in_raw_memory_replaces_existing_value() {
// Input carries a stale filename (`wrong.md`).
let raw_memory = "\
---
rollout_summary_file: wrong.md
description: demo
keywords: one, two
---
- body line";
let normalized = replace_rollout_summary_file_in_raw_memory(
raw_memory,
"2025-01-01T00-00-00-abcd-demo.md",
);
// Only the `rollout_summary_file` line should differ from the input.
assert_eq!(
normalized,
"\
---
rollout_summary_file: 2025-01-01T00-00-00-abcd-demo.md
description: demo
keywords: one, two
---
- body line"
);
}
// A placeholder value (`<system_populated_file.md>`) on the
// `rollout_summary_file:` line is replaced with the supplied filename.
#[test]
fn replace_rollout_summary_file_in_raw_memory_replaces_placeholder() {
// Input uses the placeholder instead of a concrete filename.
let raw_memory = "\
---
rollout_summary_file: <system_populated_file.md>
description: demo
keywords: one, two
---
- body line";
let normalized = replace_rollout_summary_file_in_raw_memory(
raw_memory,
"2025-01-01T00-00-00-abcd-demo.md",
);
// Only the `rollout_summary_file` line should differ from the input.
assert_eq!(
normalized,
"\
---
rollout_summary_file: 2025-01-01T00-00-00-abcd-demo.md
description: demo
keywords: one, two
---
- body line"
);
}
// When no line starts with `rollout_summary_file: `, the helper is expected to
// return text equal to its input (no field is inserted).
#[test]
fn replace_rollout_summary_file_in_raw_memory_leaves_text_without_field_unchanged() {
// Front matter without a `rollout_summary_file` entry.
let raw_memory = "\
---
description: demo
keywords: one, two
---
- body line";
let normalized = replace_rollout_summary_file_in_raw_memory(
raw_memory,
"2025-01-01T00-00-00-abcd-demo.md",
);
assert_eq!(normalized, raw_memory);
}
}

View file

@ -127,6 +127,7 @@ async fn sync_rollout_summaries_and_raw_memories_file_keeps_latest_memories_only
}
files.sort_unstable();
assert_eq!(files.len(), 1);
let canonical_rollout_summary_file = &files[0];
let raw_memories = tokio::fs::read_to_string(raw_memories_file(&root))
.await
@ -134,6 +135,30 @@ async fn sync_rollout_summaries_and_raw_memories_file_keeps_latest_memories_only
assert!(raw_memories.contains("raw memory"));
assert!(raw_memories.contains(&keep_id));
assert!(raw_memories.contains("cwd: /tmp/workspace"));
assert!(raw_memories.contains(&format!(
"rollout_summary_file: {canonical_rollout_summary_file}"
)));
let thread_header = format!("## Thread `{keep_id}`");
let thread_pos = raw_memories
.find(&thread_header)
.expect("thread header should exist");
let updated_pos = raw_memories[thread_pos..]
.find("updated_at: ")
.map(|offset| thread_pos + offset)
.expect("updated_at should exist after thread header");
let cwd_pos = raw_memories[thread_pos..]
.find("cwd: /tmp/workspace")
.map(|offset| thread_pos + offset)
.expect("cwd should exist after thread header");
let file_pos = raw_memories[thread_pos..]
.find(&format!(
"rollout_summary_file: {canonical_rollout_summary_file}"
))
.map(|offset| thread_pos + offset)
.expect("rollout_summary_file should exist after thread header");
assert!(thread_pos < updated_pos);
assert!(updated_pos < cwd_pos);
assert!(cwd_pos < file_pos);
}
#[tokio::test]
@ -229,7 +254,7 @@ async fn sync_rollout_summaries_uses_timestamp_hash_and_sanitized_slug_filename(
}
#[tokio::test]
async fn rebuild_raw_memories_file_rewrites_rollout_summary_file_to_canonical_filename() {
async fn rebuild_raw_memories_file_adds_canonical_rollout_summary_file_header() {
let dir = tempdir().expect("tempdir");
let root = dir.path().join("memory");
ensure_layout(&root).await.expect("ensure layout");
@ -241,11 +266,20 @@ async fn rebuild_raw_memories_file_rewrites_rollout_summary_file_to_canonical_fi
source_updated_at: Utc.timestamp_opt(200, 0).single().expect("timestamp"),
raw_memory: "\
---
rollout_summary_file: state_migration_uniqueness_test.md
description: Added a migration test
keywords: codex-state, migrations
---
- Kept details."
### Task 1: migration-test
task: add-migration-test
task_group: codex-state
task_outcome: success
- Added regression coverage for migration uniqueness.
### Task 2: validate-migration
task: validate-migration-ordering
task_group: codex-state
task_outcome: success
- Confirmed no ordering regressions."
.to_string(),
rollout_summary: "short summary".to_string(),
rollout_slug: Some("Unsafe Slug/With Spaces & Symbols + EXTRA_LONG_12345".to_string()),
@ -285,8 +319,11 @@ keywords: codex-state, migrations
assert!(raw_memories.contains(&format!(
"rollout_summary_file: {canonical_rollout_summary_file}"
)));
assert!(!raw_memories.contains("rollout_summary_file: state_migration_uniqueness_test.md"));
assert!(raw_memories.contains("description: Added a migration test"));
assert!(raw_memories.contains("### Task 1: migration-test"));
assert!(raw_memories.contains("task: add-migration-test"));
assert!(raw_memories.contains("task_group: codex-state"));
assert!(raw_memories.contains("task_outcome: success"));
}
mod phase2 {

View file

@ -17,7 +17,8 @@ CONTEXT: MEMORY FOLDER STRUCTURE
Folder structure (under {{ memory_root }}/):
- memory_summary.md
- Always loaded into the system prompt. Must remain tiny and highly navigational.
- Always loaded into the system prompt. Must remain informative and highly navigational,
but still discriminative enough to guide retrieval.
- MEMORY.md
- Handbook entries. Used to grep for keywords; aggregated insights from rollouts;
pointers to rollout summaries if certain past rollouts are very relevant.
@ -40,8 +41,10 @@ GLOBAL SAFETY, HYGIENE, AND NO-FILLER RULES (STRICT)
- Evidence-based only: do not invent facts or claim verification that did not happen.
- Redact secrets: never store tokens/keys/passwords; replace with [REDACTED_SECRET].
- Avoid copying large tool outputs. Prefer compact summaries + exact error snippets + pointers.
- **No-op is allowed and preferred** when there is no meaningful, reusable learning worth saving.
- If nothing is worth saving, make NO file changes.
- No-op content updates are allowed and preferred when there is no meaningful, reusable
learning worth saving.
- INIT mode: still create minimal required files (`MEMORY.md` and `memory_summary.md`).
- INCREMENTAL UPDATE mode: if nothing is worth saving, make no file changes.
============================================================
WHAT COUNTS AS HIGH-SIGNAL MEMORY
@ -97,7 +100,10 @@ Primary inputs (always read these, if exists):
Under `{{ memory_root }}/`:
- `raw_memories.md`
- mechanical merge of `raw_memories` from Phase 1;
- source of rollout-level metadata needed for MEMORY.md header annotations;
- ordered latest-first; use this recency ordering as a major heuristic when choosing
what to promote, expand, or deprecate;
- source of rollout-level metadata needed for MEMORY.md `### rollout_summary_files`
annotations;
you should be able to find `cwd` and `updated_at` there.
- `MEMORY.md`
- merged memories; produce a lightly clustered version if applicable
@ -123,45 +129,148 @@ Rules:
- If there is no meaningful signal to add beyond what already exists, keep outputs minimal.
- You should always make sure `MEMORY.md` and `memory_summary.md` exist and are up to date.
- Follow the format and schema of the artifacts below.
- Do not target fixed counts (memory blocks, task groups, topics, or bullets). Let the
signal determine the granularity and depth.
- Quality objective: for high-signal task families, `MEMORY.md` should be materially more
useful than `raw_memories.md` while remaining easy to navigate.
============================================================
1) `MEMORY.md` FORMAT (STRICT)
============================================================
Clustered schema:
---
rollout_summary_files:
- <file1.md> (<annotation that includes status/usefulness, cwd, and updated_at, e.g. "success, most useful architecture walkthrough, cwd=/repo/path, updated_at=2026-02-12T10:30:00Z">)
- <file2.md> (<annotation with cwd=/..., updated_at=...>)
description: brief description of the shared tasks/outcomes
keywords: k1, k2, k3, ... <searchable handles (tool names, error names, repo concepts, contracts)>
---
`MEMORY.md` is the durable, retrieval-oriented handbook. Each block should be easy to grep
and rich enough to reuse without reopening raw rollout logs.
Each memory block MUST start with:
# Task Group: <repo / project / workflow / detail-task family; broad but distinguishable>
scope: <what this block covers, when to use it, and notable boundaries>
- `Task Group` is for retrieval. Choose granularity based on memory density:
repo / project / workflow / detail-task family.
- `scope:` is for scanning. Keep it short and operational.
Body format (strict):
- Use the task-grouped markdown structure below (headings + bullets). Do not use a flat
bullet dump.
- The header (`# Task Group: ...` + `scope: ...`) is the index. The body contains
task-level detail.
- Every `## Task <n>` section MUST include task-local rollout files, task-local keywords,
and task-specific learnings.
- Use `-` bullets for lists and learnings. Do not use `*`.
- No bolding text in the memory body.
Required task-oriented body shape (strict):
## Task 1: <task description, outcome>
task: <specific, searchable task signature; avoid fluff>
### rollout_summary_files
- <rollout_summaries/file1.md> (cwd=<path>, updated_at=<timestamp>, <optional status/usefulness note>)
### keywords
- <task-local retrieval handles: tool names, error strings, repo concepts, APIs/contracts>
### learnings
- <task-specific learnings>
- <user expectation, preference, style, tone, feedback>
- <what worked, what failed, validation, reusable procedure, etc.>
- <failure shields: symptom -> cause -> fix>
- <scope boundaries / anti-drift notes when relevant>
- <uncertainty explicitly preserved if unresolved>
## Task 2: <task description, outcome>
task: <specific, searchable task signature; avoid fluff>
### rollout_summary_files
- <Structured memory entries. Use bullets. No bolding text.>
- ...
Schema rules (strict):
- Keep entries compact and retrieval-friendly.
- A single note block may correspond to multiple related tasks; aggregate when tasks and lessons align.
- In `rollout_summary_files`, each parenthesized annotation must include
`cwd=<path>` and `updated_at=<timestamp>` copied from that rollout summary metadata.
If missing from an individual rollout summary, recover them from `raw_memories.md`.
- If you need to reference skills, do it in the BODY as bullets, not in the header
(e.g., "- Related skill: skills/<skill-name>/SKILL.md").
- Use lowercase, hyphenated skill folder names.
- Preserve provenance: include the relevant rollout_summary_file(s) for the block.
### keywords
What to write in memory entries: Extract the highest-signal takeaways from the rollout
summaries, especially from "User preferences", "Reusable knowledge", "References", and
"Things that did not work / things that can be improved".
Write what would most help a future agent doing a similar (or adjacent) task: decision
triggers, key steps, proven commands/paths, and failure shields (symptom -> cause -> fix),
plus any stable user preferences.
If a rollout summary contains stable user profile details or preferences that generalize,
capture them here so they're easy to find and can be reflected in memory_summary.md.
The goal of MEMORY.md is to support related-but-not-identical future tasks, so keep
insights slightly more general; when a future task is very similar, expect the agent to
use the rollout summary for full detail.
- ...
### learnings
- <task-specific memories / learnings>
... More `## Task <n>` sections if needed
## General Tips
- <cross-task guidance, deduplicated and generalized> [Task 1]
- <conflict/staleness resolution note using task references> [Task 1][Task 2]
- <structured memory bullets; no bolding>
Schema rules (strict):
- A) Structure and consistency
- Exact block shape: `# Task Group`, `scope:`, one or more `## Task <n>`, and
`## General Tips`.
- Keep all tasks and tips inside the task family implied by the block header.
- Keep entries retrieval-friendly, but not shallow.
- Do not emit placeholder values (`task: task`, `# Task Group: misc`, `scope: general`, etc.).
- B) Task boundaries and clustering
- Primary organization unit is the task (`## Task <n>`), not the rollout file.
- Default mapping: one coherent rollout summary -> one MEMORY block -> one `## Task 1`.
- If a rollout contains multiple distinct tasks, split them into multiple `## Task <n>`
sections. If those tasks belong to different task families, split into separate
MEMORY blocks (`# Task Group`).
- A MEMORY block may include multiple rollouts only when they belong to the same
task group and the task intent, technical context, and outcome pattern align.
- A single `## Task <n>` section may cite multiple rollout summaries when they are
iterative attempts or follow-up runs for the same task.
- Do not cluster on keyword overlap alone.
- When in doubt, preserve boundaries (separate tasks/blocks) rather than over-cluster.
- C) Provenance and metadata
- Every `## Task <n>` section must include `### rollout_summary_files`, `### keywords`,
and `### learnings`.
- `### rollout_summary_files` must be task-local (not a block-wide catch-all list).
- Each rollout annotation must include `cwd=<path>` and `updated_at=<timestamp>`.
If missing from a rollout summary, recover them from `raw_memories.md`.
- Major learnings should be traceable to rollout summaries listed in the same task section.
- Order rollout references by freshness and practical usefulness.
- D) Retrieval and references
- `task:` lines must be specific and searchable.
- `### keywords` should be discriminative and task-local (tool names, error strings,
repo concepts, APIs/contracts).
- Put task-specific detail in `## Task <n>` and only deduplicated cross-task guidance in
`## General Tips`.
- If you reference skills, do it in body bullets only (for example:
`- Related skill: skills/<skill-name>/SKILL.md`).
- Use lowercase, hyphenated skill folder names.
- E) Ordering and conflict handling
- For grouped blocks, order `## Task <n>` sections by practical usefulness, then recency.
- Treat `updated_at` as a first-class signal: fresher validated evidence usually wins.
- If evidence conflicts and validation is unclear, preserve the uncertainty explicitly.
- In `## General Tips`, cite task references (`[Task 1]`, `[Task 2]`, etc.) when
merging, deduplicating, or resolving evidence.
What to write:
- Extract the takeaways from rollout summaries and raw_memories, especially sections like
"User preferences", "Reusable knowledge", "References", and "Things that did not work".
- Optimize for future related tasks: decision triggers, validated commands/paths,
verification steps, and failure shields (symptom -> cause -> fix).
- Capture stable user preferences/details that generalize so they can also inform
`memory_summary.md`.
- `MEMORY.md` should support related-but-not-identical tasks: slightly more general than a
rollout summary, but still operational and concrete.
- Use `raw_memories.md` as the routing layer; deep-dive into `rollout_summaries/*.md` when:
- the task is high-value and needs richer detail,
- multiple rollouts overlap and need conflict/staleness resolution,
- raw memory wording is too terse/ambiguous to consolidate confidently,
- you need stronger evidence, validation context, or user feedback.
- Each block should be useful on its own and materially richer than `memory_summary.md`:
- include concrete triggers, commands/paths, and failure shields,
- include outcome-specific notes (what worked, what failed, what remains uncertain),
- include scope boundaries / anti-drift notes when they affect future task success,
- include stale/conflict notes when newer evidence changes prior guidance.
============================================================
2) `memory_summary.md` FORMAT (STRICT)
@ -210,17 +319,23 @@ For example, include (when known):
## What's in Memory
This is a compact index to help future agents quickly find details in `MEMORY.md`,
`skills/`, and `rollout_summaries/`.
Organize by topic. Each bullet should include: topic, keywords (used to search over
memory files), and a brief description.
Organize by topic. Each bullet must include: topic, keywords, and a clear description.
Order topics by utility: list first those most likely to be useful to a future agent.
Do not target a fixed topic count. Cover the real high-signal areas and omit low-signal noise.
Prefer grouping by task family / workflow intent, not by incidental tools alone.
Recommended format:
- <topic>: <keyword1>, <keyword2>, <keyword3>, ...
- desc: <brief description>
- desc: <clear and specific description of what is inside this topic and when to use it>
Notes:
- Do not include large snippets; push details into MEMORY.md and rollout summaries.
- Prefer topics/keywords that help a future agent search MEMORY.md efficiently.
- Prefer clear topic taxonomy over verbose drill-down pointers.
- Keep descriptions explicit enough that a future model can decide which keyword cluster
to search first for a new user query.
- Topic descriptions should mention what is inside, when to use it, and what kind of
outcome/procedure depth is available (for example: runbook, diagnostics, reporting, recovery).
============================================================
3) `skills/` FORMAT (optional)
@ -303,29 +418,41 @@ WORKFLOW
- create initial `skills/*` (optional but highly recommended)
- write `memory_summary.md` last (highest-signal file)
- Make your best effort to produce the highest-quality memory files
- Do not be lazy at browsing files at the INIT phase
- Do not be lazy at browsing files in INIT mode; deep-dive high-value rollouts and
conflicting task families until MEMORY blocks are richer and more useful than raw memories
3) INCREMENTAL UPDATE behavior:
- Treat `raw_memories.md` as the primary source of NEW signal.
- Read existing memory files first for continuity.
- Integrate new signal into existing artifacts by:
- scanning new raw memories in recency order and identifying which existing blocks they should update
- updating existing knowledge with better/newer evidence
- updating stale or contradicting guidance
- expanding terse old blocks when new summaries/raw memories make the task family clearer
- doing light clustering and merging if needed
- updating existing skills or adding new skills only when there is clear new reusable procedure
- update `memory_summary.md` last to reflect the final state of the memory folder
4) For both modes, update `MEMORY.md` after skill updates:
- add clear **Related skills** pointers in the BODY of corresponding note blocks (do
not change the YAML header schema)
4) Evidence deep-dive rule (both modes):
- `raw_memories.md` is the routing layer, not always the final authority for detail.
- When a task family is important, ambiguous, or duplicated across multiple rollouts,
open the relevant `rollout_summaries/*.md` files and extract richer procedural detail,
validation signals, and user feedback before finalizing `MEMORY.md`.
- Use `updated_at` and validation strength together to resolve stale/conflicting notes.
5) Housekeeping (optional):
5) For both modes, update `MEMORY.md` after skill updates:
- add clear related-skill pointers as plain bullets in the BODY of corresponding task
sections (do not change the `# Task Group` / `scope:` block header format)
6) Housekeeping (optional):
- remove clearly redundant/low-signal rollout summaries
- if multiple summaries overlap for the same thread, keep the best one
6) Final pass:
7) Final pass:
- remove duplication in memory_summary, skills/, and MEMORY.md
- ensure any referenced skills/summaries actually exist
- ensure MEMORY blocks and "What's in Memory" use a consistent task-oriented taxonomy
- ensure recent important task families are easy to find (description + keywords + topic wording)
- if there is no net-new or higher-quality signal to add, keep changes minimal (no
churn for its own sake).
@ -341,6 +468,6 @@ Use `rg` for fast retrieval while consolidating:
- Search durable notes:
`rg -n -i "<pattern>" "{{ memory_root }}/MEMORY.md"`
- Search across memory tree:
`rg -n -i "<pattern>" "{{ memory_root }}" | head -n 50`
`rg -n -i "<pattern>" "{{ memory_root }}" | head -n 100`
- Locate rollout summary files:
`rg --files "{{ memory_root }}/rollout_summaries" | head -n 200`
`rg --files "{{ memory_root }}/rollout_summaries" | head -n 400`

View file

@ -25,8 +25,8 @@ again)
- {{ base_path }}/rollout_summaries/ (per-rollout recaps + evidence snippets)
Quick memory pass (when applicable):
1) Skim the MEMORY_SUMMARY included below and extract a few task-relevant
keywords (for example repo/module names, error strings, etc.).
1) Skim the MEMORY_SUMMARY included below and extract task-relevant topics and
keywords (for example repo/module names, workflows, error strings, etc.).
2) Search {{ base_path }}/MEMORY.md for those keywords, and for any referenced
rollout summary files and skills.
3) If relevant rollout summary files and skills exist, open matching files

View file

@ -109,6 +109,16 @@ Typical real-world signals (use as examples when analyzing the rollout):
3) User keeps iterating on the same task:
- Requests for fixes/revisions on the same artifact usually mean partial, not success.
- Requesting a restart or pointing out contradictions often indicates fail.
4) Last task in the rollout:
- Treat the final task more conservatively than earlier tasks.
- If there is no explicit user feedback or environment validation for the final task,
prefer `uncertain` (or `partial` if there was obvious progress but no confirmation).
- For non-final tasks, switching to another task without unresolved blockers is a stronger
positive signal.
Signal priority:
- Explicit user feedback and explicit environment/test/tool validation outrank all heuristics.
- If heuristic signals conflict with explicit feedback, follow explicit feedback.
Fallback heuristics:
- Success: explicit "done/works", tests pass, correct artifact produced, user
@ -152,6 +162,8 @@ This summary should be very comprehensive and detailed, because it will be furth
distilled into MEMORY.md and memory_summary.md.
There is no strict size limit, and you should feel free to list a lot of points here as
long as they are helpful.
Do not target fixed counts (tasks, bullets, references, or topics). Let the rollout's
signal density decide how much to write.
Instructional notes in angle brackets are guidance only; do not include them verbatim in the rollout summary.
Template (items are flexible; include only what is useful):
@ -170,7 +182,7 @@ User preferences: <explicit or inferred from user messages; include how you infe
<Then followed by tasks in this rollout. Each task is a section; sections below are optional per task.>
## Task <idx>: <short task name>
## Task <idx>: <task name>
Outcome: <success|partial|fail|uncertain>
Key steps:
@ -188,9 +200,9 @@ Things that did not work / things that can be improved:
user approval.">
- ...
Reusable knowledge: <you are encouraged to list 3-10 points for each task here, anything
helpful counts, stick to facts. Don't put opinions or suggestions from the assistant
that are not validated by the user.>
Reusable knowledge: <list as many durable, evidence-backed points as needed for this task.
Anything helpful counts; stick to facts. Don't put vague opinions or suggestions from the
assistant that are not validated.>
- <facts that will be helpful for future agents, such as how the system works, anything
that took the agent some effort to figure out, user preferences, etc.>
- <e.g. "When running evals, you should pass in the flag `some flag
@ -226,22 +238,70 @@ shows or why it matters>:
- [3] final verification evidence or explicit user feedback
## Task <idx> (if there are multiple tasks): <short task name>
## Task <idx> (if there are multiple tasks): <task name>
...
Task section quality bar (strict):
- Each task section should be detailed enough that another agent can understand it without
reopening the raw rollout.
- For each task, cover the following when evidence exists (and state uncertainty when it
does not):
- what the user wanted / expected,
- what was attempted and what actually worked,
- what failed or remained uncertain and why,
- how the outcome was validated (user feedback, tests, tool output, or explicit lack of validation),
- reusable procedure/checklist and failure shields,
- concrete artifacts/commands/paths/error signatures that future agents can reuse.
- Do not be terse in task sections. Rich, evidence-backed task summaries are preferred
over compact summaries.
============================================================
`raw_memory` FORMAT (STRICT)
============================================================
The schema is below.
---
rollout_summary_file: <file.md>
description: brief description of the task and outcome
description: concise but information-dense description of the primary task(s), outcome, and highest-value takeaway
task: <primary_task_signature>
task_group: <repo_or_workflow_bucket>
task_outcome: <success|partial|fail|uncertain>
keywords: k1, k2, k3, ... <searchable handles (tool names, error names, repo concepts, contracts)>
---
- <Structured memory entries. Use bullets. No bolding text.>
Then write task-grouped body content (required):
### Task 1: <short task name>
task: <task signature for this task>
task_group: <project/workflow topic>
task_outcome: <success|partial|fail|uncertain>
- <useful memory bullet>
- ...
### Task 2: <short task name> (if needed)
task: ...
task_group: ...
task_outcome: ...
- ...
Preferred task-block body shape (strongly recommended):
- `### Task <n>` blocks should preserve task-specific retrieval signal and consolidation-ready detail.
- Within each task block, include bullets that explicitly cover (when applicable):
- user goal / expected outcome,
- what worked (key steps, commands, code paths, artifacts),
- what did not work or drifted (and what pivot worked),
- validation state (user confirmation, tests, runtime checks, or missing validation),
- reusable procedure/checklist and failure shields,
- high-signal evidence pointers (error strings, commands, files, IDs, URLs, etc.).
- Prefer labeled bullets when useful (for example: `- User goal: ...`, `- Validation: ...`,
`- Failure shield: ...`) so Phase 2 can retrieve and consolidate faster.
Task grouping rules (strict):
- Every distinct user task in the thread must appear as its own `### Task <n>` block.
- Do not merge unrelated tasks into one block just because they happen in the same thread.
- If a thread contains only one task, keep exactly one task block.
- For each task block, keep the outcome tied to evidence relevant to that task.
- If a thread has partially related tasks, prefer splitting into separate task blocks and
linking them through shared keywords rather than merging.
What to write in memory entries: Extract useful takeaways from the rollout summaries,
especially from "User preferences", "Reusable knowledge", "References", and
"Things that did not work / things that can be improved".
@ -249,10 +309,17 @@ Write what would help a future agent doing a similar (or adjacent) task: decisio
triggers, key steps, proven commands/paths, and failure shields (symptom -> cause -> fix),
plus any stable user preferences.
If a rollout summary contains stable user profile details or preferences that generalize,
capture them here so they're easy to find and can be reflected in memory_summary.md.
capture them here so they're easy to find without checking the rollout summary.
The goal is to support related-but-not-identical future tasks, so keep
insights slightly more general; when a future task is very similar, expect the agent to
use the rollout summary for full detail.
For each task block, include enough detail to be useful for future agent reference:
- what the user wanted and expected,
- what was attempted and what actually worked,
- what failed or remained uncertain and why,
- what evidence validates the outcome (user feedback, environment/test feedback, or lack of both),
- reusable procedures/checklists and failure shields that should survive future similar tasks,
- artifacts and retrieval handles (commands, file paths, error strings, IDs) that make the task easy to rediscover.
============================================================
@ -264,4 +331,6 @@ WORKFLOW
1) Triage outcome using the common rules.
2) Read the rollout carefully (do not miss user messages/tool calls/outputs).
3) Return `rollout_summary`, `rollout_slug`, and `raw_memory`, valid JSON only.
No markdown wrapper, no prose outside JSON.
No markdown wrapper, no prose outside JSON.
- Do not be terse in task sections. Include validation signal, failure mode, and reusable procedure per task when available.