memories: add rollout_summary_file header to raw memories and tune prompts (#12221)

## Summary
- Add `rollout_summary_file: <generated>.md` to each thread header in `raw_memories.md` so Phase 2 can reliably reference the canonical rollout summary filename.
- Update the memory prompts/templates (`stage_one_system`, `consolidation`, `read_path`) for the new task-oriented raw-memory / MEMORY.md schema and stronger consolidation guidance.

## Details
- `codex-rs/core/src/memories/storage.rs`
  - Writes the generated `rollout_summary_file` path into the per-thread metadata header when rebuilding `raw_memories.md`.
- `codex-rs/core/src/memories/tests.rs`
  - Verifies the canonical `rollout_summary_file` header is present and ordered after `updated_at`/`cwd` in `raw_memories.md`.
  - Verifies task-structured raw-memory content is preserved while the canonical header is added.
- `codex-rs/core/templates/memories/*.md`
  - Updates the stage-1 raw-memory format to task-grouped sections (`task`, `task_group`, `task_outcome`).
  - Updates Phase 2 consolidation guidance around recency (`updated_at`), task-oriented `MEMORY.md` blocks, and richer evidence-backed consolidation.
  - Tweaks the quick memory pass wording to emphasize topics/workflows in addition to keywords.

## Testing
- `cargo test -p codex-core memories`
2026-02-20 01:13:35 -08:00 · 2026-02-20 01:13:35 -08:00 · e747a8eb74
commit e747a8eb74
parent 18bd6d2d71
5 changed files with 298 additions and 153 deletions
--- a/codex-rs/core/src/memories/storage.rs
+++ b/codex-rs/core/src/memories/storage.rs
@ -82,11 +82,11 @@ async fn rebuild_raw_memories_file(
        )
        .map_err(raw_memories_format_error)?;
        writeln!(body, "cwd: {}", memory.cwd.display()).map_err(raw_memories_format_error)?;
-        writeln!(body).map_err(raw_memories_format_error)?;
        let rollout_summary_file = format!("{}.md", rollout_summary_file_stem(memory));
-        let raw_memory =
-            replace_rollout_summary_file_in_raw_memory(&memory.raw_memory, &rollout_summary_file);
-        body.push_str(raw_memory.trim());
+        writeln!(body, "rollout_summary_file: {rollout_summary_file}")
+            .map_err(raw_memories_format_error)?;
+        writeln!(body).map_err(raw_memories_format_error)?;
+        body.push_str(memory.raw_memory.trim());
        body.push_str("\n\n");
    }

@ -161,26 +161,6 @@ fn rollout_summary_format_error(err: std::fmt::Error) -> std::io::Error {
    std::io::Error::other(format!("format rollout summary: {err}"))
 }

-fn replace_rollout_summary_file_in_raw_memory(
-    raw_memory: &str,
-    rollout_summary_file: &str,
-) -> String {
-    const ROLLOUT_SUMMARY_PREFIX: &str = "rollout_summary_file: ";
-
-    let replacement = format!("rollout_summary_file: {rollout_summary_file}");
-    raw_memory
-        .split('\n')
-        .map(|line| {
-            if line.starts_with(ROLLOUT_SUMMARY_PREFIX) {
-                replacement.as_str()
-            } else {
-                line
-            }
-        })
-        .collect::<Vec<_>>()
-        .join("\n")
-}
-
 pub(crate) fn rollout_summary_file_stem(memory: &Stage1Output) -> String {
    rollout_summary_file_stem_from_parts(
        memory.thread_id,
@ -270,7 +250,6 @@ pub(super) fn rollout_summary_file_stem_from_parts(

 #[cfg(test)]
 mod tests {
-    use super::replace_rollout_summary_file_in_raw_memory;
    use super::rollout_summary_file_stem;
    use super::rollout_summary_file_stem_from_parts;
    use chrono::TimeZone;
@ -339,71 +318,4 @@ mod tests {

        assert_eq!(rollout_summary_file_stem(&memory), FIXED_PREFIX);
    }
-
-    #[test]
-    fn replace_rollout_summary_file_in_raw_memory_replaces_existing_value() {
-        let raw_memory = "\
---
-rollout_summary_file: wrong.md
-description: demo
-keywords: one, two
---
- body line";
-        let normalized = replace_rollout_summary_file_in_raw_memory(
-            raw_memory,
-            "2025-01-01T00-00-00-abcd-demo.md",
-        );
-
-        assert_eq!(
-            normalized,
-            "\
---
-rollout_summary_file: 2025-01-01T00-00-00-abcd-demo.md
-description: demo
-keywords: one, two
---
- body line"
-        );
-    }
-
-    #[test]
-    fn replace_rollout_summary_file_in_raw_memory_replaces_placeholder() {
-        let raw_memory = "\
---
-rollout_summary_file: <system_populated_file.md>
-description: demo
-keywords: one, two
---
- body line";
-        let normalized = replace_rollout_summary_file_in_raw_memory(
-            raw_memory,
-            "2025-01-01T00-00-00-abcd-demo.md",
-        );
-
-        assert_eq!(
-            normalized,
-            "\
---
-rollout_summary_file: 2025-01-01T00-00-00-abcd-demo.md
-description: demo
-keywords: one, two
---
- body line"
-        );
-    }
-
-    #[test]
-    fn replace_rollout_summary_file_in_raw_memory_leaves_text_without_field_unchanged() {
-        let raw_memory = "\
---
-description: demo
-keywords: one, two
---
- body line";
-        let normalized = replace_rollout_summary_file_in_raw_memory(
-            raw_memory,
-            "2025-01-01T00-00-00-abcd-demo.md",
-        );
-        assert_eq!(normalized, raw_memory);
-    }
 }
--- a/codex-rs/core/src/memories/tests.rs
+++ b/codex-rs/core/src/memories/tests.rs
@ -127,6 +127,7 @@ async fn sync_rollout_summaries_and_raw_memories_file_keeps_latest_memories_only
    }
    files.sort_unstable();
    assert_eq!(files.len(), 1);
+    let canonical_rollout_summary_file = &files[0];

    let raw_memories = tokio::fs::read_to_string(raw_memories_file(&root))
        .await
@ -134,6 +135,30 @@ async fn sync_rollout_summaries_and_raw_memories_file_keeps_latest_memories_only
    assert!(raw_memories.contains("raw memory"));
    assert!(raw_memories.contains(&keep_id));
    assert!(raw_memories.contains("cwd: /tmp/workspace"));
+    assert!(raw_memories.contains(&format!(
+        "rollout_summary_file: {canonical_rollout_summary_file}"
+    )));
+    let thread_header = format!("## Thread `{keep_id}`");
+    let thread_pos = raw_memories
+        .find(&thread_header)
+        .expect("thread header should exist");
+    let updated_pos = raw_memories[thread_pos..]
+        .find("updated_at: ")
+        .map(|offset| thread_pos + offset)
+        .expect("updated_at should exist after thread header");
+    let cwd_pos = raw_memories[thread_pos..]
+        .find("cwd: /tmp/workspace")
+        .map(|offset| thread_pos + offset)
+        .expect("cwd should exist after thread header");
+    let file_pos = raw_memories[thread_pos..]
+        .find(&format!(
+            "rollout_summary_file: {canonical_rollout_summary_file}"
+        ))
+        .map(|offset| thread_pos + offset)
+        .expect("rollout_summary_file should exist after thread header");
+    assert!(thread_pos < updated_pos);
+    assert!(updated_pos < cwd_pos);
+    assert!(cwd_pos < file_pos);
 }

 #[tokio::test]
@ -229,7 +254,7 @@ async fn sync_rollout_summaries_uses_timestamp_hash_and_sanitized_slug_filename(
 }

 #[tokio::test]
-async fn rebuild_raw_memories_file_rewrites_rollout_summary_file_to_canonical_filename() {
+async fn rebuild_raw_memories_file_adds_canonical_rollout_summary_file_header() {
    let dir = tempdir().expect("tempdir");
    let root = dir.path().join("memory");
    ensure_layout(&root).await.expect("ensure layout");
@ -241,11 +266,20 @@ async fn rebuild_raw_memories_file_rewrites_rollout_summary_file_to_canonical_fi
        source_updated_at: Utc.timestamp_opt(200, 0).single().expect("timestamp"),
        raw_memory: "\
 ---
-rollout_summary_file: state_migration_uniqueness_test.md
 description: Added a migration test
 keywords: codex-state, migrations
 ---
- Kept details."
+### Task 1: migration-test
+task: add-migration-test
+task_group: codex-state
+task_outcome: success
+- Added regression coverage for migration uniqueness.
+
+### Task 2: validate-migration
+task: validate-migration-ordering
+task_group: codex-state
+task_outcome: success
+- Confirmed no ordering regressions."
            .to_string(),
        rollout_summary: "short summary".to_string(),
        rollout_slug: Some("Unsafe Slug/With Spaces & Symbols + EXTRA_LONG_12345".to_string()),
@ -285,8 +319,11 @@ keywords: codex-state, migrations
    assert!(raw_memories.contains(&format!(
        "rollout_summary_file: {canonical_rollout_summary_file}"
    )));
-    assert!(!raw_memories.contains("rollout_summary_file: state_migration_uniqueness_test.md"));
    assert!(raw_memories.contains("description: Added a migration test"));
+    assert!(raw_memories.contains("### Task 1: migration-test"));
+    assert!(raw_memories.contains("task: add-migration-test"));
+    assert!(raw_memories.contains("task_group: codex-state"));
+    assert!(raw_memories.contains("task_outcome: success"));
 }

 mod phase2 {
--- a/codex-rs/core/templates/memories/consolidation.md
+++ b/codex-rs/core/templates/memories/consolidation.md
@ -17,7 +17,8 @@ CONTEXT: MEMORY FOLDER STRUCTURE

 Folder structure (under {{ memory_root }}/):
 - memory_summary.md
-  - Always loaded into the system prompt. Must remain tiny and highly navigational.
+  - Always loaded into the system prompt. Must remain informative and highly navigational,
+    but still discriminative enough to guide retrieval.
 - MEMORY.md
  - Handbook entries. Used to grep for keywords; aggregated insights from rollouts;
    pointers to rollout summaries if certain past rollouts are very relevant.
@ -40,8 +41,10 @@ GLOBAL SAFETY, HYGIENE, AND NO-FILLER RULES (STRICT)
 - Evidence-based only: do not invent facts or claim verification that did not happen.
 - Redact secrets: never store tokens/keys/passwords; replace with [REDACTED_SECRET].
 - Avoid copying large tool outputs. Prefer compact summaries + exact error snippets + pointers.
- **No-op is allowed and preferred** when there is no meaningful, reusable learning worth saving.
-  - If nothing is worth saving, make NO file changes.
+- No-op content updates are allowed and preferred when there is no meaningful, reusable
+  learning worth saving.
+  - INIT mode: still create minimal required files (`MEMORY.md` and `memory_summary.md`).
+  - INCREMENTAL UPDATE mode: if nothing is worth saving, make no file changes.

 ============================================================
 WHAT COUNTS AS HIGH-SIGNAL MEMORY
@ -97,7 +100,10 @@ Primary inputs (always read these, if exists):
 Under `{{ memory_root }}/`:
 - `raw_memories.md`
  - mechanical merge of `raw_memories` from Phase 1;
-  - source of rollout-level metadata needed for MEMORY.md header annotations;
+  - ordered latest-first; use this recency ordering as a major heuristic when choosing
+    what to promote, expand, or deprecate;
+  - source of rollout-level metadata needed for MEMORY.md `### rollout_summary_files`
+    annotations;
    you should be able to find `cwd` and `updated_at` there.
 - `MEMORY.md`
  - merged memories; produce a lightly clustered version if applicable
@ -123,45 +129,148 @@ Rules:
 - If there is no meaningful signal to add beyond what already exists, keep outputs minimal.
 - You should always make sure `MEMORY.md` and `memory_summary.md` exist and are up to date.
 - Follow the format and schema of the artifacts below.
+- Do not target fixed counts (memory blocks, task groups, topics, or bullets). Let the
+  signal determine the granularity and depth.
+- Quality objective: for high-signal task families, `MEMORY.md` should be materially more
+  useful than `raw_memories.md` while remaining easy to navigate.

 ============================================================
 1) `MEMORY.md` FORMAT (STRICT)
 ============================================================

-Clustered schema:
---
-rollout_summary_files:
-  - <file1.md> (<annotation that includes status/usefulness, cwd, and updated_at, e.g. "success, most useful architecture walkthrough, cwd=/repo/path, updated_at=2026-02-12T10:30:00Z">)
-  - <file2.md> (<annotation with cwd=/..., updated_at=...>)
-description: brief description of the shared tasks/outcomes
-keywords: k1, k2, k3, ... <searchable handles (tool names, error names, repo concepts, contracts)>
---
+`MEMORY.md` is the durable, retrieval-oriented handbook. Each block should be easy to grep
+and rich enough to reuse without reopening raw rollout logs.
+
+Each memory block MUST start with:
+
+# Task Group: <repo / project / workflow / detail-task family; broad but distinguishable>
+
+scope: <what this block covers, when to use it, and notable boundaries>
+
+- `Task Group` is for retrieval. Choose granularity based on memory density:
+  repo / project / workflow / detail-task family.
+- `scope:` is for scanning. Keep it short and operational.
+
+Body format (strict):
+
+- Use the task-grouped markdown structure below (headings + bullets). Do not use a flat
+  bullet dump.
+- The header (`# Task Group: ...` + `scope: ...`) is the index. The body contains
+  task-level detail.
+- Every `## Task <n>` section MUST include task-local rollout files, task-local keywords,
+  and task-specific learnings.
+- Use `-` bullets for lists and learnings. Do not use `*`.
+- No bolding text in the memory body.
+
+Required task-oriented body shape (strict):
+
+## Task 1: <task description, outcome>
+
+task: <specific, searchable task signature; avoid fluff>
+
+### rollout_summary_files
+
+- <rollout_summaries/file1.md> (cwd=<path>, updated_at=<timestamp>, <optional status/usefulness note>)
+
+### keywords
+
+- <task-local retrieval handles: tool names, error strings, repo concepts, APIs/contracts>
+
+### learnings
+
+- <task-specific learnings>
+- <user expectation, preference, style, tone, feedback>
+- <what worked, what failed, validation, reusable procedure, etc.>
+- <failure shields: symptom -> cause -> fix>
+- <scope boundaries / anti-drift notes when relevant>
+- <uncertainty explicitly preserved if unresolved>
+
+## Task 2: <task description, outcome>
+
+task: <specific, searchable task signature; avoid fluff>
+
+### rollout_summary_files

- <Structured memory entries. Use bullets. No bolding text.>
 - ...

-Schema rules (strict):
- Keep entries compact and retrieval-friendly.
- A single note block may correspond to multiple related tasks; aggregate when tasks and lessons align.
- In `rollout_summary_files`, each parenthesized annotation must include
-  `cwd=<path>` and `updated_at=<timestamp>` copied from that rollout summary metadata.
-  If missing from an individual rollout summary, recover them from `raw_memories.md`.
- If you need to reference skills, do it in the BODY as bullets, not in the header
-  (e.g., "- Related skill: skills/<skill-name>/SKILL.md").
- Use lowercase, hyphenated skill folder names.
- Preserve provenance: include the relevant rollout_summary_file(s) for the block.
+### keywords

-What to write in memory entries: Extract the highest-signal takeaways from the rollout
-summaries, especially from "User preferences", "Reusable knowledge", "References", and
-"Things that did not work / things that can be improved".
-Write what would most help a future agent doing a similar (or adjacent) task: decision
-triggers, key steps, proven commands/paths, and failure shields (symptom -> cause -> fix),
-plus any stable user preferences.
-If a rollout summary contains stable user profile details or preferences that generalize,
-capture them here so they're easy to find and can be reflected in memory_summary.md.
-The goal of MEMORY.md is to support related-but-not-identical future tasks, so keep
-insights slightly more general; when a future task is very similar, expect the agent to
-use the rollout summary for full detail.
+- ...
+
+### learnings
+
+- <task-specific memories / learnings>
+
+... More `## Task <n>` sections if needed
+
+## General Tips
+
+- <cross-task guidance, deduplicated and generalized> [Task 1]
+- <conflict/staleness resolution note using task references> [Task 1][Task 2]
+- <structured memory bullets; no bolding>
+
+Schema rules (strict):
+- A) Structure and consistency
+  - Exact block shape: `# Task Group`, `scope:`, one or more `## Task <n>`, and
+    `## General Tips`.
+  - Keep all tasks and tips inside the task family implied by the block header.
+  - Keep entries retrieval-friendly, but not shallow.
+  - Do not emit placeholder values (`task: task`, `# Task Group: misc`, `scope: general`, etc.).
+- B) Task boundaries and clustering
+  - Primary organization unit is the task (`## Task <n>`), not the rollout file.
+  - Default mapping: one coherent rollout summary -> one MEMORY block -> one `## Task 1`.
+  - If a rollout contains multiple distinct tasks, split them into multiple `## Task <n>`
+    sections. If those tasks belong to different task families, split into separate
+    MEMORY blocks (`# Task Group`).
+  - A MEMORY block may include multiple rollouts only when they belong to the same
+    task group and the task intent, technical context, and outcome pattern align.
+  - A single `## Task <n>` section may cite multiple rollout summaries when they are
+    iterative attempts or follow-up runs for the same task.
+  - Do not cluster on keyword overlap alone.
+  - When in doubt, preserve boundaries (separate tasks/blocks) rather than over-cluster.
+- C) Provenance and metadata
+  - Every `## Task <n>` section must include `### rollout_summary_files`, `### keywords`,
+    and `### learnings`.
+  - `### rollout_summary_files` must be task-local (not a block-wide catch-all list).
+  - Each rollout annotation must include `cwd=<path>` and `updated_at=<timestamp>`.
+    If missing from a rollout summary, recover them from `raw_memories.md`.
+  - Major learnings should be traceable to rollout summaries listed in the same task section.
+  - Order rollout references by freshness and practical usefulness.
+- D) Retrieval and references
+  - `task:` lines must be specific and searchable.
+  - `### keywords` should be discriminative and task-local (tool names, error strings,
+    repo concepts, APIs/contracts).
+  - Put task-specific detail in `## Task <n>` and only deduplicated cross-task guidance in
+    `## General Tips`.
+  - If you reference skills, do it in body bullets only (for example:
+    `- Related skill: skills/<skill-name>/SKILL.md`).
+  - Use lowercase, hyphenated skill folder names.
+- E) Ordering and conflict handling
+  - For grouped blocks, order `## Task <n>` sections by practical usefulness, then recency.
+  - Treat `updated_at` as a first-class signal: fresher validated evidence usually wins.
+  - If evidence conflicts and validation is unclear, preserve the uncertainty explicitly.
+  - In `## General Tips`, cite task references (`[Task 1]`, `[Task 2]`, etc.) when
+    merging, deduplicating, or resolving evidence.
+
+What to write:
+- Extract the takeaways from rollout summaries and raw_memories, especially sections like
+  "User preferences", "Reusable knowledge", "References", and "Things that did not work".
+- Optimize for future related tasks: decision triggers, validated commands/paths,
+  verification steps, and failure shields (symptom -> cause -> fix).
+- Capture stable user preferences/details that generalize so they can also inform
+  `memory_summary.md`.
+- `MEMORY.md` should support related-but-not-identical tasks: slightly more general than a
+  rollout summary, but still operational and concrete.
+- Use `raw_memories.md` as the routing layer; deep-dive into `rollout_summaries/*.md` when:
+  - the task is high-value and needs richer detail,
+  - multiple rollouts overlap and need conflict/staleness resolution,
+  - raw memory wording is too terse/ambiguous to consolidate confidently,
+  - you need stronger evidence, validation context, or user feedback.
+- Each block should be useful on its own and materially richer than `memory_summary.md`:
+  - include concrete triggers, commands/paths, and failure shields,
+  - include outcome-specific notes (what worked, what failed, what remains uncertain),
+  - include scope boundaries / anti-drift notes when they affect future task success,
+  - include stale/conflict notes when newer evidence changes prior guidance.

 ============================================================
 2) `memory_summary.md` FORMAT (STRICT)
@ -210,17 +319,23 @@ For example, include (when known):
 ## What's in Memory
 This is a compact index to help future agents quickly find details in `MEMORY.md`,
 `skills/`, and `rollout_summaries/`.
-Organize by topic. Each bullet should include: topic, keywords (used to search over
-memory files), and a brief description.
+Organize by topic. Each bullet must include: topic, keywords, and a clear description.
 Ordered by utility - which is the most likely to be useful for a future agent.
+Do not target a fixed topic count. Cover the real high-signal areas and omit low-signal noise.
+Prefer grouping by task family / workflow intent, not by incidental tools alone.

 Recommended format:
 - <topic>: <keyword1>, <keyword2>, <keyword3>, ...
-  - desc: <brief description>
+  - desc: <clear and specific description of what is inside this topic and when to use it>

 Notes:
 - Do not include large snippets; push details into MEMORY.md and rollout summaries.
 - Prefer topics/keywords that help a future agent search MEMORY.md efficiently.
+- Prefer clear topic taxonomy over verbose drill-down pointers.
+- Keep descriptions explicit enough that a future model can decide which keyword cluster
+  to search first for a new user query.
+- Topic descriptions should mention what is inside, when to use it, and what kind of
+  outcome/procedure depth is available (for example: runbook, diagnostics, reporting, recovery).

 ============================================================
 3) `skills/` FORMAT (optional)
@ -303,29 +418,41 @@ WORKFLOW
     - create initial `skills/*` (optional but highly recommended)
     - write `memory_summary.md` last (highest-signal file)
   - Use your best efforts to get the most high-quality memory files
-   - Do not be lazy at browsing files at the INIT phase
+   - Do not be lazy about browsing files in INIT mode; deep-dive high-value rollouts and
+     conflicting task families until MEMORY blocks are richer and more useful than raw memories

 3) INCREMENTAL UPDATE behavior:
   - Treat `raw_memories.md` as the primary source of NEW signal.
   - Read existing memory files first for continuity.
   - Integrate new signal into existing artifacts by:
+     - scanning new raw memories in recency order and identifying which existing blocks they should update
     - updating existing knowledge with better/newer evidence
     - updating stale or contradicting guidance
+     - expanding terse old blocks when new summaries/raw memories make the task family clearer
     - doing light clustering and merging if needed
     - updating existing skills or adding new skills only when there is clear new reusable procedure
     - update `memory_summary.md` last to reflect the final state of the memory folder

-4) For both modes, update `MEMORY.md` after skill updates:
-   - add clear **Related skills** pointers in the BODY of corresponding note blocks (do
-     not change the YAML header schema)
+4) Evidence deep-dive rule (both modes):
+   - `raw_memories.md` is the routing layer, not always the final authority for detail.
+   - When a task family is important, ambiguous, or duplicated across multiple rollouts,
+     open the relevant `rollout_summaries/*.md` files and extract richer procedural detail,
+     validation signals, and user feedback before finalizing `MEMORY.md`.
+   - Use `updated_at` and validation strength together to resolve stale/conflicting notes.

-5) Housekeeping (optional):
+5) For both modes, update `MEMORY.md` after skill updates:
+   - add clear related-skill pointers as plain bullets in the BODY of corresponding task
+     sections (do not change the `# Task Group` / `scope:` block header format)
+
+6) Housekeeping (optional):
   - remove clearly redundant/low-signal rollout summaries
   - if multiple summaries overlap for the same thread, keep the best one

-6) Final pass:
+7) Final pass:
   - remove duplication in memory_summary, skills/, and MEMORY.md
   - ensure any referenced skills/summaries actually exist
+   - ensure MEMORY blocks and "What's in Memory" use a consistent task-oriented taxonomy
+   - ensure recent important task families are easy to find (description + keywords + topic wording)
   - if there is no net-new or higher-quality signal to add, keep changes minimal (no
     churn for its own sake).

@ -341,6 +468,6 @@ Use `rg` for fast retrieval while consolidating:
 - Search durable notes:
  `rg -n -i "<pattern>" "{{ memory_root }}/MEMORY.md"`
 - Search across memory tree:
-  `rg -n -i "<pattern>" "{{ memory_root }}" | head -n 50`
+  `rg -n -i "<pattern>" "{{ memory_root }}" | head -n 100`
 - Locate rollout summary files:
-  `rg --files "{{ memory_root }}/rollout_summaries" | head -n 200`
+  `rg --files "{{ memory_root }}/rollout_summaries" | head -n 400`
--- a/codex-rs/core/templates/memories/read_path.md
+++ b/codex-rs/core/templates/memories/read_path.md
@ -25,8 +25,8 @@ again)
 - {{ base_path }}/rollout_summaries/ (per-rollout recaps + evidence snippets)

 Quick memory pass (when applicable):
-1) Skim the MEMORY_SUMMARY included below and extract a few task-relevant
-keywords (for example repo/module names, error strings, etc.).
+1) Skim the MEMORY_SUMMARY included below and extract task-relevant topics and
+keywords (for example repo/module names, workflows, error strings, etc.).
 2) Search {{ base_path }}/MEMORY.md for those keywords, and for any referenced
 rollout summary files and skills.
 3) If relevant rollout summary files and skills exist, open matching files
--- a/codex-rs/core/templates/memories/stage_one_system.md
+++ b/codex-rs/core/templates/memories/stage_one_system.md
@ -109,6 +109,16 @@ Typical real-world signals (use as examples when analyzing the rollout):
 3) User keeps iterating on the same task:
   - Requests for fixes/revisions on the same artifact usually mean partial, not success.
   - Requesting a restart or pointing out contradictions often indicates fail.
+4) Last task in the rollout:
+   - Treat the final task more conservatively than earlier tasks.
+   - If there is no explicit user feedback or environment validation for the final task,
+     prefer `uncertain` (or `partial` if there was obvious progress but no confirmation).
+   - For non-final tasks, switching to another task without unresolved blockers is a stronger
+     positive signal.
+
+Signal priority:
+- Explicit user feedback and explicit environment/test/tool validation outrank all heuristics.
+- If heuristic signals conflict with explicit feedback, follow explicit feedback.

 Fallback heuristics:
  - Success: explicit "done/works", tests pass, correct artifact produced, user
@ -152,6 +162,8 @@ This summary should be very comprehensive and detailed, because it will be furth
 distilled into MEMORY.md and memory_summary.md.
 There is no strict size limit, and you should feel free to list a lot of points here as
 long as they are helpful.
+Do not target fixed counts (tasks, bullets, references, or topics). Let the rollout's
+signal density decide how much to write.
 Instructional notes in angle brackets are guidance only; do not include them verbatim in the rollout summary.

 Template (items are flexible; include only what is useful):
@ -170,7 +182,7 @@ User preferences: <explicit or inferred from user messages; include how you infe

 <Then followed by tasks in this rollout. Each task is a section; sections below are optional per task.>

-## Task <idx>: <short task name>
+## Task <idx>: <task name>
 Outcome: <success|partial|fail|uncertain>

 Key steps:
@ -188,9 +200,9 @@ Things that did not work / things that can be improved:
  user approval.">
 - ...

-Reusable knowledge: <you are encouraged to list 3-10 points for each task here, anything
-helpful counts, stick to facts. Don't put opinions or suggestions from the assistant
-that are not validated by the user.>
+Reusable knowledge: <list as many durable, evidence-backed points as needed for this task.
+Anything helpful counts; stick to facts. Don't put vague opinions or suggestions from the
+assistant that are not validated.>
 - <facts that will be helpful for future agents, such as how the system works, anything
  that took the agent some effort to figure out, user preferences, etc.>
 - <e.g. "When running evals, you should pass in the flag `some flag
@ -226,22 +238,70 @@ shows or why it matters>:
  - [3] final verification evidence or explicit user feedback


-## Task <idx> (if there are multiple tasks): <short task name>
+## Task <idx> (if there are multiple tasks): <task name>
 ...

+Task section quality bar (strict):
+- Each task section should be detailed enough that another agent can understand it without
+  reopening the raw rollout.
+- For each task, cover the following when evidence exists (and state uncertainty when it
+  does not):
+  - what the user wanted / expected,
+  - what was attempted and what actually worked,
+  - what failed or remained uncertain and why,
+  - how the outcome was validated (user feedback, tests, tool output, or explicit lack of validation),
+  - reusable procedure/checklist and failure shields,
+  - concrete artifacts/commands/paths/error signatures that future agents can reuse.
+- Do not be terse in task sections. Rich, evidence-backed task summaries are preferred
+  over compact summaries.
+
 ============================================================
 `raw_memory` FORMAT (STRICT)
 ============================================================

 The schema is below.
 ---
-rollout_summary_file: <file.md>
-description: brief description of the task and outcome
+description: concise but information-dense description of the primary task(s), outcome, and highest-value takeaway
+task: <primary_task_signature>
+task_group: <repo_or_workflow_bucket>
+task_outcome: <success|partial|fail|uncertain>
 keywords: k1, k2, k3, ... <searchable handles (tool names, error names, repo concepts, contracts)>
 ---
- <Structured memory entries. Use bullets. No bolding text.>
+
+Then write task-grouped body content (required):
+### Task 1: <short task name>
+task: <task signature for this task>
+task_group: <project/workflow topic>
+task_outcome: <success|partial|fail|uncertain>
+- <useful memory bullet>
 - ...

+### Task 2: <short task name> (if needed)
+task: ...
+task_group: ...
+task_outcome: ...
+- ...
+
+Preferred task-block body shape (strongly recommended):
+- `### Task <n>` blocks should preserve task-specific retrieval signal and consolidation-ready detail.
+- Within each task block, include bullets that explicitly cover (when applicable):
+  - user goal / expected outcome,
+  - what worked (key steps, commands, code paths, artifacts),
+  - what did not work or drifted (and what pivot worked),
+  - validation state (user confirmation, tests, runtime checks, or missing validation),
+  - reusable procedure/checklist and failure shields,
+  - high-signal evidence pointers (error strings, commands, files, IDs, URLs, etc.).
+- Prefer labeled bullets when useful (for example: `- User goal: ...`, `- Validation: ...`,
+  `- Failure shield: ...`) so Phase 2 can retrieve and consolidate faster.
+
+Task grouping rules (strict):
+- Every distinct user task in the thread must appear as its own `### Task <n>` block.
+- Do not merge unrelated tasks into one block just because they happen in the same thread.
+- If a thread contains only one task, keep exactly one task block.
+- For each task block, keep the outcome tied to evidence relevant to that task.
+- If a thread has partially related tasks, prefer splitting into separate task blocks and
+  linking them through shared keywords rather than merging.
+
 What to write in memory entries: Extract useful takeaways from the rollout summaries,
 especially from "User preferences", "Reusable knowledge", "References", and
 "Things that did not work / things that can be improved".
@ -249,10 +309,17 @@ Write what would help a future agent doing a similar (or adjacent) task: decisio
 triggers, key steps, proven commands/paths, and failure shields (symptom -> cause -> fix),
 plus any stable user preferences.
 If a rollout summary contains stable user profile details or preferences that generalize,
-capture them here so they're easy to find and can be reflected in memory_summary.md.
+capture them here so they're easy to find without checking the rollout summary.
 The goal is to support related-but-not-identical future tasks, so keep
 insights slightly more general; when a future task is very similar, expect the agent to
 use the rollout summary for full detail.
+For each task block, include enough detail to be useful for future agent reference:
+- what the user wanted and expected,
+- what was attempted and what actually worked,
+- what failed or remained uncertain and why,
+- what evidence validates the outcome (user feedback, environment/test feedback, or lack of both),
+- reusable procedures/checklists and failure shields that should survive future similar tasks,
+- artifacts and retrieval handles (commands, file paths, error strings, IDs) that make the task easy to rediscover.


 ============================================================
@ -264,4 +331,6 @@ WORKFLOW
 1) Triage outcome using the common rules.
 2) Read the rollout carefully (do not miss user messages/tool calls/outputs).
 3) Return `rollout_summary`, `rollout_slug`, and `raw_memory`, valid JSON only.
-   No markdown wrapper, no prose outside JSON.
+   No markdown wrapper, no prose outside JSON.
+
+- Do not be terse in task sections. Include validation signal, failure mode, and reusable procedure per task when available.