Upload folder using huggingface_hub
Browse files- envs/agent_stress_test_env/models.py +34 -8
- envs/agent_stress_test_env/openenv.yaml +92 -8
- envs/agent_stress_test_env/server/graders.py +292 -0
- envs/agent_stress_test_env/server/stress_test_environment.py +67 -19
- envs/agent_stress_test_env/server/workflow_simulator.py +135 -4
- models.py +34 -8
- openenv.yaml +92 -8
- server/graders.py +292 -0
- server/stress_test_environment.py +67 -19
- server/workflow_simulator.py +135 -4
envs/agent_stress_test_env/models.py
CHANGED
|
@@ -23,9 +23,16 @@ class ResilienceConfig(Action):
|
|
| 23 |
|
| 24 |
The agent outputs this to fix multi-agent workflow failures.
|
| 25 |
Supports different fix types based on failure mode:
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
"""
|
| 30 |
|
| 31 |
retry_max: int = 0
|
|
@@ -38,11 +45,30 @@ class ResilienceConfig(Action):
|
|
| 38 |
min_review_depth: int = 1
|
| 39 |
consistency_check: bool = False
|
| 40 |
|
| 41 |
-
#
|
| 42 |
-
spec_fix: str = ""
|
| 43 |
-
explicit_role_spec: bool = False
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
|
| 48 |
class StressTestObservation(Observation):
|
|
|
|
| 23 |
|
| 24 |
The agent outputs this to fix multi-agent workflow failures.
|
| 25 |
Supports different fix types based on failure mode:
|
| 26 |
+
|
| 27 |
+
MAST Categories (NeurIPS 2025):
|
| 28 |
+
- FC1: System Design (41.8%) - spec, termination, memory
|
| 29 |
+
- FC2: Inter-Agent Misalignment (36.9%) - format, reasoning-action
|
| 30 |
+
- FC3: Task Verification (21.3%) - verification checks
|
| 31 |
+
|
| 32 |
+
IBM 2026 Updates:
|
| 33 |
+
- FM-1.5/FM-3.1: Termination awareness (FATAL)
|
| 34 |
+
- FM-1.4: Memory/Context loss (FATAL)
|
| 35 |
+
- FM-2.6: Reasoning-action mismatch (FATAL)
|
| 36 |
"""
|
| 37 |
|
| 38 |
retry_max: int = 0
|
|
|
|
| 45 |
min_review_depth: int = 1
|
| 46 |
consistency_check: bool = False
|
| 47 |
|
| 48 |
+
# MAST FC1: System Design (Easy task - spec ambiguity)
|
| 49 |
+
spec_fix: str = ""
|
| 50 |
+
explicit_role_spec: bool = False
|
| 51 |
+
|
| 52 |
+
# MAST FC2: Inter-Agent Misalignment (Medium task - format mismatch)
|
| 53 |
+
format_translator: bool = False
|
| 54 |
+
|
| 55 |
+
# MAST FC3: Task Verification (Hard task - verification failure)
|
| 56 |
+
# (uses consistency_check + min_review_depth)
|
| 57 |
+
|
| 58 |
+
# IBM 2026: FC1 - Termination Awareness (FATAL)
|
| 59 |
+
explicit_termination: bool = False
|
| 60 |
+
max_iterations: int = 0
|
| 61 |
+
|
| 62 |
+
# IBM 2026: FC1 - Memory/Context Management (FATAL)
|
| 63 |
+
context_summarization: bool = False
|
| 64 |
+
sliding_window: bool = False
|
| 65 |
+
|
| 66 |
+
# IBM 2026: FC2 - Reasoning-Action Alignment (FATAL)
|
| 67 |
+
action_validation: bool = False
|
| 68 |
+
reasoning_consistency_check: bool = False
|
| 69 |
+
|
| 70 |
+
# Agent's diagnosis of the failure mode
|
| 71 |
+
diagnosis: str = ""
|
| 72 |
|
| 73 |
|
| 74 |
class StressTestObservation(Observation):
|
envs/agent_stress_test_env/openenv.yaml
CHANGED
|
@@ -9,13 +9,13 @@ tasks:
|
|
| 9 |
- id: easy
|
| 10 |
name: "Specification Ambiguity Fix"
|
| 11 |
difficulty: easy
|
| 12 |
-
category: "MAST:
|
| 13 |
description: |
|
| 14 |
The researcher agent has a vague role definition ('You are a helpful assistant').
|
| 15 |
This causes task misinterpretation - the agent doesn't know what to research.
|
| 16 |
Your task: Provide an explicit role specification JSON with clear capabilities,
|
| 17 |
constraints, and success criteria.
|
| 18 |
-
failure_mode: "Specification ambiguity - vague role definition causes task misinterpretation"
|
| 19 |
grader:
|
| 20 |
type: programmatic
|
| 21 |
score_range: [0.0, 1.0]
|
|
@@ -30,12 +30,12 @@ tasks:
|
|
| 30 |
- id: medium
|
| 31 |
name: "Format Mismatch Fix"
|
| 32 |
difficulty: medium
|
| 33 |
-
category: "MAST: Inter-Agent Misalignment (36.9% of failures)"
|
| 34 |
description: |
|
| 35 |
Multi-agent workflow where the planner outputs YAML but the executor expects JSON.
|
| 36 |
This format mismatch causes the executor to fail (cannot parse input).
|
| 37 |
Your task: Add a format translation layer/middleware to convert YAML to JSON.
|
| 38 |
-
failure_mode: "Format mismatch - planner outputs YAML, executor expects JSON"
|
| 39 |
grader:
|
| 40 |
type: programmatic
|
| 41 |
score_range: [0.0, 1.0]
|
|
@@ -50,20 +50,21 @@ tasks:
|
|
| 50 |
- id: hard
|
| 51 |
name: "Verification Failure Fix"
|
| 52 |
difficulty: hard
|
| 53 |
-
category: "MAST: Task Verification (21.3% of failures)"
|
| 54 |
description: |
|
| 55 |
Multi-agent pipeline with verification failure. The writer produces content
|
| 56 |
with contradictions (30% rate), and the reviewer prematurely approves (60% rate)
|
| 57 |
without proper verification. This combines premature termination with incorrect verification.
|
| 58 |
Your task: Implement multi-level verification - unit checks per agent,
|
| 59 |
integration checks across outputs, and final validation against success criteria.
|
| 60 |
-
failure_mode: "Verification failure - premature termination + incorrect verification"
|
| 61 |
grader:
|
| 62 |
type: programmatic
|
| 63 |
score_range: [0.0, 1.0]
|
| 64 |
criteria: |
|
| 65 |
Based on MAST research: 21.3% of failures come from verification issues
|
| 66 |
(6.2% premature, 8.2% no verification, 9.1% incorrect).
|
|
|
|
| 67 |
The agent must add deep verification with explicit success criteria.
|
| 68 |
- +0.15 for enabling consistency_check
|
| 69 |
- +0.15 for setting min_review_depth >= 3
|
|
@@ -71,16 +72,92 @@ tasks:
|
|
| 71 |
- +0.10 for achieving 50%+ success rate
|
| 72 |
- +0.20 max for diagnosis keywords (partial credit)
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
difficulty_progression:
|
| 75 |
- easy: "Single spec issue (vague role definition) - solution: explicit spec"
|
| 76 |
- medium: "Inter-agent format mismatch (YAML vs JSON) - solution: translator"
|
| 77 |
- hard: "Verification failure (premature + incorrect) - solution: multi-level checks"
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
research_basis:
|
| 80 |
- name: "MAST: Multi-Agent System Failure Taxonomy"
|
| 81 |
source: "NeurIPS 2025 (Berkeley)"
|
| 82 |
url: "https://arxiv.org/abs/2503.13657"
|
| 83 |
key_finding: "Multi-agent LLM systems fail 41-86.7% of the time in production"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
- name: "Why Do Multi-Agent LLM Systems Fail?"
|
| 85 |
source: "Future AGI 2026 Guide"
|
| 86 |
url: "https://futureagi.substack.com/p/why-do-multi-agent-llm-systems-fail"
|
|
@@ -96,10 +173,17 @@ metadata:
|
|
| 96 |
- mast-research
|
| 97 |
- specification
|
| 98 |
- verification
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
created: 2026-04-08
|
| 100 |
-
version: 1.
|
| 101 |
author: OpenEnv Hackathon
|
| 102 |
benchmark_scores:
|
| 103 |
easy: "Expected 0.85+ for strong LLM with explicit spec"
|
| 104 |
medium: "Expected 0.60-0.75 for strong LLM with translator"
|
| 105 |
-
hard: "Expected 0.35-0.50 for strong LLM with deep verification"
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
- id: easy
|
| 10 |
name: "Specification Ambiguity Fix"
|
| 11 |
difficulty: easy
|
| 12 |
+
category: "MAST FC1: System Design (41.8% of failures)"
|
| 13 |
description: |
|
| 14 |
The researcher agent has a vague role definition ('You are a helpful assistant').
|
| 15 |
This causes task misinterpretation - the agent doesn't know what to research.
|
| 16 |
Your task: Provide an explicit role specification JSON with clear capabilities,
|
| 17 |
constraints, and success criteria.
|
| 18 |
+
failure_mode: "FM-1.1: Specification ambiguity - vague role definition causes task misinterpretation"
|
| 19 |
grader:
|
| 20 |
type: programmatic
|
| 21 |
score_range: [0.0, 1.0]
|
|
|
|
| 30 |
- id: medium
|
| 31 |
name: "Format Mismatch Fix"
|
| 32 |
difficulty: medium
|
| 33 |
+
category: "MAST FC2: Inter-Agent Misalignment (36.9% of failures)"
|
| 34 |
description: |
|
| 35 |
Multi-agent workflow where the planner outputs YAML but the executor expects JSON.
|
| 36 |
This format mismatch causes the executor to fail (cannot parse input).
|
| 37 |
Your task: Add a format translation layer/middleware to convert YAML to JSON.
|
| 38 |
+
failure_mode: "FM-2.x: Format mismatch - planner outputs YAML, executor expects JSON"
|
| 39 |
grader:
|
| 40 |
type: programmatic
|
| 41 |
score_range: [0.0, 1.0]
|
|
|
|
| 50 |
- id: hard
|
| 51 |
name: "Verification Failure Fix"
|
| 52 |
difficulty: hard
|
| 53 |
+
category: "MAST FC3: Task Verification (21.3% of failures)"
|
| 54 |
description: |
|
| 55 |
Multi-agent pipeline with verification failure. The writer produces content
|
| 56 |
with contradictions (30% rate), and the reviewer prematurely approves (60% rate)
|
| 57 |
without proper verification. This combines premature termination with incorrect verification.
|
| 58 |
Your task: Implement multi-level verification - unit checks per agent,
|
| 59 |
integration checks across outputs, and final validation against success criteria.
|
| 60 |
+
failure_mode: "FM-3.1/FM-3.3: Verification failure - premature termination + incorrect verification"
|
| 61 |
grader:
|
| 62 |
type: programmatic
|
| 63 |
score_range: [0.0, 1.0]
|
| 64 |
criteria: |
|
| 65 |
Based on MAST research: 21.3% of failures come from verification issues
|
| 66 |
(6.2% premature, 8.2% no verification, 9.1% incorrect).
|
| 67 |
+
IBM 2026 update: FM-3.3 (Incorrect Verification) is the STRONGEST predictor of failure.
|
| 68 |
The agent must add deep verification with explicit success criteria.
|
| 69 |
- +0.15 for enabling consistency_check
|
| 70 |
- +0.15 for setting min_review_depth >= 3
|
|
|
|
| 72 |
- +0.10 for achieving 50%+ success rate
|
| 73 |
- +0.20 max for diagnosis keywords (partial credit)
|
| 74 |
|
| 75 |
+
- id: termination
|
| 76 |
+
name: "Termination Awareness Fix"
|
| 77 |
+
difficulty: medium
|
| 78 |
+
category: "MAST FC1: System Design - FATAL FAILURE"
|
| 79 |
+
description: |
|
| 80 |
+
The agent struggles to recognize when a task is complete. It either:
|
| 81 |
+
- Loops indefinitely (FM-1.3 Step Repetition)
|
| 82 |
+
- Prematurely exits without confirming success (FM-3.1)
|
| 83 |
+
- Is unaware of termination conditions (FM-1.5)
|
| 84 |
+
|
| 85 |
+
Based on IBM 2026: Kimi-K2 shows +46% spike in termination issues.
|
| 86 |
+
Your task: Implement explicit termination conditions with success criteria verification.
|
| 87 |
+
failure_mode: "FM-1.5/FM-3.1: Unaware of termination + premature termination"
|
| 88 |
+
grader:
|
| 89 |
+
type: programmatic
|
| 90 |
+
score_range: [0.0, 1.0]
|
| 91 |
+
criteria: |
|
| 92 |
+
FATAL FAILURE: When these modes appear, success probability drops precipitously.
|
| 93 |
+
IBM 2026: Use deterministic state machine to enforce termination.
|
| 94 |
+
- +0.25 for enabling explicit termination detection
|
| 95 |
+
- +0.20 for implementing max_iterations limit
|
| 96 |
+
- +0.30 * success_rate from 10 simulation trials
|
| 97 |
+
- +0.15 for achieving 60%+ success rate
|
| 98 |
+
- +0.15 max for diagnosis keywords (partial credit)
|
| 99 |
+
|
| 100 |
+
- id: memory
|
| 101 |
+
name: "Conversation History Fix"
|
| 102 |
+
difficulty: hard
|
| 103 |
+
category: "MAST FC1: System Design - FATAL FAILURE"
|
| 104 |
+
description: |
|
| 105 |
+
As conversation history grows, the agent loses context and derails.
|
| 106 |
+
This is FM-1.4 (Loss of Conversation History) - unique fatal flaw.
|
| 107 |
+
Based on IBM 2026: GPT-OSS-120B shows 24% memory loss in long traces.
|
| 108 |
+
Your task: Implement context management - sliding window, summarization, or state machine.
|
| 109 |
+
failure_mode: "FM-1.4: Loss of conversation history - agent forgets original task"
|
| 110 |
+
grader:
|
| 111 |
+
type: programmatic
|
| 112 |
+
score_range: [0.0, 1.0]
|
| 113 |
+
criteria: |
|
| 114 |
+
FATAL FAILURE: Memory loss in long traces leads to total task derailment.
|
| 115 |
+
IBM 2026: Implement aggressive context hygiene and early error detection.
|
| 116 |
+
- +0.20 for enabling context summarization
|
| 117 |
+
- +0.20 for implementing sliding window
|
| 118 |
+
- +0.35 * success_rate from 10 simulation trials (with long context)
|
| 119 |
+
- +0.15 for achieving 50%+ success rate in long traces
|
| 120 |
+
- +0.15 max for diagnosis keywords (partial credit)
|
| 121 |
+
|
| 122 |
+
- id: reasoning
|
| 123 |
+
name: "Reasoning-Action Alignment Fix"
|
| 124 |
+
difficulty: hard
|
| 125 |
+
category: "MAST FC2: Inter-Agent Misalignment - FATAL FAILURE"
|
| 126 |
+
description: |
|
| 127 |
+
The agent identifies the correct next step but executes a redundant or irrelevant command.
|
| 128 |
+
FM-2.6: Reasoning-Action Mismatch - describes correct plan but executes unrelated tool call.
|
| 129 |
+
Based on IBM 2026: 92% of Kimi-K2 failures and 94% of GPT-OSS-120B failures show this.
|
| 130 |
+
Your task: Implement action validation layer that checks execution against reasoning.
|
| 131 |
+
failure_mode: "FM-2.6: Reasoning-action mismatch - correct thinking, wrong execution"
|
| 132 |
+
grader:
|
| 133 |
+
type: programmatic
|
| 134 |
+
score_range: [0.0, 1.0]
|
| 135 |
+
criteria: |
|
| 136 |
+
FATAL FAILURE: Decoupling of reasoning and action causes cascading collapse.
|
| 137 |
+
IBM 2026: Small reasoning mismatches early poison entire task history.
|
| 138 |
+
- +0.20 for enabling action validation
|
| 139 |
+
- +0.20 for implementing reasoning-execution consistency check
|
| 140 |
+
- +0.35 * success_rate from 10 simulation trials
|
| 141 |
+
- +0.15 for achieving 45%+ success rate
|
| 142 |
+
- +0.15 max for diagnosis keywords (partial credit)
|
| 143 |
+
|
| 144 |
difficulty_progression:
|
| 145 |
- easy: "Single spec issue (vague role definition) - solution: explicit spec"
|
| 146 |
- medium: "Inter-agent format mismatch (YAML vs JSON) - solution: translator"
|
| 147 |
- hard: "Verification failure (premature + incorrect) - solution: multi-level checks"
|
| 148 |
+
- termination: "Termination awareness (loops/premature exit) - solution: state machine"
|
| 149 |
+
- memory: "Conversation history loss (forgets context) - solution: context management"
|
| 150 |
+
- reasoning: "Reasoning-action mismatch (wrong execution) - solution: validation layer"
|
| 151 |
|
| 152 |
research_basis:
|
| 153 |
- name: "MAST: Multi-Agent System Failure Taxonomy"
|
| 154 |
source: "NeurIPS 2025 (Berkeley)"
|
| 155 |
url: "https://arxiv.org/abs/2503.13657"
|
| 156 |
key_finding: "Multi-agent LLM systems fail 41-86.7% of the time in production"
|
| 157 |
+
- name: "IBM and UC Berkeley: Enterprise Agents Fail with IT-Bench and MAST"
|
| 158 |
+
source: "Hugging Face Blog (Feb 2026)"
|
| 159 |
+
url: "https://huggingface.co/blog/ibm-research/itbenchandmast"
|
| 160 |
+
key_finding: "FM-3.3 (Incorrect Verification) is strongest failure predictor; fatal vs non-fatal distinction critical"
|
| 161 |
- name: "Why Do Multi-Agent LLM Systems Fail?"
|
| 162 |
source: "Future AGI 2026 Guide"
|
| 163 |
url: "https://futureagi.substack.com/p/why-do-multi-agent-llm-systems-fail"
|
|
|
|
| 173 |
- mast-research
|
| 174 |
- specification
|
| 175 |
- verification
|
| 176 |
+
- termination-awareness
|
| 177 |
+
- memory-management
|
| 178 |
+
- reasoning-alignment
|
| 179 |
+
- ibm-research
|
| 180 |
created: 2026-04-08
|
| 181 |
+
version: 1.2.0
|
| 182 |
author: OpenEnv Hackathon
|
| 183 |
benchmark_scores:
|
| 184 |
easy: "Expected 0.85+ for strong LLM with explicit spec"
|
| 185 |
medium: "Expected 0.60-0.75 for strong LLM with translator"
|
| 186 |
+
hard: "Expected 0.35-0.50 for strong LLM with deep verification"
|
| 187 |
+
termination: "Expected 0.50-0.65 for LLM with state machine"
|
| 188 |
+
memory: "Expected 0.40-0.55 for LLM with context management"
|
| 189 |
+
reasoning: "Expected 0.35-0.50 for LLM with validation layer"
|
envs/agent_stress_test_env/server/graders.py
CHANGED
|
@@ -57,6 +57,12 @@ class Grader:
|
|
| 57 |
"verify": 0.0,
|
| 58 |
"check": 0.0,
|
| 59 |
"review": 0.0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
}
|
| 61 |
|
| 62 |
# Specification keywords
|
|
@@ -87,6 +93,24 @@ class Grader:
|
|
| 87 |
if "contradict" in diagnosis_lower:
|
| 88 |
scores["contradiction"] = 0.10
|
| 89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
return scores
|
| 91 |
|
| 92 |
def grade(
|
|
@@ -406,5 +430,273 @@ def get_grader(task_id: str) -> Grader:
|
|
| 406 |
"easy": EasyGrader(),
|
| 407 |
"medium": MediumGrader(),
|
| 408 |
"hard": HardGrader(),
|
|
|
|
|
|
|
|
|
|
| 409 |
}
|
| 410 |
return graders.get(task_id, EasyGrader())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
"verify": 0.0,
|
| 58 |
"check": 0.0,
|
| 59 |
"review": 0.0,
|
| 60 |
+
"termination": 0.0,
|
| 61 |
+
"loop": 0.0,
|
| 62 |
+
"memory": 0.0,
|
| 63 |
+
"context": 0.0,
|
| 64 |
+
"reasoning": 0.0,
|
| 65 |
+
"action": 0.0,
|
| 66 |
}
|
| 67 |
|
| 68 |
# Specification keywords
|
|
|
|
| 93 |
if "contradict" in diagnosis_lower:
|
| 94 |
scores["contradiction"] = 0.10
|
| 95 |
|
| 96 |
+
# Termination keywords (IBM 2026 - FATAL)
|
| 97 |
+
if "terminat" in diagnosis_lower or "loop" in diagnosis_lower:
|
| 98 |
+
scores["termination"] = 0.10
|
| 99 |
+
if "infinite" in diagnosis_lower or "repeat" in diagnosis_lower:
|
| 100 |
+
scores["loop"] = 0.10
|
| 101 |
+
|
| 102 |
+
# Memory/Context keywords (IBM 2026 - FATAL)
|
| 103 |
+
if "memory" in diagnosis_lower or "forget" in diagnosis_lower:
|
| 104 |
+
scores["memory"] = 0.10
|
| 105 |
+
if "context" in diagnosis_lower or "history" in diagnosis_lower:
|
| 106 |
+
scores["context"] = 0.10
|
| 107 |
+
|
| 108 |
+
# Reasoning-Action keywords (IBM 2026 - FATAL)
|
| 109 |
+
if "reason" in diagnosis_lower or "think" in diagnosis_lower:
|
| 110 |
+
scores["reasoning"] = 0.10
|
| 111 |
+
if "action" in diagnosis_lower or "execut" in diagnosis_lower:
|
| 112 |
+
scores["action"] = 0.10
|
| 113 |
+
|
| 114 |
return scores
|
| 115 |
|
| 116 |
def grade(
|
|
|
|
| 430 |
"easy": EasyGrader(),
|
| 431 |
"medium": MediumGrader(),
|
| 432 |
"hard": HardGrader(),
|
| 433 |
+
"termination": TerminationGrader(),
|
| 434 |
+
"memory": MemoryGrader(),
|
| 435 |
+
"reasoning": ReasoningGrader(),
|
| 436 |
}
|
| 437 |
return graders.get(task_id, EasyGrader())
|
| 438 |
+
|
| 439 |
+
|
| 440 |
+
class TerminationGrader(Grader):
    """
    Grader for termination task: FM-1.5/FM-3.1 (IBM 2026 - FATAL FAILURE)

    Task: Agent struggles to recognize task completion - loops or prematurely exits.
    Fix: Implement explicit termination conditions with success criteria.

    Score components (the sum is clamped to [0.0, 1.0]):
      - 0.25 when ``explicit_termination`` is truthy in the agent config
      - 0.20 when ``max_iterations`` is greater than zero
      - 0.30 * success rate over 10 simulated workflow runs
      - 0.15 bonus when that success rate is at least 0.6
      - up to 0.15 for the "termination" and "loop" diagnosis keyword scores
    """

    def __init__(self):
        super().__init__("termination", "medium")

    def grade(
        self,
        agent_config: dict[str, Any],
        task_description: str,
        failure_mode: str,
        diagnosis: str,
    ) -> tuple[float, dict[str, Any]]:
        """
        Score an agent's proposed fix for the termination task.

        Args:
            agent_config: Raw configuration submitted by the agent.
            task_description: Unused by this grader; kept for the shared
                grader signature.
            failure_mode: Unused by this grader; kept for the shared
                grader signature.
            diagnosis: Free-text diagnosis, scored for keywords.

        Returns:
            A ``(score, details)`` tuple where ``score`` lies in [0.0, 1.0]
            and ``details`` holds the measured success rate, the detected
            flags, the diagnosis points and the raw inputs.
        """
        from .workflow_simulator import create_termination_task

        nodes, _, _ = create_termination_task()
        resilience = self._parse_config(agent_config, diagnosis)
        simulator = WorkflowSimulator(nodes, seed=42)

        # One seeded simulator is reused for all 10 trials.
        results = [simulator.run_workflow(resilience).success for _ in range(10)]
        success_rate = sum(results) / len(results)

        has_termination_detection = agent_config.get("explicit_termination", False)
        # A key that is present but null (e.g. JSON ``null``) counts as
        # "not configured" instead of raising TypeError on ``None > 0``.
        has_max_iterations = (agent_config.get("max_iterations") or 0) > 0

        diagnosis_scores = self._parse_diagnosis(diagnosis)
        diagnosis_points = min(
            0.15, diagnosis_scores["termination"] + diagnosis_scores["loop"]
        )

        score = 0.0

        if has_termination_detection:
            score += 0.25
        if has_max_iterations:
            score += 0.20

        # A zero success rate contributes nothing, so no guard is needed.
        score += success_rate * 0.30

        if success_rate >= 0.6:
            score += 0.15

        score += diagnosis_points

        score = min(1.0, max(0.0, score))

        return score, {
            "success_rate": success_rate,
            "has_termination_detection": has_termination_detection,
            "has_max_iterations": has_max_iterations,
            "diagnosis_points": diagnosis_points,
            "config": agent_config,
            "diagnosis": diagnosis,
        }

    def _parse_config(
        self, agent_config: dict[str, Any], diagnosis: str
    ) -> ResilienceConfig:
        """
        Build the ResilienceConfig handed to the simulator.

        ``max_iterations`` is used as the retry budget (50 when it is not
        configured) and is also forwarded under its own field name.
        Every other missing key falls back to the defaults below.
        """
        max_iterations = agent_config.get("max_iterations")
        return ResilienceConfig(
            # Treat an explicit null like a missing key; an explicit 0 is kept.
            retry_max=50 if max_iterations is None else max_iterations,
            retry_delay_ms=agent_config.get("retry_delay_ms", 0),
            timeout_ms=agent_config.get("timeout_ms", 30000),
            fallback=agent_config.get("fallback", "abort"),
            circuit_breaker_threshold=agent_config.get(
                "circuit_breaker_threshold", 1.0
            ),
            context_strategy=agent_config.get("context_strategy", "truncate"),
            context_summarization_threshold=agent_config.get(
                "context_summarization_threshold", 500
            ),
            min_review_depth=agent_config.get("min_review_depth", 1),
            consistency_check=agent_config.get("consistency_check", False),
            explicit_termination=agent_config.get("explicit_termination", False),
            max_iterations=max_iterations or 0,
            diagnosis=diagnosis,
        )
|
| 525 |
+
|
| 526 |
+
|
| 527 |
+
class MemoryGrader(Grader):
    """
    Grades the memory task: FM-1.4 (IBM 2026 - FATAL FAILURE).

    Scenario: the agent loses conversation history in long traces and
    forgets the original task.
    Expected fix: context management (sliding window, summarization,
    state machine).

    Score components (the sum is clamped to [0.0, 1.0]):
      - 0.20 when ``context_summarization`` is truthy in the agent config
      - 0.20 when ``sliding_window`` is truthy in the agent config
      - 0.35 * success rate over 10 simulated workflow runs
      - 0.15 bonus when that success rate is at least 0.5
      - up to 0.15 for the "memory" and "context" diagnosis keyword scores
    """

    def __init__(self):
        super().__init__("memory", "hard")

    def grade(
        self,
        agent_config: dict[str, Any],
        task_description: str,
        failure_mode: str,
        diagnosis: str,
    ) -> tuple[float, dict[str, Any]]:
        """
        Score an agent's proposed fix for the memory task.

        ``task_description`` and ``failure_mode`` are accepted for the
        shared grader signature but are not read here.

        Returns:
            A ``(score, details)`` tuple; ``details`` reports the success
            rate, the two context-management flags, the diagnosis points
            and the raw inputs.
        """
        from .workflow_simulator import create_memory_task

        workflow_nodes, _, _ = create_memory_task()
        config = self._parse_config(agent_config, diagnosis)
        sim = WorkflowSimulator(workflow_nodes, seed=42)

        # Ten trials on the same seeded simulator instance.
        outcomes = [sim.run_workflow(config).success for _ in range(10)]
        success_rate = sum(outcomes) / len(outcomes)

        summarizes = agent_config.get("context_summarization", False)
        windowed = agent_config.get("sliding_window", False)

        keyword_hits = self._parse_diagnosis(diagnosis)
        keyword_credit = min(0.15, keyword_hits["memory"] + keyword_hits["context"])

        # Components are accumulated in a fixed order.
        total = 0.0
        total += 0.20 if summarizes else 0.0
        total += 0.20 if windowed else 0.0
        total += success_rate * 0.35
        total += 0.15 if success_rate >= 0.5 else 0.0
        total += keyword_credit

        clamped = max(0.0, min(1.0, total))

        details = {
            "success_rate": success_rate,
            "has_summarization": summarizes,
            "has_sliding_window": windowed,
            "diagnosis_points": keyword_credit,
            "config": agent_config,
            "diagnosis": diagnosis,
        }
        return clamped, details

    def _parse_config(
        self, agent_config: dict[str, Any], diagnosis: str
    ) -> ResilienceConfig:
        """
        Build the ResilienceConfig handed to the simulator.

        Each setting is read from ``agent_config`` and falls back to the
        default listed below when the key is absent.
        """
        defaults = {
            "retry_max": 0,
            "retry_delay_ms": 0,
            "timeout_ms": 30000,
            "fallback": "abort",
            "circuit_breaker_threshold": 1.0,
            "context_strategy": "summarize",
            "context_summarization_threshold": 200,
            "min_review_depth": 1,
            "consistency_check": False,
            "context_summarization": False,
            "sliding_window": False,
        }
        settings = {
            key: agent_config.get(key, fallback)
            for key, fallback in defaults.items()
        }
        return ResilienceConfig(**settings, diagnosis=diagnosis)
|
| 613 |
+
|
| 614 |
+
|
| 615 |
+
class ReasoningGrader(Grader):
    """
    Grades reasoning-action alignment: FM-2.6 (IBM 2026 - FATAL FAILURE).

    Scenario: the agent describes the correct plan but executes an
    unrelated or redundant command.
    Expected fix: an action validation layer that checks execution
    against reasoning.

    Score components (the sum is clamped to [0.0, 1.0]):
      - 0.20 when ``action_validation`` is truthy in the agent config
      - 0.20 when ``reasoning_consistency_check`` is truthy
      - 0.35 * success rate over 10 simulated workflow runs
      - 0.15 bonus when that success rate is at least 0.45
      - up to 0.15 for the "reasoning" and "action" diagnosis keyword scores
    """

    def __init__(self):
        super().__init__("reasoning", "hard")

    def grade(
        self,
        agent_config: dict[str, Any],
        task_description: str,
        failure_mode: str,
        diagnosis: str,
    ) -> tuple[float, dict[str, Any]]:
        """
        Score an agent's proposed fix for the reasoning-action task.

        ``task_description`` and ``failure_mode`` are accepted for the
        shared grader signature but are not read here.

        Returns:
            A ``(score, details)`` tuple; ``details`` reports the success
            rate, the two validation flags, the diagnosis points and the
            raw inputs.
        """
        from .workflow_simulator import create_reasoning_task

        task_nodes, _, _ = create_reasoning_task()
        resilience_cfg = self._parse_config(agent_config, diagnosis)
        runner = WorkflowSimulator(task_nodes, seed=42)

        # Ten trials on the same seeded simulator instance.
        trial_outcomes = []
        for _trial in range(10):
            trial_outcomes.append(runner.run_workflow(resilience_cfg).success)
        success_rate = sum(trial_outcomes) / len(trial_outcomes)

        validates_actions = agent_config.get("action_validation", False)
        checks_reasoning = agent_config.get("reasoning_consistency_check", False)

        per_keyword = self._parse_diagnosis(diagnosis)
        keyword_total = per_keyword["reasoning"] + per_keyword["action"]
        diagnosis_credit = min(0.15, keyword_total)

        # Components are accumulated in a fixed order.
        running = 0.0
        if validates_actions:
            running = running + 0.20
        if checks_reasoning:
            running = running + 0.20
        running = running + success_rate * 0.35
        if not success_rate < 0.45:
            running = running + 0.15
        running = running + diagnosis_credit

        final_score = max(0.0, min(1.0, running))

        return final_score, {
            "success_rate": success_rate,
            "has_action_validation": validates_actions,
            "has_consistency_check": checks_reasoning,
            "diagnosis_points": diagnosis_credit,
            "config": agent_config,
            "diagnosis": diagnosis,
        }

    def _parse_config(
        self, agent_config: dict[str, Any], diagnosis: str
    ) -> ResilienceConfig:
        """
        Build the ResilienceConfig handed to the simulator.

        Each setting is read from ``agent_config`` and falls back to the
        default given here when the key is absent.
        """
        read = agent_config.get
        return ResilienceConfig(
            retry_max=read("retry_max", 0),
            retry_delay_ms=read("retry_delay_ms", 0),
            timeout_ms=read("timeout_ms", 30000),
            fallback=read("fallback", "abort"),
            circuit_breaker_threshold=read("circuit_breaker_threshold", 1.0),
            context_strategy=read("context_strategy", "truncate"),
            context_summarization_threshold=read(
                "context_summarization_threshold", 500
            ),
            min_review_depth=read("min_review_depth", 1),
            consistency_check=read("consistency_check", False),
            action_validation=read("action_validation", False),
            reasoning_consistency_check=read("reasoning_consistency_check", False),
            diagnosis=diagnosis,
        )
|
envs/agent_stress_test_env/server/stress_test_environment.py
CHANGED
|
@@ -23,6 +23,9 @@ try:
|
|
| 23 |
create_easy_task,
|
| 24 |
create_hard_task,
|
| 25 |
create_medium_task,
|
|
|
|
|
|
|
|
|
|
| 26 |
)
|
| 27 |
except ImportError:
|
| 28 |
from openenv.core.env_server.interfaces import (
|
|
@@ -48,23 +51,44 @@ TASK_DEFINITIONS = {
|
|
| 48 |
"easy": {
|
| 49 |
"id": "easy",
|
| 50 |
"difficulty": "easy",
|
| 51 |
-
"category": "MAST:
|
| 52 |
"description": "The researcher agent has a vague role definition ('You are a helpful assistant'). This causes task misinterpretation. Your task: Provide an explicit role specification JSON with clear capabilities, constraints, and success criteria.",
|
| 53 |
-
"failure_mode": "Specification ambiguity - vague role definition causes task misinterpretation",
|
| 54 |
},
|
| 55 |
"medium": {
|
| 56 |
"id": "medium",
|
| 57 |
"difficulty": "medium",
|
| 58 |
-
"category": "MAST: Inter-Agent Misalignment (36.9% of failures)",
|
| 59 |
"description": "Multi-agent workflow where the planner outputs YAML but the executor expects JSON. This format mismatch causes the executor to fail. Your task: Add a format translation layer/middleware.",
|
| 60 |
-
"failure_mode": "Format mismatch - planner outputs YAML, executor expects JSON",
|
| 61 |
},
|
| 62 |
"hard": {
|
| 63 |
"id": "hard",
|
| 64 |
"difficulty": "hard",
|
| 65 |
-
"category": "MAST: Task Verification (21.3% of failures)",
|
| 66 |
-
"description": "Multi-agent pipeline with verification failure. Writer produces contradictions (30%), reviewer prematurely approves (60%) without checks. Your task: Implement multi-level verification.",
|
| 67 |
-
"failure_mode": "Verification failure - premature termination + incorrect verification",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
},
|
| 69 |
}
|
| 70 |
|
|
@@ -116,6 +140,14 @@ class StressTestEnvironment(
|
|
| 116 |
step_count=0,
|
| 117 |
)
|
| 118 |
self._current_task_index = 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
task = TASK_DEFINITIONS["easy"]
|
| 121 |
|
|
@@ -207,33 +239,31 @@ class StressTestEnvironment(
|
|
| 207 |
self._state.total_score = combined_score
|
| 208 |
self._state.step_count += 1
|
| 209 |
|
| 210 |
-
# Return combined result for
|
| 211 |
task_id = "all_tasks"
|
| 212 |
task = {
|
| 213 |
-
"description": "All
|
| 214 |
-
"failure_mode": "Combined MAST failure modes",
|
| 215 |
"category": "MAST: All categories",
|
| 216 |
}
|
| 217 |
|
| 218 |
obs = StressTestObservation(
|
| 219 |
task_id="all_tasks",
|
| 220 |
-
task_description=f"Easy: {all_scores[0]:.2f}, Medium: {all_scores[1]:.2f}, Hard: {all_scores[2]:.2f} | Combined: {combined_score:.2f}",
|
| 221 |
-
scenario_setup="All
|
| 222 |
-
failure_category="MAST: Spec (41.8%) + Inter-Agent (36.9%) + Verification (21.3%)",
|
| 223 |
failure_mode_detected=True,
|
| 224 |
-
failure_mode_description="Specification, Format Mismatch, and
|
| 225 |
resilience_applied=True,
|
| 226 |
applied_config=json.dumps(agent_config),
|
| 227 |
test_passed=combined_score >= 0.5,
|
| 228 |
-
test_completions=int(
|
| 229 |
-
|
| 230 |
-
), # Report easy task completions
|
| 231 |
-
test_total_trials=30, # Total across all tasks
|
| 232 |
test_latency_ms=0,
|
| 233 |
diagnosis=f"Task scores: {all_scores}",
|
| 234 |
diagnosis_points=0.0,
|
| 235 |
reward=combined_score,
|
| 236 |
-
done=True,
|
| 237 |
)
|
| 238 |
|
| 239 |
return obs
|
|
@@ -302,6 +332,24 @@ class StressTestEnvironment(
|
|
| 302 |
# Hard: Verification fix
|
| 303 |
config["consistency_check"] = agent_config.get("consistency_check", False)
|
| 304 |
config["min_review_depth"] = agent_config.get("min_review_depth", 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
|
| 306 |
return config
|
| 307 |
|
|
|
|
| 23 |
create_easy_task,
|
| 24 |
create_hard_task,
|
| 25 |
create_medium_task,
|
| 26 |
+
create_termination_task,
|
| 27 |
+
create_memory_task,
|
| 28 |
+
create_reasoning_task,
|
| 29 |
)
|
| 30 |
except ImportError:
|
| 31 |
from openenv.core.env_server.interfaces import (
|
|
|
|
| 51 |
"easy": {
|
| 52 |
"id": "easy",
|
| 53 |
"difficulty": "easy",
|
| 54 |
+
"category": "MAST FC1: System Design (41.8% of failures)",
|
| 55 |
"description": "The researcher agent has a vague role definition ('You are a helpful assistant'). This causes task misinterpretation. Your task: Provide an explicit role specification JSON with clear capabilities, constraints, and success criteria.",
|
| 56 |
+
"failure_mode": "FM-1.1: Specification ambiguity - vague role definition causes task misinterpretation",
|
| 57 |
},
|
| 58 |
"medium": {
|
| 59 |
"id": "medium",
|
| 60 |
"difficulty": "medium",
|
| 61 |
+
"category": "MAST FC2: Inter-Agent Misalignment (36.9% of failures)",
|
| 62 |
"description": "Multi-agent workflow where the planner outputs YAML but the executor expects JSON. This format mismatch causes the executor to fail. Your task: Add a format translation layer/middleware.",
|
| 63 |
+
"failure_mode": "FM-2.x: Format mismatch - planner outputs YAML, executor expects JSON",
|
| 64 |
},
|
| 65 |
"hard": {
|
| 66 |
"id": "hard",
|
| 67 |
"difficulty": "hard",
|
| 68 |
+
"category": "MAST FC3: Task Verification (21.3% of failures)",
|
| 69 |
+
"description": "Multi-agent pipeline with verification failure. Writer produces contradictions (30%), reviewer prematurely approves (60%) without checks. Your task: Implement multi-level verification. IBM 2026: FM-3.3 is strongest failure predictor.",
|
| 70 |
+
"failure_mode": "FM-3.1/FM-3.3: Verification failure - premature termination + incorrect verification",
|
| 71 |
+
},
|
| 72 |
+
"termination": {
|
| 73 |
+
"id": "termination",
|
| 74 |
+
"difficulty": "medium",
|
| 75 |
+
"category": "MAST FC1: System Design - FATAL FAILURE",
|
| 76 |
+
"description": "The agent struggles to recognize when a task is complete. It loops indefinitely or prematurely exits. Based on IBM 2026: Kimi-K2 shows +46% spike in termination issues. Your task: Implement explicit termination conditions with success criteria.",
|
| 77 |
+
"failure_mode": "FM-1.5/FM-3.1: Unaware of termination + premature termination",
|
| 78 |
+
},
|
| 79 |
+
"memory": {
|
| 80 |
+
"id": "memory",
|
| 81 |
+
"difficulty": "hard",
|
| 82 |
+
"category": "MAST FC1: System Design - FATAL FAILURE",
|
| 83 |
+
"description": "As conversation history grows, the agent loses context and derails. Based on IBM 2026: GPT-OSS-120B shows 24% memory loss in long traces. Your task: Implement context management - sliding window, summarization, or state machine.",
|
| 84 |
+
"failure_mode": "FM-1.4: Loss of conversation history - agent forgets original task",
|
| 85 |
+
},
|
| 86 |
+
"reasoning": {
|
| 87 |
+
"id": "reasoning",
|
| 88 |
+
"difficulty": "hard",
|
| 89 |
+
"category": "MAST FC2: Inter-Agent Misalignment - FATAL FAILURE",
|
| 90 |
+
"description": "The agent describes correct plan but executes unrelated command. Based on IBM 2026: 92% of Kimi-K2 failures and 94% of GPT-OSS-120B failures show this. Your task: Implement action validation layer checking execution against reasoning.",
|
| 91 |
+
"failure_mode": "FM-2.6: Reasoning-action mismatch - correct thinking, wrong execution",
|
| 92 |
},
|
| 93 |
}
|
| 94 |
|
|
|
|
| 140 |
step_count=0,
|
| 141 |
)
|
| 142 |
self._current_task_index = 0
|
| 143 |
+
self._task_ids = [
|
| 144 |
+
"easy",
|
| 145 |
+
"medium",
|
| 146 |
+
"hard",
|
| 147 |
+
"termination",
|
| 148 |
+
"memory",
|
| 149 |
+
"reasoning",
|
| 150 |
+
]
|
| 151 |
|
| 152 |
task = TASK_DEFINITIONS["easy"]
|
| 153 |
|
|
|
|
| 239 |
self._state.total_score = combined_score
|
| 240 |
self._state.step_count += 1
|
| 241 |
|
| 242 |
+
# Return combined result for all tasks
|
| 243 |
task_id = "all_tasks"
|
| 244 |
task = {
|
| 245 |
+
"description": "All 6 tasks (Easy/Medium/Hard + Termination/Memory/Reasoning)",
|
| 246 |
+
"failure_mode": "Combined MAST failure modes including IBM 2026 FATAL failures",
|
| 247 |
"category": "MAST: All categories",
|
| 248 |
}
|
| 249 |
|
| 250 |
obs = StressTestObservation(
|
| 251 |
task_id="all_tasks",
|
| 252 |
+
task_description=f"Easy: {all_scores[0]:.2f}, Medium: {all_scores[1]:.2f}, Hard: {all_scores[2]:.2f}, Term: {all_scores[3]:.2f}, Mem: {all_scores[4]:.2f}, Reas: {all_scores[5]:.2f} | Combined: {combined_score:.2f}",
|
| 253 |
+
scenario_setup="All 6 MAST failure categories evaluated including IBM 2026 fatal failures",
|
| 254 |
+
failure_category="MAST: Spec (41.8%) + Inter-Agent (36.9%) + Verification (21.3%) + IBM FATAL (termination, memory, reasoning)",
|
| 255 |
failure_mode_detected=True,
|
| 256 |
+
failure_mode_description="Specification, Format Mismatch, Verification, Termination, Memory, and Reasoning-Action failures",
|
| 257 |
resilience_applied=True,
|
| 258 |
applied_config=json.dumps(agent_config),
|
| 259 |
test_passed=combined_score >= 0.5,
|
| 260 |
+
test_completions=int(all_scores[0] * 10),
|
| 261 |
+
test_total_trials=60, # Total across all 6 tasks
|
|
|
|
|
|
|
| 262 |
test_latency_ms=0,
|
| 263 |
diagnosis=f"Task scores: {all_scores}",
|
| 264 |
diagnosis_points=0.0,
|
| 265 |
reward=combined_score,
|
| 266 |
+
done=True,
|
| 267 |
)
|
| 268 |
|
| 269 |
return obs
|
|
|
|
| 332 |
# Hard: Verification fix
|
| 333 |
config["consistency_check"] = agent_config.get("consistency_check", False)
|
| 334 |
config["min_review_depth"] = agent_config.get("min_review_depth", 1)
|
| 335 |
+
elif task_id == "termination":
|
| 336 |
+
# Termination: FM-1.5/FM-3.1 (IBM 2026 - FATAL)
|
| 337 |
+
config["explicit_termination"] = agent_config.get(
|
| 338 |
+
"explicit_termination", False
|
| 339 |
+
)
|
| 340 |
+
config["max_iterations"] = agent_config.get("max_iterations", 0)
|
| 341 |
+
elif task_id == "memory":
|
| 342 |
+
# Memory: FM-1.4 (IBM 2026 - FATAL)
|
| 343 |
+
config["context_summarization"] = agent_config.get(
|
| 344 |
+
"context_summarization", False
|
| 345 |
+
)
|
| 346 |
+
config["sliding_window"] = agent_config.get("sliding_window", False)
|
| 347 |
+
elif task_id == "reasoning":
|
| 348 |
+
# Reasoning: FM-2.6 (IBM 2026 - FATAL)
|
| 349 |
+
config["action_validation"] = agent_config.get("action_validation", False)
|
| 350 |
+
config["reasoning_consistency_check"] = agent_config.get(
|
| 351 |
+
"reasoning_consistency_check", False
|
| 352 |
+
)
|
| 353 |
|
| 354 |
return config
|
| 355 |
|
envs/agent_stress_test_env/server/workflow_simulator.py
CHANGED
|
@@ -53,10 +53,17 @@ class ResilienceConfig:
|
|
| 53 |
context_summarization_threshold: int = 500
|
| 54 |
min_review_depth: int = 1
|
| 55 |
consistency_check: bool = False
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
|
| 62 |
@dataclass
|
|
@@ -430,3 +437,127 @@ def create_hard_task() -> tuple[list[NodeConfig], str, str]:
|
|
| 430 |
"Verification failure - premature termination + incorrect verification"
|
| 431 |
)
|
| 432 |
return nodes, description, failure_mode
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
context_summarization_threshold: int = 500
|
| 54 |
min_review_depth: int = 1
|
| 55 |
consistency_check: bool = False
|
| 56 |
+
# IBM 2026: FC1 - Termination Awareness (FATAL)
|
| 57 |
+
explicit_termination: bool = False
|
| 58 |
+
max_iterations: int = 0
|
| 59 |
+
|
| 60 |
+
# IBM 2026: FC1 - Memory/Context Management (FATAL)
|
| 61 |
+
context_summarization: bool = False
|
| 62 |
+
sliding_window: bool = False
|
| 63 |
+
|
| 64 |
+
# IBM 2026: FC2 - Reasoning-Action Alignment (FATAL)
|
| 65 |
+
action_validation: bool = False
|
| 66 |
+
reasoning_consistency_check: bool = False
|
| 67 |
|
| 68 |
|
| 69 |
@dataclass
|
|
|
|
| 437 |
"Verification failure - premature termination + incorrect verification"
|
| 438 |
)
|
| 439 |
return nodes, description, failure_mode
|
| 440 |
+
|
| 441 |
+
|
| 442 |
+
def create_termination_task() -> tuple[list[NodeConfig], str, str]:
|
| 443 |
+
"""
|
| 444 |
+
Termination task: FM-1.5/FM-3.1 (IBM 2026 - FATAL FAILURE)
|
| 445 |
+
|
| 446 |
+
Research: Kimi-K2 shows +46% spike in termination issues.
|
| 447 |
+
Task: Agent struggles to recognize when task is complete - loops or prematurely exits.
|
| 448 |
+
Fix: Implement explicit termination conditions with success criteria.
|
| 449 |
+
"""
|
| 450 |
+
nodes = [
|
| 451 |
+
NodeConfig(
|
| 452 |
+
node_id="researcher",
|
| 453 |
+
role="researcher",
|
| 454 |
+
role_definition="Research and produce a detailed report",
|
| 455 |
+
latency_ms=100,
|
| 456 |
+
),
|
| 457 |
+
NodeConfig(
|
| 458 |
+
node_id="worker1",
|
| 459 |
+
role="worker",
|
| 460 |
+
role_definition="Process research findings",
|
| 461 |
+
fail_rate=0.2, # Occasional failures
|
| 462 |
+
latency_ms=100,
|
| 463 |
+
),
|
| 464 |
+
NodeConfig(
|
| 465 |
+
node_id="worker2",
|
| 466 |
+
role="worker",
|
| 467 |
+
role_definition="Process worker1 output",
|
| 468 |
+
fail_rate=0.2,
|
| 469 |
+
latency_ms=100,
|
| 470 |
+
),
|
| 471 |
+
]
|
| 472 |
+
description = (
|
| 473 |
+
"The agent struggles to recognize when a task is complete. It either: "
|
| 474 |
+
"- Loops indefinitely (FM-1.3 Step Repetition) "
|
| 475 |
+
"- Prematurely exits without confirming success (FM-3.1) "
|
| 476 |
+
"- Is unaware of termination conditions (FM-1.5) "
|
| 477 |
+
"Based on IBM 2026: Kimi-K2 shows +46% spike in termination issues. "
|
| 478 |
+
"Your task: Implement explicit termination conditions with success criteria verification."
|
| 479 |
+
)
|
| 480 |
+
failure_mode = "FM-1.5/FM-3.1: Unaware of termination + premature termination"
|
| 481 |
+
return nodes, description, failure_mode
|
| 482 |
+
|
| 483 |
+
|
| 484 |
+
def create_memory_task() -> tuple[list[NodeConfig], str, str]:
|
| 485 |
+
"""
|
| 486 |
+
Memory task: FM-1.4 (IBM 2026 - FATAL FAILURE)
|
| 487 |
+
|
| 488 |
+
Research: GPT-OSS-120B shows 24% memory loss in long traces.
|
| 489 |
+
Task: As conversation history grows, agent loses context and derails.
|
| 490 |
+
Fix: Implement context management (sliding window, summarization, state machine).
|
| 491 |
+
"""
|
| 492 |
+
nodes = [
|
| 493 |
+
NodeConfig(
|
| 494 |
+
node_id="analyzer1",
|
| 495 |
+
role="analyzer",
|
| 496 |
+
role_definition="Analyze data and produce findings",
|
| 497 |
+
context_limit=200, # Small context to trigger memory issues
|
| 498 |
+
latency_ms=100,
|
| 499 |
+
),
|
| 500 |
+
NodeConfig(
|
| 501 |
+
node_id="analyzer2",
|
| 502 |
+
role="analyzer",
|
| 503 |
+
role_definition="Analyze analyzer1 output with original context",
|
| 504 |
+
context_limit=200,
|
| 505 |
+
latency_ms=100,
|
| 506 |
+
),
|
| 507 |
+
NodeConfig(
|
| 508 |
+
node_id="analyzer3",
|
| 509 |
+
role="analyzer",
|
| 510 |
+
role_definition="Synthesize all previous findings",
|
| 511 |
+
context_limit=200,
|
| 512 |
+
latency_ms=100,
|
| 513 |
+
),
|
| 514 |
+
]
|
| 515 |
+
description = (
|
| 516 |
+
"As conversation history grows, the agent loses context and derails. "
|
| 517 |
+
"This is FM-1.4 (Loss of Conversation History) - unique fatal flaw. "
|
| 518 |
+
"Based on IBM 2026: GPT-OSS-120B shows 24% memory loss in long traces. "
|
| 519 |
+
"Your task: Implement context management - sliding window, summarization, or state machine."
|
| 520 |
+
)
|
| 521 |
+
failure_mode = "FM-1.4: Loss of conversation history - agent forgets original task"
|
| 522 |
+
return nodes, description, failure_mode
|
| 523 |
+
|
| 524 |
+
|
| 525 |
+
def create_reasoning_task() -> tuple[list[NodeConfig], str, str]:
|
| 526 |
+
"""
|
| 527 |
+
Reasoning-Action task: FM-2.6 (IBM 2026 - FATAL FAILURE)
|
| 528 |
+
|
| 529 |
+
Research: 92% of Kimi-K2 failures and 94% of GPT-OSS-120B failures show this.
|
| 530 |
+
Task: Agent identifies correct next step but executes redundant/irrelevant command.
|
| 531 |
+
Fix: Implement action validation layer checking execution against reasoning.
|
| 532 |
+
"""
|
| 533 |
+
nodes = [
|
| 534 |
+
NodeConfig(
|
| 535 |
+
node_id="planner",
|
| 536 |
+
role="planner",
|
| 537 |
+
role_definition="Plan the next action based on current state",
|
| 538 |
+
latency_ms=100,
|
| 539 |
+
),
|
| 540 |
+
NodeConfig(
|
| 541 |
+
node_id="executor",
|
| 542 |
+
role="executor",
|
| 543 |
+
role_definition="Execute the planned action",
|
| 544 |
+
output_corruption_rate=0.4, # 40% chance of executing wrong action
|
| 545 |
+
latency_ms=100,
|
| 546 |
+
),
|
| 547 |
+
NodeConfig(
|
| 548 |
+
node_id="verifier",
|
| 549 |
+
role="verifier",
|
| 550 |
+
role_definition="Verify execution matches plan",
|
| 551 |
+
latency_ms=100,
|
| 552 |
+
),
|
| 553 |
+
]
|
| 554 |
+
description = (
|
| 555 |
+
"The agent identifies the correct next step but executes a redundant or irrelevant command. "
|
| 556 |
+
"FM-2.6: Reasoning-Action Mismatch - describes correct plan but executes unrelated tool call. "
|
| 557 |
+
"Based on IBM 2026: 92% of Kimi-K2 failures and 94% of GPT-OSS-120B failures show this. "
|
| 558 |
+
"Your task: Implement action validation layer that checks execution against reasoning."
|
| 559 |
+
)
|
| 560 |
+
failure_mode = (
|
| 561 |
+
"FM-2.6: Reasoning-action mismatch - correct thinking, wrong execution"
|
| 562 |
+
)
|
| 563 |
+
return nodes, description, failure_mode
|
models.py
CHANGED
|
@@ -23,9 +23,16 @@ class ResilienceConfig(Action):
|
|
| 23 |
|
| 24 |
The agent outputs this to fix multi-agent workflow failures.
|
| 25 |
Supports different fix types based on failure mode:
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
"""
|
| 30 |
|
| 31 |
retry_max: int = 0
|
|
@@ -38,11 +45,30 @@ class ResilienceConfig(Action):
|
|
| 38 |
min_review_depth: int = 1
|
| 39 |
consistency_check: bool = False
|
| 40 |
|
| 41 |
-
#
|
| 42 |
-
spec_fix: str = ""
|
| 43 |
-
explicit_role_spec: bool = False
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
|
| 48 |
class StressTestObservation(Observation):
|
|
|
|
| 23 |
|
| 24 |
The agent outputs this to fix multi-agent workflow failures.
|
| 25 |
Supports different fix types based on failure mode:
|
| 26 |
+
|
| 27 |
+
MAST Categories (NeurIPS 2025):
|
| 28 |
+
- FC1: System Design (41.8%) - spec, termination, memory
|
| 29 |
+
- FC2: Inter-Agent Misalignment (36.9%) - format, reasoning-action
|
| 30 |
+
- FC3: Task Verification (21.3%) - verification checks
|
| 31 |
+
|
| 32 |
+
IBM 2026 Updates:
|
| 33 |
+
- FM-1.5/FM-3.1: Termination awareness (FATAL)
|
| 34 |
+
- FM-1.4: Memory/Context loss (FATAL)
|
| 35 |
+
- FM-2.6: Reasoning-action mismatch (FATAL)
|
| 36 |
"""
|
| 37 |
|
| 38 |
retry_max: int = 0
|
|
|
|
| 45 |
min_review_depth: int = 1
|
| 46 |
consistency_check: bool = False
|
| 47 |
|
| 48 |
+
# MAST FC1: System Design (Easy task - spec ambiguity)
|
| 49 |
+
spec_fix: str = ""
|
| 50 |
+
explicit_role_spec: bool = False
|
| 51 |
+
|
| 52 |
+
# MAST FC2: Inter-Agent Misalignment (Medium task - format mismatch)
|
| 53 |
+
format_translator: bool = False
|
| 54 |
+
|
| 55 |
+
# MAST FC3: Task Verification (Hard task - verification failure)
|
| 56 |
+
# (uses consistency_check + min_review_depth)
|
| 57 |
+
|
| 58 |
+
# IBM 2026: FC1 - Termination Awareness (FATAL)
|
| 59 |
+
explicit_termination: bool = False
|
| 60 |
+
max_iterations: int = 0
|
| 61 |
+
|
| 62 |
+
# IBM 2026: FC1 - Memory/Context Management (FATAL)
|
| 63 |
+
context_summarization: bool = False
|
| 64 |
+
sliding_window: bool = False
|
| 65 |
+
|
| 66 |
+
# IBM 2026: FC2 - Reasoning-Action Alignment (FATAL)
|
| 67 |
+
action_validation: bool = False
|
| 68 |
+
reasoning_consistency_check: bool = False
|
| 69 |
+
|
| 70 |
+
# Agent's diagnosis of the failure mode
|
| 71 |
+
diagnosis: str = ""
|
| 72 |
|
| 73 |
|
| 74 |
class StressTestObservation(Observation):
|
openenv.yaml
CHANGED
|
@@ -9,13 +9,13 @@ tasks:
|
|
| 9 |
- id: easy
|
| 10 |
name: "Specification Ambiguity Fix"
|
| 11 |
difficulty: easy
|
| 12 |
-
category: "MAST:
|
| 13 |
description: |
|
| 14 |
The researcher agent has a vague role definition ('You are a helpful assistant').
|
| 15 |
This causes task misinterpretation - the agent doesn't know what to research.
|
| 16 |
Your task: Provide an explicit role specification JSON with clear capabilities,
|
| 17 |
constraints, and success criteria.
|
| 18 |
-
failure_mode: "Specification ambiguity - vague role definition causes task misinterpretation"
|
| 19 |
grader:
|
| 20 |
type: programmatic
|
| 21 |
score_range: [0.0, 1.0]
|
|
@@ -30,12 +30,12 @@ tasks:
|
|
| 30 |
- id: medium
|
| 31 |
name: "Format Mismatch Fix"
|
| 32 |
difficulty: medium
|
| 33 |
-
category: "MAST: Inter-Agent Misalignment (36.9% of failures)"
|
| 34 |
description: |
|
| 35 |
Multi-agent workflow where the planner outputs YAML but the executor expects JSON.
|
| 36 |
This format mismatch causes the executor to fail (cannot parse input).
|
| 37 |
Your task: Add a format translation layer/middleware to convert YAML to JSON.
|
| 38 |
-
failure_mode: "Format mismatch - planner outputs YAML, executor expects JSON"
|
| 39 |
grader:
|
| 40 |
type: programmatic
|
| 41 |
score_range: [0.0, 1.0]
|
|
@@ -50,20 +50,21 @@ tasks:
|
|
| 50 |
- id: hard
|
| 51 |
name: "Verification Failure Fix"
|
| 52 |
difficulty: hard
|
| 53 |
-
category: "MAST: Task Verification (21.3% of failures)"
|
| 54 |
description: |
|
| 55 |
Multi-agent pipeline with verification failure. The writer produces content
|
| 56 |
with contradictions (30% rate), and the reviewer prematurely approves (60% rate)
|
| 57 |
without proper verification. This combines premature termination with incorrect verification.
|
| 58 |
Your task: Implement multi-level verification - unit checks per agent,
|
| 59 |
integration checks across outputs, and final validation against success criteria.
|
| 60 |
-
failure_mode: "Verification failure - premature termination + incorrect verification"
|
| 61 |
grader:
|
| 62 |
type: programmatic
|
| 63 |
score_range: [0.0, 1.0]
|
| 64 |
criteria: |
|
| 65 |
Based on MAST research: 21.3% of failures come from verification issues
|
| 66 |
(6.2% premature, 8.2% no verification, 9.1% incorrect).
|
|
|
|
| 67 |
The agent must add deep verification with explicit success criteria.
|
| 68 |
- +0.15 for enabling consistency_check
|
| 69 |
- +0.15 for setting min_review_depth >= 3
|
|
@@ -71,16 +72,92 @@ tasks:
|
|
| 71 |
- +0.10 for achieving 50%+ success rate
|
| 72 |
- +0.20 max for diagnosis keywords (partial credit)
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
difficulty_progression:
|
| 75 |
- easy: "Single spec issue (vague role definition) - solution: explicit spec"
|
| 76 |
- medium: "Inter-agent format mismatch (YAML vs JSON) - solution: translator"
|
| 77 |
- hard: "Verification failure (premature + incorrect) - solution: multi-level checks"
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
research_basis:
|
| 80 |
- name: "MAST: Multi-Agent System Failure Taxonomy"
|
| 81 |
source: "NeurIPS 2025 (Berkeley)"
|
| 82 |
url: "https://arxiv.org/abs/2503.13657"
|
| 83 |
key_finding: "Multi-agent LLM systems fail 41-86.7% of the time in production"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
- name: "Why Do Multi-Agent LLM Systems Fail?"
|
| 85 |
source: "Future AGI 2026 Guide"
|
| 86 |
url: "https://futureagi.substack.com/p/why-do-multi-agent-llm-systems-fail"
|
|
@@ -96,10 +173,17 @@ metadata:
|
|
| 96 |
- mast-research
|
| 97 |
- specification
|
| 98 |
- verification
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
created: 2026-04-08
|
| 100 |
-
version: 1.
|
| 101 |
author: OpenEnv Hackathon
|
| 102 |
benchmark_scores:
|
| 103 |
easy: "Expected 0.85+ for strong LLM with explicit spec"
|
| 104 |
medium: "Expected 0.60-0.75 for strong LLM with translator"
|
| 105 |
-
hard: "Expected 0.35-0.50 for strong LLM with deep verification"
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
- id: easy
|
| 10 |
name: "Specification Ambiguity Fix"
|
| 11 |
difficulty: easy
|
| 12 |
+
category: "MAST FC1: System Design (41.8% of failures)"
|
| 13 |
description: |
|
| 14 |
The researcher agent has a vague role definition ('You are a helpful assistant').
|
| 15 |
This causes task misinterpretation - the agent doesn't know what to research.
|
| 16 |
Your task: Provide an explicit role specification JSON with clear capabilities,
|
| 17 |
constraints, and success criteria.
|
| 18 |
+
failure_mode: "FM-1.1: Specification ambiguity - vague role definition causes task misinterpretation"
|
| 19 |
grader:
|
| 20 |
type: programmatic
|
| 21 |
score_range: [0.0, 1.0]
|
|
|
|
| 30 |
- id: medium
|
| 31 |
name: "Format Mismatch Fix"
|
| 32 |
difficulty: medium
|
| 33 |
+
category: "MAST FC2: Inter-Agent Misalignment (36.9% of failures)"
|
| 34 |
description: |
|
| 35 |
Multi-agent workflow where the planner outputs YAML but the executor expects JSON.
|
| 36 |
This format mismatch causes the executor to fail (cannot parse input).
|
| 37 |
Your task: Add a format translation layer/middleware to convert YAML to JSON.
|
| 38 |
+
failure_mode: "FM-2.x: Format mismatch - planner outputs YAML, executor expects JSON"
|
| 39 |
grader:
|
| 40 |
type: programmatic
|
| 41 |
score_range: [0.0, 1.0]
|
|
|
|
| 50 |
- id: hard
|
| 51 |
name: "Verification Failure Fix"
|
| 52 |
difficulty: hard
|
| 53 |
+
category: "MAST FC3: Task Verification (21.3% of failures)"
|
| 54 |
description: |
|
| 55 |
Multi-agent pipeline with verification failure. The writer produces content
|
| 56 |
with contradictions (30% rate), and the reviewer prematurely approves (60% rate)
|
| 57 |
without proper verification. This combines premature termination with incorrect verification.
|
| 58 |
Your task: Implement multi-level verification - unit checks per agent,
|
| 59 |
integration checks across outputs, and final validation against success criteria.
|
| 60 |
+
failure_mode: "FM-3.1/FM-3.3: Verification failure - premature termination + incorrect verification"
|
| 61 |
grader:
|
| 62 |
type: programmatic
|
| 63 |
score_range: [0.0, 1.0]
|
| 64 |
criteria: |
|
| 65 |
Based on MAST research: 21.3% of failures come from verification issues
|
| 66 |
(6.2% premature, 8.2% no verification, 9.1% incorrect).
|
| 67 |
+
IBM 2026 update: FM-3.3 (Incorrect Verification) is the STRONGEST predictor of failure.
|
| 68 |
The agent must add deep verification with explicit success criteria.
|
| 69 |
- +0.15 for enabling consistency_check
|
| 70 |
- +0.15 for setting min_review_depth >= 3
|
|
|
|
| 72 |
- +0.10 for achieving 50%+ success rate
|
| 73 |
- +0.20 max for diagnosis keywords (partial credit)
|
| 74 |
|
| 75 |
+
- id: termination
|
| 76 |
+
name: "Termination Awareness Fix"
|
| 77 |
+
difficulty: medium
|
| 78 |
+
category: "MAST FC1: System Design - FATAL FAILURE"
|
| 79 |
+
description: |
|
| 80 |
+
The agent struggles to recognize when a task is complete. It either:
|
| 81 |
+
- Loops indefinitely (FM-1.3 Step Repetition)
|
| 82 |
+
- Prematurely exits without confirming success (FM-3.1)
|
| 83 |
+
- Is unaware of termination conditions (FM-1.5)
|
| 84 |
+
|
| 85 |
+
Based on IBM 2026: Kimi-K2 shows +46% spike in termination issues.
|
| 86 |
+
Your task: Implement explicit termination conditions with success criteria verification.
|
| 87 |
+
failure_mode: "FM-1.5/FM-3.1: Unaware of termination + premature termination"
|
| 88 |
+
grader:
|
| 89 |
+
type: programmatic
|
| 90 |
+
score_range: [0.0, 1.0]
|
| 91 |
+
criteria: |
|
| 92 |
+
FATAL FAILURE: When these modes appear, success probability drops precipitously.
|
| 93 |
+
IBM 2026: Use deterministic state machine to enforce termination.
|
| 94 |
+
- +0.25 for enabling explicit termination detection
|
| 95 |
+
- +0.20 for implementing max_iterations limit
|
| 96 |
+
- +0.30 * success_rate from 10 simulation trials
|
| 97 |
+
- +0.15 for achieving 60%+ success rate
|
| 98 |
+
- +0.15 max for diagnosis keywords (partial credit)
|
| 99 |
+
|
| 100 |
+
- id: memory
|
| 101 |
+
name: "Conversation History Fix"
|
| 102 |
+
difficulty: hard
|
| 103 |
+
category: "MAST FC1: System Design - FATAL FAILURE"
|
| 104 |
+
description: |
|
| 105 |
+
As conversation history grows, the agent loses context and derails.
|
| 106 |
+
This is FM-1.4 (Loss of Conversation History) - unique fatal flaw.
|
| 107 |
+
Based on IBM 2026: GPT-OSS-120B shows 24% memory loss in long traces.
|
| 108 |
+
Your task: Implement context management - sliding window, summarization, or state machine.
|
| 109 |
+
failure_mode: "FM-1.4: Loss of conversation history - agent forgets original task"
|
| 110 |
+
grader:
|
| 111 |
+
type: programmatic
|
| 112 |
+
score_range: [0.0, 1.0]
|
| 113 |
+
criteria: |
|
| 114 |
+
FATAL FAILURE: Memory loss in long traces leads to total task derailment.
|
| 115 |
+
IBM 2026: Implement aggressive context hygiene and early error detection.
|
| 116 |
+
- +0.20 for enabling context summarization
|
| 117 |
+
- +0.20 for implementing sliding window
|
| 118 |
+
- +0.35 * success_rate from 10 simulation trials (with long context)
|
| 119 |
+
- +0.15 for achieving 50%+ success rate in long traces
|
| 120 |
+
- +0.15 max for diagnosis keywords (partial credit)
|
| 121 |
+
|
| 122 |
+
- id: reasoning
|
| 123 |
+
name: "Reasoning-Action Alignment Fix"
|
| 124 |
+
difficulty: hard
|
| 125 |
+
category: "MAST FC2: Inter-Agent Misalignment - FATAL FAILURE"
|
| 126 |
+
description: |
|
| 127 |
+
The agent identifies the correct next step but executes a redundant or irrelevant command.
|
| 128 |
+
FM-2.6: Reasoning-Action Mismatch - describes correct plan but executes unrelated tool call.
|
| 129 |
+
Based on IBM 2026: 92% of Kimi-K2 failures and 94% of GPT-OSS-120B failures show this.
|
| 130 |
+
Your task: Implement action validation layer that checks execution against reasoning.
|
| 131 |
+
failure_mode: "FM-2.6: Reasoning-action mismatch - correct thinking, wrong execution"
|
| 132 |
+
grader:
|
| 133 |
+
type: programmatic
|
| 134 |
+
score_range: [0.0, 1.0]
|
| 135 |
+
criteria: |
|
| 136 |
+
FATAL FAILURE: Decoupling of reasoning and action causes cascading collapse.
|
| 137 |
+
IBM 2026: Small reasoning mismatches early poison entire task history.
|
| 138 |
+
- +0.20 for enabling action validation
|
| 139 |
+
- +0.20 for implementing reasoning-execution consistency check
|
| 140 |
+
- +0.35 * success_rate from 10 simulation trials
|
| 141 |
+
- +0.15 for achieving 45%+ success rate
|
| 142 |
+
- +0.15 max for diagnosis keywords (partial credit)
|
| 143 |
+
|
| 144 |
difficulty_progression:
|
| 145 |
- easy: "Single spec issue (vague role definition) - solution: explicit spec"
|
| 146 |
- medium: "Inter-agent format mismatch (YAML vs JSON) - solution: translator"
|
| 147 |
- hard: "Verification failure (premature + incorrect) - solution: multi-level checks"
|
| 148 |
+
- termination: "Termination awareness (loops/premature exit) - solution: state machine"
|
| 149 |
+
- memory: "Conversation history loss (forgets context) - solution: context management"
|
| 150 |
+
- reasoning: "Reasoning-action mismatch (wrong execution) - solution: validation layer"
|
| 151 |
|
| 152 |
research_basis:
|
| 153 |
- name: "MAST: Multi-Agent System Failure Taxonomy"
|
| 154 |
source: "NeurIPS 2025 (Berkeley)"
|
| 155 |
url: "https://arxiv.org/abs/2503.13657"
|
| 156 |
key_finding: "Multi-agent LLM systems fail 41-86.7% of the time in production"
|
| 157 |
+
- name: "IBM and UC Berkeley: Enterprise Agents Fail with IT-Bench and MAST"
|
| 158 |
+
source: "Hugging Face Blog (Feb 2026)"
|
| 159 |
+
url: "https://huggingface.co/blog/ibm-research/itbenchandmast"
|
| 160 |
+
key_finding: "FM-3.3 (Incorrect Verification) is strongest failure predictor; fatal vs non-fatal distinction critical"
|
| 161 |
- name: "Why Do Multi-Agent LLM Systems Fail?"
|
| 162 |
source: "Future AGI 2026 Guide"
|
| 163 |
url: "https://futureagi.substack.com/p/why-do-multi-agent-llm-systems-fail"
|
|
|
|
| 173 |
- mast-research
|
| 174 |
- specification
|
| 175 |
- verification
|
| 176 |
+
- termination-awareness
|
| 177 |
+
- memory-management
|
| 178 |
+
- reasoning-alignment
|
| 179 |
+
- ibm-research
|
| 180 |
created: 2026-04-08
|
| 181 |
+
version: 1.2.0
|
| 182 |
author: OpenEnv Hackathon
|
| 183 |
benchmark_scores:
|
| 184 |
easy: "Expected 0.85+ for strong LLM with explicit spec"
|
| 185 |
medium: "Expected 0.60-0.75 for strong LLM with translator"
|
| 186 |
+
hard: "Expected 0.35-0.50 for strong LLM with deep verification"
|
| 187 |
+
termination: "Expected 0.50-0.65 for LLM with state machine"
|
| 188 |
+
memory: "Expected 0.40-0.55 for LLM with context management"
|
| 189 |
+
reasoning: "Expected 0.35-0.50 for LLM with validation layer"
|
server/graders.py
CHANGED
|
@@ -57,6 +57,12 @@ class Grader:
|
|
| 57 |
"verify": 0.0,
|
| 58 |
"check": 0.0,
|
| 59 |
"review": 0.0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
}
|
| 61 |
|
| 62 |
# Specification keywords
|
|
@@ -87,6 +93,24 @@ class Grader:
|
|
| 87 |
if "contradict" in diagnosis_lower:
|
| 88 |
scores["contradiction"] = 0.10
|
| 89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
return scores
|
| 91 |
|
| 92 |
def grade(
|
|
@@ -406,5 +430,273 @@ def get_grader(task_id: str) -> Grader:
|
|
| 406 |
"easy": EasyGrader(),
|
| 407 |
"medium": MediumGrader(),
|
| 408 |
"hard": HardGrader(),
|
|
|
|
|
|
|
|
|
|
| 409 |
}
|
| 410 |
return graders.get(task_id, EasyGrader())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
"verify": 0.0,
|
| 58 |
"check": 0.0,
|
| 59 |
"review": 0.0,
|
| 60 |
+
"termination": 0.0,
|
| 61 |
+
"loop": 0.0,
|
| 62 |
+
"memory": 0.0,
|
| 63 |
+
"context": 0.0,
|
| 64 |
+
"reasoning": 0.0,
|
| 65 |
+
"action": 0.0,
|
| 66 |
}
|
| 67 |
|
| 68 |
# Specification keywords
|
|
|
|
| 93 |
if "contradict" in diagnosis_lower:
|
| 94 |
scores["contradiction"] = 0.10
|
| 95 |
|
| 96 |
+
# Termination keywords (IBM 2026 - FATAL)
|
| 97 |
+
if "terminat" in diagnosis_lower or "loop" in diagnosis_lower:
|
| 98 |
+
scores["termination"] = 0.10
|
| 99 |
+
if "infinite" in diagnosis_lower or "repeat" in diagnosis_lower:
|
| 100 |
+
scores["loop"] = 0.10
|
| 101 |
+
|
| 102 |
+
# Memory/Context keywords (IBM 2026 - FATAL)
|
| 103 |
+
if "memory" in diagnosis_lower or "forget" in diagnosis_lower:
|
| 104 |
+
scores["memory"] = 0.10
|
| 105 |
+
if "context" in diagnosis_lower or "history" in diagnosis_lower:
|
| 106 |
+
scores["context"] = 0.10
|
| 107 |
+
|
| 108 |
+
# Reasoning-Action keywords (IBM 2026 - FATAL)
|
| 109 |
+
if "reason" in diagnosis_lower or "think" in diagnosis_lower:
|
| 110 |
+
scores["reasoning"] = 0.10
|
| 111 |
+
if "action" in diagnosis_lower or "execut" in diagnosis_lower:
|
| 112 |
+
scores["action"] = 0.10
|
| 113 |
+
|
| 114 |
return scores
|
| 115 |
|
| 116 |
def grade(
|
|
|
|
| 430 |
"easy": EasyGrader(),
|
| 431 |
"medium": MediumGrader(),
|
| 432 |
"hard": HardGrader(),
|
| 433 |
+
"termination": TerminationGrader(),
|
| 434 |
+
"memory": MemoryGrader(),
|
| 435 |
+
"reasoning": ReasoningGrader(),
|
| 436 |
}
|
| 437 |
return graders.get(task_id, EasyGrader())
|
| 438 |
+
|
| 439 |
+
|
| 440 |
+
class TerminationGrader(Grader):
    """
    Grader for the termination task: FM-1.5/FM-3.1 (IBM 2026 - FATAL FAILURE).

    Task: Agent struggles to recognize task completion - loops or prematurely exits.
    Fix: Implement explicit termination conditions with success criteria.

    Score components (the total is clamped to [0.0, 1.0]):
      * 0.25 when ``explicit_termination`` is truthy in the agent config
      * 0.20 when ``max_iterations`` is a positive number
      * ``success_rate * 0.30`` from 10 simulated workflow runs
      * 0.15 bonus when the success rate is at least 0.6
      * up to 0.15 from the "termination" and "loop" diagnosis buckets
    """

    def __init__(self):
        super().__init__("termination", "medium")

    @staticmethod
    def _iteration_limit(agent_config: dict[str, Any], default: int) -> int:
        """
        Read ``max_iterations`` from the agent config as an int.

        The config comes from agent output, so the value may be ``None``, a
        numeric string, or otherwise malformed. Anything that cannot be
        converted with ``int()`` is treated as "not provided" and replaced by
        ``default`` instead of raising.
        """
        raw_value = agent_config.get("max_iterations", default)
        try:
            return int(raw_value)
        except (TypeError, ValueError, OverflowError):
            return default

    def grade(
        self,
        agent_config: dict[str, Any],
        task_description: str,
        failure_mode: str,
        diagnosis: str,
    ) -> tuple[float, dict[str, Any]]:
        """
        Score an agent's termination fix.

        Args:
            agent_config: Resilience settings proposed by the agent.
            task_description: Unused here; kept for the common grader signature.
            failure_mode: Unused here; kept for the common grader signature.
            diagnosis: Free-text diagnosis, scored by keyword buckets.

        Returns:
            ``(score, details)`` where ``score`` is in [0.0, 1.0] and
            ``details`` records the inputs and intermediate values.
        """
        from .workflow_simulator import create_termination_task

        nodes, _, _ = create_termination_task()
        resilience = self._parse_config(agent_config, diagnosis)
        simulator = WorkflowSimulator(nodes, seed=42)

        # One simulator instance is reused for all 10 trials.
        outcomes = [simulator.run_workflow(resilience).success for _ in range(10)]
        success_rate = sum(outcomes) / len(outcomes)

        has_termination_detection = agent_config.get("explicit_termination", False)
        # Sanitised read: a null/str value no longer raises TypeError on ``> 0``.
        has_max_iterations = self._iteration_limit(agent_config, 0) > 0

        diagnosis_scores = self._parse_diagnosis(diagnosis)
        diagnosis_points = min(
            0.15, diagnosis_scores["termination"] + diagnosis_scores["loop"]
        )

        score = 0.0
        if has_termination_detection:
            score += 0.25
        if has_max_iterations:
            score += 0.20
        # A zero success rate contributes 0.0, so no guard is needed here.
        score += success_rate * 0.30
        if success_rate >= 0.6:
            score += 0.15
        score += diagnosis_points

        score = min(1.0, max(0.0, score))

        return score, {
            "success_rate": success_rate,
            "has_termination_detection": has_termination_detection,
            "has_max_iterations": has_max_iterations,
            "diagnosis_points": diagnosis_points,
            "config": agent_config,
            "diagnosis": diagnosis,
        }

    def _parse_config(
        self, agent_config: dict[str, Any], diagnosis: str
    ) -> ResilienceConfig:
        """
        Translate the agent's config dict into a ``ResilienceConfig``.

        ``max_iterations`` feeds ``retry_max`` (default 50 when absent or
        malformed, as before) and is now also forwarded to the dedicated
        ``max_iterations`` field so the simulator can see the agent's setting.
        """
        return ResilienceConfig(
            # NOTE(review): the 50-retry default applies even when the agent
            # configured nothing - confirm this is the intended baseline.
            retry_max=self._iteration_limit(agent_config, 50),
            retry_delay_ms=agent_config.get("retry_delay_ms", 0),
            timeout_ms=agent_config.get("timeout_ms", 30000),
            fallback=agent_config.get("fallback", "abort"),
            circuit_breaker_threshold=agent_config.get(
                "circuit_breaker_threshold", 1.0
            ),
            context_strategy=agent_config.get("context_strategy", "truncate"),
            context_summarization_threshold=agent_config.get(
                "context_summarization_threshold", 500
            ),
            min_review_depth=agent_config.get("min_review_depth", 1),
            consistency_check=agent_config.get("consistency_check", False),
            explicit_termination=agent_config.get("explicit_termination", False),
            max_iterations=self._iteration_limit(agent_config, 0),
            diagnosis=diagnosis,
        )
|
| 525 |
+
|
| 526 |
+
|
| 527 |
+
class MemoryGrader(Grader):
    """
    Grader for the memory task: FM-1.4 (IBM 2026 - FATAL FAILURE).

    Task: Agent loses conversation history in long traces - forgets original task.
    Fix: Implement context management (sliding window, summarization, state machine).

    Score components (the total is clamped to [0.0, 1.0]):
      * 0.20 when ``context_summarization`` is truthy in the agent config
      * 0.20 when ``sliding_window`` is truthy in the agent config
      * ``success_rate * 0.35`` from 10 simulated workflow runs
      * 0.15 bonus when the success rate is at least 0.5
      * up to 0.15 from the "memory" and "context" diagnosis buckets
    """

    def __init__(self):
        super().__init__("memory", "hard")

    def grade(
        self,
        agent_config: dict[str, Any],
        task_description: str,
        failure_mode: str,
        diagnosis: str,
    ) -> tuple[float, dict[str, Any]]:
        """
        Score an agent's context-management fix.

        Returns ``(score, details)``; ``task_description`` and
        ``failure_mode`` are accepted for the common grader signature but
        are not read here.
        """
        from .workflow_simulator import create_memory_task

        task_nodes, _, _ = create_memory_task()
        resilience_settings = self._parse_config(agent_config, diagnosis)
        sim = WorkflowSimulator(task_nodes, seed=42)

        # Ten runs on the same simulator instance; each outcome is a success flag.
        trial_outcomes = [
            sim.run_workflow(resilience_settings).success for _ in range(10)
        ]
        success_rate = sum(trial_outcomes) / len(trial_outcomes)

        uses_summarization = agent_config.get("context_summarization", False)
        uses_sliding_window = agent_config.get("sliding_window", False)

        keyword_scores = self._parse_diagnosis(diagnosis)
        diagnosis_points = min(
            0.15, keyword_scores["memory"] + keyword_scores["context"]
        )

        # Components are accumulated one at a time, in a fixed order.
        total = 0.0
        total += 0.20 if uses_summarization else 0.0
        total += 0.20 if uses_sliding_window else 0.0
        total += success_rate * 0.35
        total += 0.15 if success_rate >= 0.5 else 0.0
        total += diagnosis_points

        total = min(1.0, max(0.0, total))

        details = {
            "success_rate": success_rate,
            "has_summarization": uses_summarization,
            "has_sliding_window": uses_sliding_window,
            "diagnosis_points": diagnosis_points,
            "config": agent_config,
            "diagnosis": diagnosis,
        }
        return total, details

    def _parse_config(
        self, agent_config: dict[str, Any], diagnosis: str
    ) -> ResilienceConfig:
        """
        Build a ``ResilienceConfig`` from the agent's dict.

        Each key is looked up under its own name; a missing key falls back
        to the default listed below.
        """
        fallbacks = {
            "retry_max": 0,
            "retry_delay_ms": 0,
            "timeout_ms": 30000,
            "fallback": "abort",
            "circuit_breaker_threshold": 1.0,
            "context_strategy": "summarize",
            "context_summarization_threshold": 200,
            "min_review_depth": 1,
            "consistency_check": False,
            "context_summarization": False,
            "sliding_window": False,
        }
        resolved = {
            field_name: agent_config.get(field_name, default_value)
            for field_name, default_value in fallbacks.items()
        }
        return ResilienceConfig(diagnosis=diagnosis, **resolved)
|
| 613 |
+
|
| 614 |
+
|
| 615 |
+
class ReasoningGrader(Grader):
    """
    Grader for reasoning-action alignment: FM-2.6 (IBM 2026 - FATAL FAILURE).

    Task: Agent describes correct plan but executes unrelated/redundant command.
    Fix: Implement action validation layer checking execution against reasoning.

    Score components (the total is clamped to [0.0, 1.0]):
      * 0.20 when ``action_validation`` is truthy in the agent config
      * 0.20 when ``reasoning_consistency_check`` is truthy in the agent config
      * ``success_rate * 0.35`` from 10 simulated workflow runs
      * 0.15 bonus when the success rate is at least 0.45
      * up to 0.15 from the "reasoning" and "action" diagnosis buckets
    """

    def __init__(self):
        super().__init__("reasoning", "hard")

    def grade(
        self,
        agent_config: dict[str, Any],
        task_description: str,
        failure_mode: str,
        diagnosis: str,
    ) -> tuple[float, dict[str, Any]]:
        """
        Score an agent's reasoning-action alignment fix.

        Returns ``(score, details)``; ``task_description`` and
        ``failure_mode`` are accepted for the common grader signature but
        are not read here.
        """
        from .workflow_simulator import create_reasoning_task

        task_nodes, _, _ = create_reasoning_task()
        resilience_settings = self._parse_config(agent_config, diagnosis)
        sim = WorkflowSimulator(task_nodes, seed=42)

        # Ten runs on the same simulator instance; each outcome is a success flag.
        trial_outcomes = [
            sim.run_workflow(resilience_settings).success for _ in range(10)
        ]
        success_rate = sum(trial_outcomes) / len(trial_outcomes)

        validates_actions = agent_config.get("action_validation", False)
        checks_reasoning = agent_config.get("reasoning_consistency_check", False)

        keyword_scores = self._parse_diagnosis(diagnosis)
        diagnosis_points = min(
            0.15, keyword_scores["reasoning"] + keyword_scores["action"]
        )

        # Components are accumulated one at a time, in a fixed order.
        total = 0.0
        total += 0.20 if validates_actions else 0.0
        total += 0.20 if checks_reasoning else 0.0
        total += success_rate * 0.35
        total += 0.15 if success_rate >= 0.45 else 0.0
        total += diagnosis_points

        total = min(1.0, max(0.0, total))

        details = {
            "success_rate": success_rate,
            "has_action_validation": validates_actions,
            "has_consistency_check": checks_reasoning,
            "diagnosis_points": diagnosis_points,
            "config": agent_config,
            "diagnosis": diagnosis,
        }
        return total, details

    def _parse_config(
        self, agent_config: dict[str, Any], diagnosis: str
    ) -> ResilienceConfig:
        """
        Build a ``ResilienceConfig`` from the agent's dict.

        Each key is looked up under its own name; a missing key falls back
        to the default listed below.
        """
        fallbacks = {
            "retry_max": 0,
            "retry_delay_ms": 0,
            "timeout_ms": 30000,
            "fallback": "abort",
            "circuit_breaker_threshold": 1.0,
            "context_strategy": "truncate",
            "context_summarization_threshold": 500,
            "min_review_depth": 1,
            "consistency_check": False,
            "action_validation": False,
            "reasoning_consistency_check": False,
        }
        resolved = {
            field_name: agent_config.get(field_name, default_value)
            for field_name, default_value in fallbacks.items()
        }
        return ResilienceConfig(diagnosis=diagnosis, **resolved)
|
server/stress_test_environment.py
CHANGED
|
@@ -23,6 +23,9 @@ try:
|
|
| 23 |
create_easy_task,
|
| 24 |
create_hard_task,
|
| 25 |
create_medium_task,
|
|
|
|
|
|
|
|
|
|
| 26 |
)
|
| 27 |
except ImportError:
|
| 28 |
from openenv.core.env_server.interfaces import (
|
|
@@ -48,23 +51,44 @@ TASK_DEFINITIONS = {
|
|
| 48 |
"easy": {
|
| 49 |
"id": "easy",
|
| 50 |
"difficulty": "easy",
|
| 51 |
-
"category": "MAST:
|
| 52 |
"description": "The researcher agent has a vague role definition ('You are a helpful assistant'). This causes task misinterpretation. Your task: Provide an explicit role specification JSON with clear capabilities, constraints, and success criteria.",
|
| 53 |
-
"failure_mode": "Specification ambiguity - vague role definition causes task misinterpretation",
|
| 54 |
},
|
| 55 |
"medium": {
|
| 56 |
"id": "medium",
|
| 57 |
"difficulty": "medium",
|
| 58 |
-
"category": "MAST: Inter-Agent Misalignment (36.9% of failures)",
|
| 59 |
"description": "Multi-agent workflow where the planner outputs YAML but the executor expects JSON. This format mismatch causes the executor to fail. Your task: Add a format translation layer/middleware.",
|
| 60 |
-
"failure_mode": "Format mismatch - planner outputs YAML, executor expects JSON",
|
| 61 |
},
|
| 62 |
"hard": {
|
| 63 |
"id": "hard",
|
| 64 |
"difficulty": "hard",
|
| 65 |
-
"category": "MAST: Task Verification (21.3% of failures)",
|
| 66 |
-
"description": "Multi-agent pipeline with verification failure. Writer produces contradictions (30%), reviewer prematurely approves (60%) without checks. Your task: Implement multi-level verification.",
|
| 67 |
-
"failure_mode": "Verification failure - premature termination + incorrect verification",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
},
|
| 69 |
}
|
| 70 |
|
|
@@ -116,6 +140,14 @@ class StressTestEnvironment(
|
|
| 116 |
step_count=0,
|
| 117 |
)
|
| 118 |
self._current_task_index = 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
task = TASK_DEFINITIONS["easy"]
|
| 121 |
|
|
@@ -207,33 +239,31 @@ class StressTestEnvironment(
|
|
| 207 |
self._state.total_score = combined_score
|
| 208 |
self._state.step_count += 1
|
| 209 |
|
| 210 |
-
# Return combined result for
|
| 211 |
task_id = "all_tasks"
|
| 212 |
task = {
|
| 213 |
-
"description": "All
|
| 214 |
-
"failure_mode": "Combined MAST failure modes",
|
| 215 |
"category": "MAST: All categories",
|
| 216 |
}
|
| 217 |
|
| 218 |
obs = StressTestObservation(
|
| 219 |
task_id="all_tasks",
|
| 220 |
-
task_description=f"Easy: {all_scores[0]:.2f}, Medium: {all_scores[1]:.2f}, Hard: {all_scores[2]:.2f} | Combined: {combined_score:.2f}",
|
| 221 |
-
scenario_setup="All
|
| 222 |
-
failure_category="MAST: Spec (41.8%) + Inter-Agent (36.9%) + Verification (21.3%)",
|
| 223 |
failure_mode_detected=True,
|
| 224 |
-
failure_mode_description="Specification, Format Mismatch, and
|
| 225 |
resilience_applied=True,
|
| 226 |
applied_config=json.dumps(agent_config),
|
| 227 |
test_passed=combined_score >= 0.5,
|
| 228 |
-
test_completions=int(
|
| 229 |
-
|
| 230 |
-
), # Report easy task completions
|
| 231 |
-
test_total_trials=30, # Total across all tasks
|
| 232 |
test_latency_ms=0,
|
| 233 |
diagnosis=f"Task scores: {all_scores}",
|
| 234 |
diagnosis_points=0.0,
|
| 235 |
reward=combined_score,
|
| 236 |
-
done=True,
|
| 237 |
)
|
| 238 |
|
| 239 |
return obs
|
|
@@ -302,6 +332,24 @@ class StressTestEnvironment(
|
|
| 302 |
# Hard: Verification fix
|
| 303 |
config["consistency_check"] = agent_config.get("consistency_check", False)
|
| 304 |
config["min_review_depth"] = agent_config.get("min_review_depth", 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
|
| 306 |
return config
|
| 307 |
|
|
|
|
| 23 |
create_easy_task,
|
| 24 |
create_hard_task,
|
| 25 |
create_medium_task,
|
| 26 |
+
create_termination_task,
|
| 27 |
+
create_memory_task,
|
| 28 |
+
create_reasoning_task,
|
| 29 |
)
|
| 30 |
except ImportError:
|
| 31 |
from openenv.core.env_server.interfaces import (
|
|
|
|
| 51 |
"easy": {
|
| 52 |
"id": "easy",
|
| 53 |
"difficulty": "easy",
|
| 54 |
+
"category": "MAST FC1: System Design (41.8% of failures)",
|
| 55 |
"description": "The researcher agent has a vague role definition ('You are a helpful assistant'). This causes task misinterpretation. Your task: Provide an explicit role specification JSON with clear capabilities, constraints, and success criteria.",
|
| 56 |
+
"failure_mode": "FM-1.1: Specification ambiguity - vague role definition causes task misinterpretation",
|
| 57 |
},
|
| 58 |
"medium": {
|
| 59 |
"id": "medium",
|
| 60 |
"difficulty": "medium",
|
| 61 |
+
"category": "MAST FC2: Inter-Agent Misalignment (36.9% of failures)",
|
| 62 |
"description": "Multi-agent workflow where the planner outputs YAML but the executor expects JSON. This format mismatch causes the executor to fail. Your task: Add a format translation layer/middleware.",
|
| 63 |
+
"failure_mode": "FM-2.x: Format mismatch - planner outputs YAML, executor expects JSON",
|
| 64 |
},
|
| 65 |
"hard": {
|
| 66 |
"id": "hard",
|
| 67 |
"difficulty": "hard",
|
| 68 |
+
"category": "MAST FC3: Task Verification (21.3% of failures)",
|
| 69 |
+
"description": "Multi-agent pipeline with verification failure. Writer produces contradictions (30%), reviewer prematurely approves (60%) without checks. Your task: Implement multi-level verification. IBM 2026: FM-3.3 is strongest failure predictor.",
|
| 70 |
+
"failure_mode": "FM-3.1/FM-3.3: Verification failure - premature termination + incorrect verification",
|
| 71 |
+
},
|
| 72 |
+
"termination": {
|
| 73 |
+
"id": "termination",
|
| 74 |
+
"difficulty": "medium",
|
| 75 |
+
"category": "MAST FC1: System Design - FATAL FAILURE",
|
| 76 |
+
"description": "The agent struggles to recognize when a task is complete. It loops indefinitely or prematurely exits. Based on IBM 2026: Kimi-K2 shows +46% spike in termination issues. Your task: Implement explicit termination conditions with success criteria.",
|
| 77 |
+
"failure_mode": "FM-1.5/FM-3.1: Unaware of termination + premature termination",
|
| 78 |
+
},
|
| 79 |
+
"memory": {
|
| 80 |
+
"id": "memory",
|
| 81 |
+
"difficulty": "hard",
|
| 82 |
+
"category": "MAST FC1: System Design - FATAL FAILURE",
|
| 83 |
+
"description": "As conversation history grows, the agent loses context and derails. Based on IBM 2026: GPT-OSS-120B shows 24% memory loss in long traces. Your task: Implement context management - sliding window, summarization, or state machine.",
|
| 84 |
+
"failure_mode": "FM-1.4: Loss of conversation history - agent forgets original task",
|
| 85 |
+
},
|
| 86 |
+
"reasoning": {
|
| 87 |
+
"id": "reasoning",
|
| 88 |
+
"difficulty": "hard",
|
| 89 |
+
"category": "MAST FC2: Inter-Agent Misalignment - FATAL FAILURE",
|
| 90 |
+
"description": "The agent describes correct plan but executes unrelated command. Based on IBM 2026: 92% of Kimi-K2 failures and 94% of GPT-OSS-120B failures show this. Your task: Implement action validation layer checking execution against reasoning.",
|
| 91 |
+
"failure_mode": "FM-2.6: Reasoning-action mismatch - correct thinking, wrong execution",
|
| 92 |
},
|
| 93 |
}
|
| 94 |
|
|
|
|
| 140 |
step_count=0,
|
| 141 |
)
|
| 142 |
self._current_task_index = 0
|
| 143 |
+
self._task_ids = [
|
| 144 |
+
"easy",
|
| 145 |
+
"medium",
|
| 146 |
+
"hard",
|
| 147 |
+
"termination",
|
| 148 |
+
"memory",
|
| 149 |
+
"reasoning",
|
| 150 |
+
]
|
| 151 |
|
| 152 |
task = TASK_DEFINITIONS["easy"]
|
| 153 |
|
|
|
|
| 239 |
self._state.total_score = combined_score
|
| 240 |
self._state.step_count += 1
|
| 241 |
|
| 242 |
+
# Return combined result for all tasks
|
| 243 |
task_id = "all_tasks"
|
| 244 |
task = {
|
| 245 |
+
"description": "All 6 tasks (Easy/Medium/Hard + Termination/Memory/Reasoning)",
|
| 246 |
+
"failure_mode": "Combined MAST failure modes including IBM 2026 FATAL failures",
|
| 247 |
"category": "MAST: All categories",
|
| 248 |
}
|
| 249 |
|
| 250 |
obs = StressTestObservation(
|
| 251 |
task_id="all_tasks",
|
| 252 |
+
task_description=f"Easy: {all_scores[0]:.2f}, Medium: {all_scores[1]:.2f}, Hard: {all_scores[2]:.2f}, Term: {all_scores[3]:.2f}, Mem: {all_scores[4]:.2f}, Reas: {all_scores[5]:.2f} | Combined: {combined_score:.2f}",
|
| 253 |
+
scenario_setup="All 6 MAST failure categories evaluated including IBM 2026 fatal failures",
|
| 254 |
+
failure_category="MAST: Spec (41.8%) + Inter-Agent (36.9%) + Verification (21.3%) + IBM FATAL (termination, memory, reasoning)",
|
| 255 |
failure_mode_detected=True,
|
| 256 |
+
failure_mode_description="Specification, Format Mismatch, Verification, Termination, Memory, and Reasoning-Action failures",
|
| 257 |
resilience_applied=True,
|
| 258 |
applied_config=json.dumps(agent_config),
|
| 259 |
test_passed=combined_score >= 0.5,
|
| 260 |
+
test_completions=int(all_scores[0] * 10),
|
| 261 |
+
test_total_trials=60, # Total across all 6 tasks
|
|
|
|
|
|
|
| 262 |
test_latency_ms=0,
|
| 263 |
diagnosis=f"Task scores: {all_scores}",
|
| 264 |
diagnosis_points=0.0,
|
| 265 |
reward=combined_score,
|
| 266 |
+
done=True,
|
| 267 |
)
|
| 268 |
|
| 269 |
return obs
|
|
|
|
| 332 |
# Hard: Verification fix
|
| 333 |
config["consistency_check"] = agent_config.get("consistency_check", False)
|
| 334 |
config["min_review_depth"] = agent_config.get("min_review_depth", 1)
|
| 335 |
+
elif task_id == "termination":
|
| 336 |
+
# Termination: FM-1.5/FM-3.1 (IBM 2026 - FATAL)
|
| 337 |
+
config["explicit_termination"] = agent_config.get(
|
| 338 |
+
"explicit_termination", False
|
| 339 |
+
)
|
| 340 |
+
config["max_iterations"] = agent_config.get("max_iterations", 0)
|
| 341 |
+
elif task_id == "memory":
|
| 342 |
+
# Memory: FM-1.4 (IBM 2026 - FATAL)
|
| 343 |
+
config["context_summarization"] = agent_config.get(
|
| 344 |
+
"context_summarization", False
|
| 345 |
+
)
|
| 346 |
+
config["sliding_window"] = agent_config.get("sliding_window", False)
|
| 347 |
+
elif task_id == "reasoning":
|
| 348 |
+
# Reasoning: FM-2.6 (IBM 2026 - FATAL)
|
| 349 |
+
config["action_validation"] = agent_config.get("action_validation", False)
|
| 350 |
+
config["reasoning_consistency_check"] = agent_config.get(
|
| 351 |
+
"reasoning_consistency_check", False
|
| 352 |
+
)
|
| 353 |
|
| 354 |
return config
|
| 355 |
|
server/workflow_simulator.py
CHANGED
|
@@ -53,10 +53,17 @@ class ResilienceConfig:
|
|
| 53 |
context_summarization_threshold: int = 500
|
| 54 |
min_review_depth: int = 1
|
| 55 |
consistency_check: bool = False
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
|
| 62 |
@dataclass
|
|
@@ -430,3 +437,127 @@ def create_hard_task() -> tuple[list[NodeConfig], str, str]:
|
|
| 430 |
"Verification failure - premature termination + incorrect verification"
|
| 431 |
)
|
| 432 |
return nodes, description, failure_mode
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
context_summarization_threshold: int = 500
|
| 54 |
min_review_depth: int = 1
|
| 55 |
consistency_check: bool = False
|
| 56 |
+
# IBM 2026: FC1 - Termination Awareness (FATAL)
|
| 57 |
+
explicit_termination: bool = False
|
| 58 |
+
max_iterations: int = 0
|
| 59 |
+
|
| 60 |
+
# IBM 2026: FC1 - Memory/Context Management (FATAL)
|
| 61 |
+
context_summarization: bool = False
|
| 62 |
+
sliding_window: bool = False
|
| 63 |
+
|
| 64 |
+
# IBM 2026: FC2 - Reasoning-Action Alignment (FATAL)
|
| 65 |
+
action_validation: bool = False
|
| 66 |
+
reasoning_consistency_check: bool = False
|
| 67 |
|
| 68 |
|
| 69 |
@dataclass
|
|
|
|
| 437 |
"Verification failure - premature termination + incorrect verification"
|
| 438 |
)
|
| 439 |
return nodes, description, failure_mode
|
| 440 |
+
|
| 441 |
+
|
| 442 |
+
def create_termination_task() -> tuple[list[NodeConfig], str, str]:
    """
    Build the termination scenario: FM-1.5/FM-3.1 (IBM 2026 - FATAL FAILURE).

    The workflow is a researcher followed by two workers, each worker
    configured with ``fail_rate=0.2``.

    Returns:
        ``(nodes, description, failure_mode)`` - the node configs, the
        task prompt shown to the agent, and the failure-mode label.
    """
    researcher = NodeConfig(
        node_id="researcher",
        role="researcher",
        role_definition="Research and produce a detailed report",
        latency_ms=100,
    )
    # The two workers differ only in id and duty; both fail occasionally.
    worker_duties = (
        ("worker1", "Process research findings"),
        ("worker2", "Process worker1 output"),
    )
    workers = [
        NodeConfig(
            node_id=worker_id,
            role="worker",
            role_definition=duty,
            fail_rate=0.2,
            latency_ms=100,
        )
        for worker_id, duty in worker_duties
    ]
    nodes = [researcher, *workers]

    # Pieces are joined with no separator; each carries its own trailing space.
    description = "".join(
        (
            "The agent struggles to recognize when a task is complete. It either: ",
            "- Loops indefinitely (FM-1.3 Step Repetition) ",
            "- Prematurely exits without confirming success (FM-3.1) ",
            "- Is unaware of termination conditions (FM-1.5) ",
            "Based on IBM 2026: Kimi-K2 shows +46% spike in termination issues. ",
            "Your task: Implement explicit termination conditions with success criteria verification.",
        )
    )
    failure_mode = "FM-1.5/FM-3.1: Unaware of termination + premature termination"
    return nodes, description, failure_mode
|
| 482 |
+
|
| 483 |
+
|
| 484 |
+
def create_memory_task() -> tuple[list[NodeConfig], str, str]:
    """
    Build the memory scenario: FM-1.4 (IBM 2026 - FATAL FAILURE).

    The workflow chains three analyzer nodes, each with ``context_limit=200``
    (deliberately small, to trigger memory issues).

    Returns:
        ``(nodes, description, failure_mode)`` - the node configs, the
        task prompt shown to the agent, and the failure-mode label.
    """
    # All analyzers share role, context limit and latency; only id and duty vary.
    analyzer_duties = (
        ("analyzer1", "Analyze data and produce findings"),
        ("analyzer2", "Analyze analyzer1 output with original context"),
        ("analyzer3", "Synthesize all previous findings"),
    )
    nodes = [
        NodeConfig(
            node_id=analyzer_id,
            role="analyzer",
            role_definition=duty,
            context_limit=200,
            latency_ms=100,
        )
        for analyzer_id, duty in analyzer_duties
    ]

    # Pieces are joined with no separator; each carries its own trailing space.
    description = "".join(
        (
            "As conversation history grows, the agent loses context and derails. ",
            "This is FM-1.4 (Loss of Conversation History) - unique fatal flaw. ",
            "Based on IBM 2026: GPT-OSS-120B shows 24% memory loss in long traces. ",
            "Your task: Implement context management - sliding window, summarization, or state machine.",
        )
    )
    failure_mode = "FM-1.4: Loss of conversation history - agent forgets original task"
    return nodes, description, failure_mode
|
| 523 |
+
|
| 524 |
+
|
| 525 |
+
def create_reasoning_task() -> tuple[list[NodeConfig], str, str]:
    """
    Build the reasoning-action scenario: FM-2.6 (IBM 2026 - FATAL FAILURE).

    The workflow is planner -> executor -> verifier, where the executor is
    configured with ``output_corruption_rate=0.4`` to model executing the
    wrong action.

    Returns:
        ``(nodes, description, failure_mode)`` - the node configs, the
        task prompt shown to the agent, and the failure-mode label.
    """
    planner = NodeConfig(
        node_id="planner",
        role="planner",
        role_definition="Plan the next action based on current state",
        latency_ms=100,
    )
    # 40% chance of executing the wrong action.
    executor = NodeConfig(
        node_id="executor",
        role="executor",
        role_definition="Execute the planned action",
        output_corruption_rate=0.4,
        latency_ms=100,
    )
    verifier = NodeConfig(
        node_id="verifier",
        role="verifier",
        role_definition="Verify execution matches plan",
        latency_ms=100,
    )
    nodes = [planner, executor, verifier]

    # Pieces are joined with no separator; each carries its own trailing space.
    description = "".join(
        (
            "The agent identifies the correct next step but executes a redundant or irrelevant command. ",
            "FM-2.6: Reasoning-Action Mismatch - describes correct plan but executes unrelated tool call. ",
            "Based on IBM 2026: 92% of Kimi-K2 failures and 94% of GPT-OSS-120B failures show this. ",
            "Your task: Implement action validation layer that checks execution against reasoning.",
        )
    )
    failure_mode = "FM-2.6: Reasoning-action mismatch - correct thinking, wrong execution"
    return nodes, description, failure_mode
|