xemorph49 committed on
Commit
72076b7
·
verified ·
1 Parent(s): 0ed60f5

Upload folder using huggingface_hub

Browse files
envs/agent_stress_test_env/models.py CHANGED
@@ -23,9 +23,16 @@ class ResilienceConfig(Action):
23
 
24
  The agent outputs this to fix multi-agent workflow failures.
25
  Supports different fix types based on failure mode:
26
- - spec_fix: For specification ambiguity (Easy task)
27
- - format_translator: For format mismatches (Medium task)
28
- - consistency_check + min_review_depth: For verification failures (Hard task)
 
 
 
 
 
 
 
29
  """
30
 
31
  retry_max: int = 0
@@ -38,11 +45,30 @@ class ResilienceConfig(Action):
38
  min_review_depth: int = 1
39
  consistency_check: bool = False
40
 
41
- # New fields for MAST-based failure modes
42
- spec_fix: str = "" # Explicit role specification (JSON schema)
43
- explicit_role_spec: bool = False # Flag: provided explicit spec
44
- format_translator: bool = False # Flag: added format translation
45
- diagnosis: str = "" # Agent's diagnosis of the failure mode
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
 
48
  class StressTestObservation(Observation):
 
23
 
24
  The agent outputs this to fix multi-agent workflow failures.
25
  Supports different fix types based on failure mode:
26
+
27
+ MAST Categories (NeurIPS 2025):
28
+ - FC1: System Design (41.8%) - spec, termination, memory
29
+ - FC2: Inter-Agent Misalignment (36.9%) - format, reasoning-action
30
+ - FC3: Task Verification (21.3%) - verification checks
31
+
32
+ IBM 2026 Updates:
33
+ - FM-1.5/FM-3.1: Termination awareness (FATAL)
34
+ - FM-1.4: Memory/Context loss (FATAL)
35
+ - FM-2.6: Reasoning-action mismatch (FATAL)
36
  """
37
 
38
  retry_max: int = 0
 
45
  min_review_depth: int = 1
46
  consistency_check: bool = False
47
 
48
+ # MAST FC1: System Design (Easy task - spec ambiguity)
49
+ spec_fix: str = ""
50
+ explicit_role_spec: bool = False
51
+
52
+ # MAST FC2: Inter-Agent Misalignment (Medium task - format mismatch)
53
+ format_translator: bool = False
54
+
55
+ # MAST FC3: Task Verification (Hard task - verification failure)
56
+ # (uses consistency_check + min_review_depth)
57
+
58
+ # IBM 2026: FC1 - Termination Awareness (FATAL)
59
+ explicit_termination: bool = False
60
+ max_iterations: int = 0
61
+
62
+ # IBM 2026: FC1 - Memory/Context Management (FATAL)
63
+ context_summarization: bool = False
64
+ sliding_window: bool = False
65
+
66
+ # IBM 2026: FC2 - Reasoning-Action Alignment (FATAL)
67
+ action_validation: bool = False
68
+ reasoning_consistency_check: bool = False
69
+
70
+ # Agent's diagnosis of the failure mode
71
+ diagnosis: str = ""
72
 
73
 
74
  class StressTestObservation(Observation):
envs/agent_stress_test_env/openenv.yaml CHANGED
@@ -9,13 +9,13 @@ tasks:
9
  - id: easy
10
  name: "Specification Ambiguity Fix"
11
  difficulty: easy
12
- category: "MAST: Specification & System Design (41.8% of failures)"
13
  description: |
14
  The researcher agent has a vague role definition ('You are a helpful assistant').
15
  This causes task misinterpretation - the agent doesn't know what to research.
16
  Your task: Provide an explicit role specification JSON with clear capabilities,
17
  constraints, and success criteria.
18
- failure_mode: "Specification ambiguity - vague role definition causes task misinterpretation"
19
  grader:
20
  type: programmatic
21
  score_range: [0.0, 1.0]
@@ -30,12 +30,12 @@ tasks:
30
  - id: medium
31
  name: "Format Mismatch Fix"
32
  difficulty: medium
33
- category: "MAST: Inter-Agent Misalignment (36.9% of failures)"
34
  description: |
35
  Multi-agent workflow where the planner outputs YAML but the executor expects JSON.
36
  This format mismatch causes the executor to fail (cannot parse input).
37
  Your task: Add a format translation layer/middleware to convert YAML to JSON.
38
- failure_mode: "Format mismatch - planner outputs YAML, executor expects JSON"
39
  grader:
40
  type: programmatic
41
  score_range: [0.0, 1.0]
@@ -50,20 +50,21 @@ tasks:
50
  - id: hard
51
  name: "Verification Failure Fix"
52
  difficulty: hard
53
- category: "MAST: Task Verification (21.3% of failures)"
54
  description: |
55
  Multi-agent pipeline with verification failure. The writer produces content
56
  with contradictions (30% rate), and the reviewer prematurely approves (60% rate)
57
  without proper verification. This combines premature termination with incorrect verification.
58
  Your task: Implement multi-level verification - unit checks per agent,
59
  integration checks across outputs, and final validation against success criteria.
60
- failure_mode: "Verification failure - premature termination + incorrect verification"
61
  grader:
62
  type: programmatic
63
  score_range: [0.0, 1.0]
64
  criteria: |
65
  Based on MAST research: 21.3% of failures come from verification issues
66
  (6.2% premature, 8.2% no verification, 9.1% incorrect).
 
67
  The agent must add deep verification with explicit success criteria.
68
  - +0.15 for enabling consistency_check
69
  - +0.15 for setting min_review_depth >= 3
@@ -71,16 +72,92 @@ tasks:
71
  - +0.10 for achieving 50%+ success rate
72
  - +0.20 max for diagnosis keywords (partial credit)
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  difficulty_progression:
75
  - easy: "Single spec issue (vague role definition) - solution: explicit spec"
76
  - medium: "Inter-agent format mismatch (YAML vs JSON) - solution: translator"
77
  - hard: "Verification failure (premature + incorrect) - solution: multi-level checks"
 
 
 
78
 
79
  research_basis:
80
  - name: "MAST: Multi-Agent System Failure Taxonomy"
81
  source: "NeurIPS 2025 (Berkeley)"
82
  url: "https://arxiv.org/abs/2503.13657"
83
  key_finding: "Multi-agent LLM systems fail 41-86.7% of the time in production"
 
 
 
 
84
  - name: "Why Do Multi-Agent LLM Systems Fail?"
85
  source: "Future AGI 2026 Guide"
86
  url: "https://futureagi.substack.com/p/why-do-multi-agent-llm-systems-fail"
@@ -96,10 +173,17 @@ metadata:
96
  - mast-research
97
  - specification
98
  - verification
 
 
 
 
99
  created: 2026-04-08
100
- version: 1.1.0
101
  author: OpenEnv Hackathon
102
  benchmark_scores:
103
  easy: "Expected 0.85+ for strong LLM with explicit spec"
104
  medium: "Expected 0.60-0.75 for strong LLM with translator"
105
- hard: "Expected 0.35-0.50 for strong LLM with deep verification"
 
 
 
 
9
  - id: easy
10
  name: "Specification Ambiguity Fix"
11
  difficulty: easy
12
+ category: "MAST FC1: System Design (41.8% of failures)"
13
  description: |
14
  The researcher agent has a vague role definition ('You are a helpful assistant').
15
  This causes task misinterpretation - the agent doesn't know what to research.
16
  Your task: Provide an explicit role specification JSON with clear capabilities,
17
  constraints, and success criteria.
18
+ failure_mode: "FM-1.1: Specification ambiguity - vague role definition causes task misinterpretation"
19
  grader:
20
  type: programmatic
21
  score_range: [0.0, 1.0]
 
30
  - id: medium
31
  name: "Format Mismatch Fix"
32
  difficulty: medium
33
+ category: "MAST FC2: Inter-Agent Misalignment (36.9% of failures)"
34
  description: |
35
  Multi-agent workflow where the planner outputs YAML but the executor expects JSON.
36
  This format mismatch causes the executor to fail (cannot parse input).
37
  Your task: Add a format translation layer/middleware to convert YAML to JSON.
38
+ failure_mode: "FM-2.x: Format mismatch - planner outputs YAML, executor expects JSON"
39
  grader:
40
  type: programmatic
41
  score_range: [0.0, 1.0]
 
50
  - id: hard
51
  name: "Verification Failure Fix"
52
  difficulty: hard
53
+ category: "MAST FC3: Task Verification (21.3% of failures)"
54
  description: |
55
  Multi-agent pipeline with verification failure. The writer produces content
56
  with contradictions (30% rate), and the reviewer prematurely approves (60% rate)
57
  without proper verification. This combines premature termination with incorrect verification.
58
  Your task: Implement multi-level verification - unit checks per agent,
59
  integration checks across outputs, and final validation against success criteria.
60
+ failure_mode: "FM-3.1/FM-3.3: Verification failure - premature termination + incorrect verification"
61
  grader:
62
  type: programmatic
63
  score_range: [0.0, 1.0]
64
  criteria: |
65
  Based on MAST research: 21.3% of failures come from verification issues
66
  (6.2% premature, 8.2% no verification, 9.1% incorrect).
67
+ IBM 2026 update: FM-3.3 (Incorrect Verification) is the STRONGEST predictor of failure.
68
  The agent must add deep verification with explicit success criteria.
69
  - +0.15 for enabling consistency_check
70
  - +0.15 for setting min_review_depth >= 3
 
72
  - +0.10 for achieving 50%+ success rate
73
  - +0.20 max for diagnosis keywords (partial credit)
74
 
75
+ - id: termination
76
+ name: "Termination Awareness Fix"
77
+ difficulty: medium
78
+ category: "MAST FC1: System Design - FATAL FAILURE"
79
+ description: |
80
+ The agent struggles to recognize when a task is complete. It either:
81
+ - Loops indefinitely (FM-1.3 Step Repetition)
82
+ - Prematurely exits without confirming success (FM-3.1)
83
+ - Is unaware of termination conditions (FM-1.5)
84
+
85
+ Based on IBM 2026: Kimi-K2 shows +46% spike in termination issues.
86
+ Your task: Implement explicit termination conditions with success criteria verification.
87
+ failure_mode: "FM-1.5/FM-3.1: Unaware of termination + premature termination"
88
+ grader:
89
+ type: programmatic
90
+ score_range: [0.0, 1.0]
91
+ criteria: |
92
+ FATAL FAILURE: When these modes appear, success probability drops precipitously.
93
+ IBM 2026: Use deterministic state machine to enforce termination.
94
+ - +0.25 for enabling explicit termination detection
95
+ - +0.20 for implementing max_iterations limit
96
+ - +0.30 * success_rate from 10 simulation trials
97
+ - +0.15 for achieving 60%+ success rate
98
+ - +0.15 max for diagnosis keywords (partial credit)
99
+
100
+ - id: memory
101
+ name: "Conversation History Fix"
102
+ difficulty: hard
103
+ category: "MAST FC1: System Design - FATAL FAILURE"
104
+ description: |
105
+ As conversation history grows, the agent loses context and derails.
106
+ This is FM-1.4 (Loss of Conversation History) - unique fatal flaw.
107
+ Based on IBM 2026: GPT-OSS-120B shows 24% memory loss in long traces.
108
+ Your task: Implement context management - sliding window, summarization, or state machine.
109
+ failure_mode: "FM-1.4: Loss of conversation history - agent forgets original task"
110
+ grader:
111
+ type: programmatic
112
+ score_range: [0.0, 1.0]
113
+ criteria: |
114
+ FATAL FAILURE: Memory loss in long traces leads to total task derailment.
115
+ IBM 2026: Implement aggressive context hygiene and early error detection.
116
+ - +0.20 for enabling context summarization
117
+ - +0.20 for implementing sliding window
118
+ - +0.35 * success_rate from 10 simulation trials (with long context)
119
+ - +0.15 for achieving 50%+ success rate in long traces
120
+ - +0.15 max for diagnosis keywords (partial credit)
121
+
122
+ - id: reasoning
123
+ name: "Reasoning-Action Alignment Fix"
124
+ difficulty: hard
125
+ category: "MAST FC2: Inter-Agent Misalignment - FATAL FAILURE"
126
+ description: |
127
+ The agent identifies the correct next step but executes a redundant or irrelevant command.
128
+ FM-2.6: Reasoning-Action Mismatch - describes correct plan but executes unrelated tool call.
129
+ Based on IBM 2026: 92% of Kimi-K2 failures and 94% of GPT-OSS-120B failures show this.
130
+ Your task: Implement action validation layer that checks execution against reasoning.
131
+ failure_mode: "FM-2.6: Reasoning-action mismatch - correct thinking, wrong execution"
132
+ grader:
133
+ type: programmatic
134
+ score_range: [0.0, 1.0]
135
+ criteria: |
136
+ FATAL FAILURE: Decoupling of reasoning and action causes cascading collapse.
137
+ IBM 2026: Small reasoning mismatches early poison entire task history.
138
+ - +0.20 for enabling action validation
139
+ - +0.20 for implementing reasoning-execution consistency check
140
+ - +0.35 * success_rate from 10 simulation trials
141
+ - +0.15 for achieving 45%+ success rate
142
+ - +0.15 max for diagnosis keywords (partial credit)
143
+
144
  difficulty_progression:
145
  - easy: "Single spec issue (vague role definition) - solution: explicit spec"
146
  - medium: "Inter-agent format mismatch (YAML vs JSON) - solution: translator"
147
  - hard: "Verification failure (premature + incorrect) - solution: multi-level checks"
148
+ - termination: "Termination awareness (loops/premature exit) - solution: state machine"
149
+ - memory: "Conversation history loss (forgets context) - solution: context management"
150
+ - reasoning: "Reasoning-action mismatch (wrong execution) - solution: validation layer"
151
 
152
  research_basis:
153
  - name: "MAST: Multi-Agent System Failure Taxonomy"
154
  source: "NeurIPS 2025 (Berkeley)"
155
  url: "https://arxiv.org/abs/2503.13657"
156
  key_finding: "Multi-agent LLM systems fail 41-86.7% of the time in production"
157
+ - name: "IBM and UC Berkeley: Enterprise Agents Fail with IT-Bench and MAST"
158
+ source: "Hugging Face Blog (Feb 2026)"
159
+ url: "https://huggingface.co/blog/ibm-research/itbenchandmast"
160
+ key_finding: "FM-3.3 (Incorrect Verification) is strongest failure predictor; fatal vs non-fatal distinction critical"
161
  - name: "Why Do Multi-Agent LLM Systems Fail?"
162
  source: "Future AGI 2026 Guide"
163
  url: "https://futureagi.substack.com/p/why-do-multi-agent-llm-systems-fail"
 
173
  - mast-research
174
  - specification
175
  - verification
176
+ - termination-awareness
177
+ - memory-management
178
+ - reasoning-alignment
179
+ - ibm-research
180
  created: 2026-04-08
181
+ version: 1.2.0
182
  author: OpenEnv Hackathon
183
  benchmark_scores:
184
  easy: "Expected 0.85+ for strong LLM with explicit spec"
185
  medium: "Expected 0.60-0.75 for strong LLM with translator"
186
+ hard: "Expected 0.35-0.50 for strong LLM with deep verification"
187
+ termination: "Expected 0.50-0.65 for LLM with state machine"
188
+ memory: "Expected 0.40-0.55 for LLM with context management"
189
+ reasoning: "Expected 0.35-0.50 for LLM with validation layer"
envs/agent_stress_test_env/server/graders.py CHANGED
@@ -57,6 +57,12 @@ class Grader:
57
  "verify": 0.0,
58
  "check": 0.0,
59
  "review": 0.0,
 
 
 
 
 
 
60
  }
61
 
62
  # Specification keywords
@@ -87,6 +93,24 @@ class Grader:
87
  if "contradict" in diagnosis_lower:
88
  scores["contradiction"] = 0.10
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  return scores
91
 
92
  def grade(
@@ -406,5 +430,273 @@ def get_grader(task_id: str) -> Grader:
406
  "easy": EasyGrader(),
407
  "medium": MediumGrader(),
408
  "hard": HardGrader(),
 
 
 
409
  }
410
  return graders.get(task_id, EasyGrader())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  "verify": 0.0,
58
  "check": 0.0,
59
  "review": 0.0,
60
+ "termination": 0.0,
61
+ "loop": 0.0,
62
+ "memory": 0.0,
63
+ "context": 0.0,
64
+ "reasoning": 0.0,
65
+ "action": 0.0,
66
  }
67
 
68
  # Specification keywords
 
93
  if "contradict" in diagnosis_lower:
94
  scores["contradiction"] = 0.10
95
 
96
+ # Termination keywords (IBM 2026 - FATAL)
97
+ if "terminat" in diagnosis_lower or "loop" in diagnosis_lower:
98
+ scores["termination"] = 0.10
99
+ if "infinite" in diagnosis_lower or "repeat" in diagnosis_lower:
100
+ scores["loop"] = 0.10
101
+
102
+ # Memory/Context keywords (IBM 2026 - FATAL)
103
+ if "memory" in diagnosis_lower or "forget" in diagnosis_lower:
104
+ scores["memory"] = 0.10
105
+ if "context" in diagnosis_lower or "history" in diagnosis_lower:
106
+ scores["context"] = 0.10
107
+
108
+ # Reasoning-Action keywords (IBM 2026 - FATAL)
109
+ if "reason" in diagnosis_lower or "think" in diagnosis_lower:
110
+ scores["reasoning"] = 0.10
111
+ if "action" in diagnosis_lower or "execut" in diagnosis_lower:
112
+ scores["action"] = 0.10
113
+
114
  return scores
115
 
116
  def grade(
 
430
  "easy": EasyGrader(),
431
  "medium": MediumGrader(),
432
  "hard": HardGrader(),
433
+ "termination": TerminationGrader(),
434
+ "memory": MemoryGrader(),
435
+ "reasoning": ReasoningGrader(),
436
  }
437
  return graders.get(task_id, EasyGrader())
438
+
439
+
440
class TerminationGrader(Grader):
    """
    Grader for termination task: FM-1.5/FM-3.1 (IBM 2026 - FATAL FAILURE)

    Task: Agent struggles to recognize task completion - loops or prematurely exits.
    Fix: Implement explicit termination conditions with success criteria.

    Scoring (final score is clamped to [0.0, 1.0]):
      - +0.25 when ``explicit_termination`` is truthy in the agent config
      - +0.20 when ``max_iterations`` is a positive number
      - +0.30 * success rate over 10 simulated workflow runs
      - +0.15 when the success rate is at least 60%
      - up to +0.15 for termination/loop keywords found in the diagnosis
    """

    def __init__(self):
        super().__init__("termination", "medium")

    @staticmethod
    def _is_positive_number(value: Any) -> bool:
        """Return True when *value* is a number (or numeric string) above zero.

        Agent configs are free-form dictionaries, so ``max_iterations`` may
        arrive as ``None`` or as a string. Comparing such a value with ``> 0``
        raises ``TypeError``; here it simply counts as "no limit configured".
        """
        if isinstance(value, (int, float)):
            return value > 0
        if isinstance(value, str):
            try:
                return float(value) > 0
            except ValueError:
                return False
        return False

    def grade(
        self,
        agent_config: dict[str, Any],
        task_description: str,
        failure_mode: str,
        diagnosis: str,
    ) -> tuple[float, dict[str, Any]]:
        """Score the agent's termination fix.

        Args:
            agent_config: Raw configuration dictionary proposed by the agent.
            task_description: Unused by this grader; kept for the shared
                grader signature.
            failure_mode: Unused by this grader; kept for the shared
                grader signature.
            diagnosis: Free-text diagnosis, scanned for keywords.

        Returns:
            A ``(score, details)`` tuple where ``score`` lies in [0.0, 1.0]
            and ``details`` records the inputs to the score.
        """
        from .workflow_simulator import create_termination_task

        nodes, _, _ = create_termination_task()
        resilience = self._parse_config(agent_config, diagnosis)
        # One simulator with a fixed seed is reused for all ten trials.
        simulator = WorkflowSimulator(nodes, seed=42)

        results = [simulator.run_workflow(resilience).success for _ in range(10)]
        success_rate = sum(results) / len(results)

        has_termination_detection = agent_config.get("explicit_termination", False)
        # Guarded comparison: a non-numeric value must not crash grading.
        has_max_iterations = self._is_positive_number(
            agent_config.get("max_iterations", 0)
        )

        diagnosis_scores = self._parse_diagnosis(diagnosis)
        # Keyword credit is capped at 0.15 even if both keyword groups match.
        diagnosis_points = min(
            0.15, diagnosis_scores["termination"] + diagnosis_scores["loop"]
        )

        score = 0.0

        if has_termination_detection:
            score += 0.25
        if has_max_iterations:
            score += 0.20

        # A zero success rate contributes nothing, so no guard is needed.
        score += success_rate * 0.30

        if success_rate >= 0.6:
            score += 0.15

        score += diagnosis_points

        # The components can add up to 1.05, hence the clamp.
        score = min(1.0, max(0.0, score))

        return score, {
            "success_rate": success_rate,
            "has_termination_detection": has_termination_detection,
            "has_max_iterations": has_max_iterations,
            "diagnosis_points": diagnosis_points,
            "config": agent_config,
            "diagnosis": diagnosis,
        }

    def _parse_config(
        self, agent_config: dict[str, Any], diagnosis: str
    ) -> ResilienceConfig:
        """Build the ``ResilienceConfig`` handed to the workflow simulator.

        The agent's ``max_iterations`` is passed as ``retry_max`` (50 when the
        key is absent); every other field falls back to the default shown
        here when the agent did not supply it.
        """
        return ResilienceConfig(
            retry_max=agent_config.get("max_iterations", 50),
            retry_delay_ms=agent_config.get("retry_delay_ms", 0),
            timeout_ms=agent_config.get("timeout_ms", 30000),
            fallback=agent_config.get("fallback", "abort"),
            circuit_breaker_threshold=agent_config.get(
                "circuit_breaker_threshold", 1.0
            ),
            context_strategy=agent_config.get("context_strategy", "truncate"),
            context_summarization_threshold=agent_config.get(
                "context_summarization_threshold", 500
            ),
            min_review_depth=agent_config.get("min_review_depth", 1),
            consistency_check=agent_config.get("consistency_check", False),
            explicit_termination=agent_config.get("explicit_termination", False),
            diagnosis=diagnosis,
        )
525
+
526
+
527
class MemoryGrader(Grader):
    """
    Grader for memory task: FM-1.4 (IBM 2026 - FATAL FAILURE)

    Task: Agent loses conversation history in long traces - forgets original task.
    Fix: Implement context management (sliding window, summarization, state machine).

    The score combines two config flags, the success rate of ten simulated
    runs, a threshold bonus, and capped diagnosis-keyword credit, and is
    clamped to [0.0, 1.0].
    """

    def __init__(self):
        super().__init__("memory", "hard")

    def grade(
        self,
        agent_config: dict[str, Any],
        task_description: str,
        failure_mode: str,
        diagnosis: str,
    ) -> tuple[float, dict[str, Any]]:
        """Score the agent's context-management fix.

        ``task_description`` and ``failure_mode`` are accepted for the shared
        grader signature but are not read here.

        Returns:
            ``(score, details)`` with ``score`` in [0.0, 1.0].
        """
        from .workflow_simulator import create_memory_task

        nodes, _, _ = create_memory_task()
        resilience = self._parse_config(agent_config, diagnosis)
        simulator = WorkflowSimulator(nodes, seed=42)

        # Ten trials on the same seeded simulator instance.
        outcomes = [simulator.run_workflow(resilience).success for _ in range(10)]
        success_rate = sum(outcomes) / len(outcomes)

        has_summarization = agent_config.get("context_summarization", False)
        has_sliding_window = agent_config.get("sliding_window", False)

        keyword_hits = self._parse_diagnosis(diagnosis)
        keyword_total = keyword_hits["memory"] + keyword_hits["context"]
        # Keyword credit never exceeds 0.15.
        diagnosis_points = min(0.15, keyword_total)

        score = 0.0

        # +0.20 for each enabled context-management mechanism.
        if has_summarization:
            score += 0.20
        if has_sliding_window:
            score += 0.20

        # Proportional credit for simulated successes.
        if success_rate > 0:
            score += success_rate * 0.35

        # Bonus once at least half of the trials succeed.
        if success_rate >= 0.5:
            score += 0.15

        score += diagnosis_points
        score = min(1.0, max(0.0, score))

        details = {
            "success_rate": success_rate,
            "has_summarization": has_summarization,
            "has_sliding_window": has_sliding_window,
            "diagnosis_points": diagnosis_points,
            "config": agent_config,
            "diagnosis": diagnosis,
        }
        return score, details

    def _parse_config(
        self, agent_config: dict[str, Any], diagnosis: str
    ) -> ResilienceConfig:
        """Translate the agent's dictionary into a ``ResilienceConfig``.

        Each field is read from ``agent_config`` under its own name, using the
        fallback listed below when the key is missing.
        """
        fallbacks = {
            "retry_max": 0,
            "retry_delay_ms": 0,
            "timeout_ms": 30000,
            "fallback": "abort",
            "circuit_breaker_threshold": 1.0,
            "context_strategy": "summarize",
            "context_summarization_threshold": 200,
            "min_review_depth": 1,
            "consistency_check": False,
            "context_summarization": False,
            "sliding_window": False,
        }
        settings = {
            field: agent_config.get(field, default)
            for field, default in fallbacks.items()
        }
        return ResilienceConfig(**settings, diagnosis=diagnosis)
613
+
614
+
615
class ReasoningGrader(Grader):
    """
    Grader for reasoning-action alignment: FM-2.6 (IBM 2026 - FATAL FAILURE)

    Task: Agent describes correct plan but executes unrelated/redundant command.
    Fix: Implement action validation layer checking execution against reasoning.

    The score combines two config flags, the success rate of ten simulated
    runs, a threshold bonus, and capped diagnosis-keyword credit, and is
    clamped to [0.0, 1.0].
    """

    def __init__(self):
        super().__init__("reasoning", "hard")

    def grade(
        self,
        agent_config: dict[str, Any],
        task_description: str,
        failure_mode: str,
        diagnosis: str,
    ) -> tuple[float, dict[str, Any]]:
        """Score the agent's reasoning-action alignment fix.

        ``task_description`` and ``failure_mode`` are accepted for the shared
        grader signature but are not read here.

        Returns:
            ``(score, details)`` with ``score`` in [0.0, 1.0].
        """
        from .workflow_simulator import create_reasoning_task

        nodes, _, _ = create_reasoning_task()
        resilience = self._parse_config(agent_config, diagnosis)
        simulator = WorkflowSimulator(nodes, seed=42)

        # Ten trials on the same seeded simulator instance.
        trial_results = [
            simulator.run_workflow(resilience).success for _ in range(10)
        ]
        success_rate = sum(trial_results) / len(trial_results)

        has_action_validation = agent_config.get("action_validation", False)
        has_consistency_check = agent_config.get("reasoning_consistency_check", False)

        keyword_scores = self._parse_diagnosis(diagnosis)
        keyword_sum = keyword_scores["reasoning"] + keyword_scores["action"]
        # Keyword credit never exceeds 0.15.
        diagnosis_points = min(0.15, keyword_sum)

        score = 0.0

        # +0.20 for each enabled validation mechanism.
        if has_action_validation:
            score += 0.20
        if has_consistency_check:
            score += 0.20

        # Proportional credit for simulated successes.
        if success_rate > 0:
            score += success_rate * 0.35

        # Bonus once the success rate reaches 45%.
        if success_rate >= 0.45:
            score += 0.15

        score += diagnosis_points
        score = min(1.0, max(0.0, score))

        details = {
            "success_rate": success_rate,
            "has_action_validation": has_action_validation,
            "has_consistency_check": has_consistency_check,
            "diagnosis_points": diagnosis_points,
            "config": agent_config,
            "diagnosis": diagnosis,
        }
        return score, details

    def _parse_config(
        self, agent_config: dict[str, Any], diagnosis: str
    ) -> ResilienceConfig:
        """Translate the agent's dictionary into a ``ResilienceConfig``.

        Each field is read from ``agent_config`` under its own name, using the
        fallback listed below when the key is missing.
        """
        fallbacks = {
            "retry_max": 0,
            "retry_delay_ms": 0,
            "timeout_ms": 30000,
            "fallback": "abort",
            "circuit_breaker_threshold": 1.0,
            "context_strategy": "truncate",
            "context_summarization_threshold": 500,
            "min_review_depth": 1,
            "consistency_check": False,
            "action_validation": False,
            "reasoning_consistency_check": False,
        }
        settings = {
            field: agent_config.get(field, default)
            for field, default in fallbacks.items()
        }
        return ResilienceConfig(**settings, diagnosis=diagnosis)
envs/agent_stress_test_env/server/stress_test_environment.py CHANGED
@@ -23,6 +23,9 @@ try:
23
  create_easy_task,
24
  create_hard_task,
25
  create_medium_task,
 
 
 
26
  )
27
  except ImportError:
28
  from openenv.core.env_server.interfaces import (
@@ -48,23 +51,44 @@ TASK_DEFINITIONS = {
48
  "easy": {
49
  "id": "easy",
50
  "difficulty": "easy",
51
- "category": "MAST: Specification & System Design (41.8% of failures)",
52
  "description": "The researcher agent has a vague role definition ('You are a helpful assistant'). This causes task misinterpretation. Your task: Provide an explicit role specification JSON with clear capabilities, constraints, and success criteria.",
53
- "failure_mode": "Specification ambiguity - vague role definition causes task misinterpretation",
54
  },
55
  "medium": {
56
  "id": "medium",
57
  "difficulty": "medium",
58
- "category": "MAST: Inter-Agent Misalignment (36.9% of failures)",
59
  "description": "Multi-agent workflow where the planner outputs YAML but the executor expects JSON. This format mismatch causes the executor to fail. Your task: Add a format translation layer/middleware.",
60
- "failure_mode": "Format mismatch - planner outputs YAML, executor expects JSON",
61
  },
62
  "hard": {
63
  "id": "hard",
64
  "difficulty": "hard",
65
- "category": "MAST: Task Verification (21.3% of failures)",
66
- "description": "Multi-agent pipeline with verification failure. Writer produces contradictions (30%), reviewer prematurely approves (60%) without checks. Your task: Implement multi-level verification.",
67
- "failure_mode": "Verification failure - premature termination + incorrect verification",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  },
69
  }
70
 
@@ -116,6 +140,14 @@ class StressTestEnvironment(
116
  step_count=0,
117
  )
118
  self._current_task_index = 0
 
 
 
 
 
 
 
 
119
 
120
  task = TASK_DEFINITIONS["easy"]
121
 
@@ -207,33 +239,31 @@ class StressTestEnvironment(
207
  self._state.total_score = combined_score
208
  self._state.step_count += 1
209
 
210
- # Return combined result for the easy task (as reference)
211
  task_id = "all_tasks"
212
  task = {
213
- "description": "All 3 tasks (Easy: Spec, Medium: Format, Hard: Verification)",
214
- "failure_mode": "Combined MAST failure modes",
215
  "category": "MAST: All categories",
216
  }
217
 
218
  obs = StressTestObservation(
219
  task_id="all_tasks",
220
- task_description=f"Easy: {all_scores[0]:.2f}, Medium: {all_scores[1]:.2f}, Hard: {all_scores[2]:.2f} | Combined: {combined_score:.2f}",
221
- scenario_setup="All 3 MAST failure categories evaluated",
222
- failure_category="MAST: Spec (41.8%) + Inter-Agent (36.9%) + Verification (21.3%)",
223
  failure_mode_detected=True,
224
- failure_mode_description="Specification, Format Mismatch, and Verification failures",
225
  resilience_applied=True,
226
  applied_config=json.dumps(agent_config),
227
  test_passed=combined_score >= 0.5,
228
- test_completions=int(
229
- all_scores[0] * 10
230
- ), # Report easy task completions
231
- test_total_trials=30, # Total across all tasks
232
  test_latency_ms=0,
233
  diagnosis=f"Task scores: {all_scores}",
234
  diagnosis_points=0.0,
235
  reward=combined_score,
236
- done=True, # All tasks done in one step
237
  )
238
 
239
  return obs
@@ -302,6 +332,24 @@ class StressTestEnvironment(
302
  # Hard: Verification fix
303
  config["consistency_check"] = agent_config.get("consistency_check", False)
304
  config["min_review_depth"] = agent_config.get("min_review_depth", 1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
 
306
  return config
307
 
 
23
  create_easy_task,
24
  create_hard_task,
25
  create_medium_task,
26
+ create_termination_task,
27
+ create_memory_task,
28
+ create_reasoning_task,
29
  )
30
  except ImportError:
31
  from openenv.core.env_server.interfaces import (
 
51
  "easy": {
52
  "id": "easy",
53
  "difficulty": "easy",
54
+ "category": "MAST FC1: System Design (41.8% of failures)",
55
  "description": "The researcher agent has a vague role definition ('You are a helpful assistant'). This causes task misinterpretation. Your task: Provide an explicit role specification JSON with clear capabilities, constraints, and success criteria.",
56
+ "failure_mode": "FM-1.1: Specification ambiguity - vague role definition causes task misinterpretation",
57
  },
58
  "medium": {
59
  "id": "medium",
60
  "difficulty": "medium",
61
+ "category": "MAST FC2: Inter-Agent Misalignment (36.9% of failures)",
62
  "description": "Multi-agent workflow where the planner outputs YAML but the executor expects JSON. This format mismatch causes the executor to fail. Your task: Add a format translation layer/middleware.",
63
+ "failure_mode": "FM-2.x: Format mismatch - planner outputs YAML, executor expects JSON",
64
  },
65
  "hard": {
66
  "id": "hard",
67
  "difficulty": "hard",
68
+ "category": "MAST FC3: Task Verification (21.3% of failures)",
69
+ "description": "Multi-agent pipeline with verification failure. Writer produces contradictions (30%), reviewer prematurely approves (60%) without checks. Your task: Implement multi-level verification. IBM 2026: FM-3.3 is strongest failure predictor.",
70
+ "failure_mode": "FM-3.1/FM-3.3: Verification failure - premature termination + incorrect verification",
71
+ },
72
+ "termination": {
73
+ "id": "termination",
74
+ "difficulty": "medium",
75
+ "category": "MAST FC1: System Design - FATAL FAILURE",
76
+ "description": "The agent struggles to recognize when a task is complete. It loops indefinitely or prematurely exits. Based on IBM 2026: Kimi-K2 shows +46% spike in termination issues. Your task: Implement explicit termination conditions with success criteria.",
77
+ "failure_mode": "FM-1.5/FM-3.1: Unaware of termination + premature termination",
78
+ },
79
+ "memory": {
80
+ "id": "memory",
81
+ "difficulty": "hard",
82
+ "category": "MAST FC1: System Design - FATAL FAILURE",
83
+ "description": "As conversation history grows, the agent loses context and derails. Based on IBM 2026: GPT-OSS-120B shows 24% memory loss in long traces. Your task: Implement context management - sliding window, summarization, or state machine.",
84
+ "failure_mode": "FM-1.4: Loss of conversation history - agent forgets original task",
85
+ },
86
+ "reasoning": {
87
+ "id": "reasoning",
88
+ "difficulty": "hard",
89
+ "category": "MAST FC2: Inter-Agent Misalignment - FATAL FAILURE",
90
+ "description": "The agent describes correct plan but executes unrelated command. Based on IBM 2026: 92% of Kimi-K2 failures and 94% of GPT-OSS-120B failures show this. Your task: Implement action validation layer checking execution against reasoning.",
91
+ "failure_mode": "FM-2.6: Reasoning-action mismatch - correct thinking, wrong execution",
92
  },
93
  }
94
 
 
140
  step_count=0,
141
  )
142
  self._current_task_index = 0
143
+ self._task_ids = [
144
+ "easy",
145
+ "medium",
146
+ "hard",
147
+ "termination",
148
+ "memory",
149
+ "reasoning",
150
+ ]
151
 
152
  task = TASK_DEFINITIONS["easy"]
153
 
 
239
  self._state.total_score = combined_score
240
  self._state.step_count += 1
241
 
242
+ # Return combined result for all tasks
243
  task_id = "all_tasks"
244
  task = {
245
+ "description": "All 6 tasks (Easy/Medium/Hard + Termination/Memory/Reasoning)",
246
+ "failure_mode": "Combined MAST failure modes including IBM 2026 FATAL failures",
247
  "category": "MAST: All categories",
248
  }
249
 
250
  obs = StressTestObservation(
251
  task_id="all_tasks",
252
+ task_description=f"Easy: {all_scores[0]:.2f}, Medium: {all_scores[1]:.2f}, Hard: {all_scores[2]:.2f}, Term: {all_scores[3]:.2f}, Mem: {all_scores[4]:.2f}, Reas: {all_scores[5]:.2f} | Combined: {combined_score:.2f}",
253
+ scenario_setup="All 6 MAST failure categories evaluated including IBM 2026 fatal failures",
254
+ failure_category="MAST: Spec (41.8%) + Inter-Agent (36.9%) + Verification (21.3%) + IBM FATAL (termination, memory, reasoning)",
255
  failure_mode_detected=True,
256
+ failure_mode_description="Specification, Format Mismatch, Verification, Termination, Memory, and Reasoning-Action failures",
257
  resilience_applied=True,
258
  applied_config=json.dumps(agent_config),
259
  test_passed=combined_score >= 0.5,
260
+ test_completions=int(all_scores[0] * 10),
261
+ test_total_trials=60, # Total across all 6 tasks
 
 
262
  test_latency_ms=0,
263
  diagnosis=f"Task scores: {all_scores}",
264
  diagnosis_points=0.0,
265
  reward=combined_score,
266
+ done=True,
267
  )
268
 
269
  return obs
 
332
  # Hard: Verification fix
333
  config["consistency_check"] = agent_config.get("consistency_check", False)
334
  config["min_review_depth"] = agent_config.get("min_review_depth", 1)
335
+ elif task_id == "termination":
336
+ # Termination: FM-1.5/FM-3.1 (IBM 2026 - FATAL)
337
+ config["explicit_termination"] = agent_config.get(
338
+ "explicit_termination", False
339
+ )
340
+ config["max_iterations"] = agent_config.get("max_iterations", 0)
341
+ elif task_id == "memory":
342
+ # Memory: FM-1.4 (IBM 2026 - FATAL)
343
+ config["context_summarization"] = agent_config.get(
344
+ "context_summarization", False
345
+ )
346
+ config["sliding_window"] = agent_config.get("sliding_window", False)
347
+ elif task_id == "reasoning":
348
+ # Reasoning: FM-2.6 (IBM 2026 - FATAL)
349
+ config["action_validation"] = agent_config.get("action_validation", False)
350
+ config["reasoning_consistency_check"] = agent_config.get(
351
+ "reasoning_consistency_check", False
352
+ )
353
 
354
  return config
355
 
envs/agent_stress_test_env/server/workflow_simulator.py CHANGED
@@ -53,10 +53,17 @@ class ResilienceConfig:
53
  context_summarization_threshold: int = 500
54
  min_review_depth: int = 1
55
  consistency_check: bool = False
56
- diagnosis: str = "" # Agent's diagnosis of the failure
57
- spec_fix: str = "" # Agent's spec improvement (for spec failures)
58
- explicit_role_spec: bool = False # Flag: provided explicit spec
59
- format_translator: bool = False # For format mismatch failures
 
 
 
 
 
 
 
60
 
61
 
62
  @dataclass
@@ -430,3 +437,127 @@ def create_hard_task() -> tuple[list[NodeConfig], str, str]:
430
  "Verification failure - premature termination + incorrect verification"
431
  )
432
  return nodes, description, failure_mode
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  context_summarization_threshold: int = 500
54
  min_review_depth: int = 1
55
  consistency_check: bool = False
56
+ # IBM 2026: FC1 - Termination Awareness (FATAL)
57
+ explicit_termination: bool = False
58
+ max_iterations: int = 0
59
+
60
+ # IBM 2026: FC1 - Memory/Context Management (FATAL)
61
+ context_summarization: bool = False
62
+ sliding_window: bool = False
63
+
64
+ # IBM 2026: FC2 - Reasoning-Action Alignment (FATAL)
65
+ action_validation: bool = False
66
+ reasoning_consistency_check: bool = False
67
 
68
 
69
  @dataclass
 
437
  "Verification failure - premature termination + incorrect verification"
438
  )
439
  return nodes, description, failure_mode
440
+
441
+
442
def create_termination_task() -> tuple[list[NodeConfig], str, str]:
    """
    Build the termination-awareness scenario (FM-1.5/FM-3.1, IBM 2026 - FATAL).

    The scenario consists of one researcher node followed in the list by two
    worker nodes; each worker is configured with a 20% failure rate.

    Returns:
        A ``(nodes, description, failure_mode)`` tuple: the node
        configurations, the task text shown to the agent, and the short
        failure-mode label.
    """
    failure_mode = "FM-1.5/FM-3.1: Unaware of termination + premature termination"

    # The task text is assembled from sentence fragments joined by one space.
    description = " ".join(
        [
            "The agent struggles to recognize when a task is complete. It either:",
            "- Loops indefinitely (FM-1.3 Step Repetition)",
            "- Prematurely exits without confirming success (FM-3.1)",
            "- Is unaware of termination conditions (FM-1.5)",
            "Based on IBM 2026: Kimi-K2 shows +46% spike in termination issues.",
            "Your task: Implement explicit termination conditions with success criteria verification.",
        ]
    )

    researcher = NodeConfig(
        node_id="researcher",
        role="researcher",
        role_definition="Research and produce a detailed report",
        latency_ms=100,
    )
    # Both workers fail occasionally (fail_rate=0.2).
    first_worker = NodeConfig(
        node_id="worker1",
        role="worker",
        role_definition="Process research findings",
        fail_rate=0.2,
        latency_ms=100,
    )
    second_worker = NodeConfig(
        node_id="worker2",
        role="worker",
        role_definition="Process worker1 output",
        fail_rate=0.2,
        latency_ms=100,
    )

    return [researcher, first_worker, second_worker], description, failure_mode
482
+
483
+
484
def create_memory_task() -> tuple[list[NodeConfig], str, str]:
    """
    Build the conversation-history scenario (FM-1.4, IBM 2026 - FATAL).

    Three analyzer nodes are created, all sharing the same small
    ``context_limit`` of 200 and a latency of 100 ms; they differ only in
    their id and role definition.

    Returns:
        A ``(nodes, description, failure_mode)`` tuple: the node
        configurations, the task text shown to the agent, and the short
        failure-mode label.
    """
    # (node_id, role_definition) for each analyzer, in list order.
    analyzer_specs = (
        ("analyzer1", "Analyze data and produce findings"),
        ("analyzer2", "Analyze analyzer1 output with original context"),
        ("analyzer3", "Synthesize all previous findings"),
    )
    nodes = [
        NodeConfig(
            node_id=analyzer_id,
            role="analyzer",
            role_definition=definition,
            context_limit=200,  # small context, meant to trigger memory issues
            latency_ms=100,
        )
        for analyzer_id, definition in analyzer_specs
    ]

    # The task text is assembled from sentences joined by one space.
    description = " ".join(
        [
            "As conversation history grows, the agent loses context and derails.",
            "This is FM-1.4 (Loss of Conversation History) - unique fatal flaw.",
            "Based on IBM 2026: GPT-OSS-120B shows 24% memory loss in long traces.",
            "Your task: Implement context management - sliding window, summarization, or state machine.",
        ]
    )
    failure_mode = "FM-1.4: Loss of conversation history - agent forgets original task"

    return nodes, description, failure_mode
523
+
524
+
525
def create_reasoning_task() -> tuple[list[NodeConfig], str, str]:
    """
    Build the reasoning-action scenario (FM-2.6, IBM 2026 - FATAL).

    The scenario lists a planner, an executor and a verifier. Only the
    executor carries an ``output_corruption_rate`` (0.4), which stands in
    for executing the wrong action.

    Returns:
        A ``(nodes, description, failure_mode)`` tuple: the node
        configurations, the task text shown to the agent, and the short
        failure-mode label.
    """
    failure_mode = "FM-2.6: Reasoning-action mismatch - correct thinking, wrong execution"

    # The task text is assembled from sentences joined by one space.
    description = " ".join(
        [
            "The agent identifies the correct next step but executes a redundant or irrelevant command.",
            "FM-2.6: Reasoning-Action Mismatch - describes correct plan but executes unrelated tool call.",
            "Based on IBM 2026: 92% of Kimi-K2 failures and 94% of GPT-OSS-120B failures show this.",
            "Your task: Implement action validation layer that checks execution against reasoning.",
        ]
    )

    planner = NodeConfig(
        node_id="planner",
        role="planner",
        role_definition="Plan the next action based on current state",
        latency_ms=100,
    )
    executor = NodeConfig(
        node_id="executor",
        role="executor",
        role_definition="Execute the planned action",
        output_corruption_rate=0.4,  # 40% chance of executing wrong action
        latency_ms=100,
    )
    verifier = NodeConfig(
        node_id="verifier",
        role="verifier",
        role_definition="Verify execution matches plan",
        latency_ms=100,
    )

    return [planner, executor, verifier], description, failure_mode
models.py CHANGED
@@ -23,9 +23,16 @@ class ResilienceConfig(Action):
23
 
24
  The agent outputs this to fix multi-agent workflow failures.
25
  Supports different fix types based on failure mode:
26
- - spec_fix: For specification ambiguity (Easy task)
27
- - format_translator: For format mismatches (Medium task)
28
- - consistency_check + min_review_depth: For verification failures (Hard task)
 
 
 
 
 
 
 
29
  """
30
 
31
  retry_max: int = 0
@@ -38,11 +45,30 @@ class ResilienceConfig(Action):
38
  min_review_depth: int = 1
39
  consistency_check: bool = False
40
 
41
- # New fields for MAST-based failure modes
42
- spec_fix: str = "" # Explicit role specification (JSON schema)
43
- explicit_role_spec: bool = False # Flag: provided explicit spec
44
- format_translator: bool = False # Flag: added format translation
45
- diagnosis: str = "" # Agent's diagnosis of the failure mode
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
 
48
  class StressTestObservation(Observation):
 
23
 
24
  The agent outputs this to fix multi-agent workflow failures.
25
  Supports different fix types based on failure mode:
26
+
27
+ MAST Categories (NeurIPS 2025):
28
+ - FC1: System Design (41.8%) - spec, termination, memory
29
+ - FC2: Inter-Agent Misalignment (36.9%) - format, reasoning-action
30
+ - FC3: Task Verification (21.3%) - verification checks
31
+
32
+ IBM 2026 Updates:
33
+ - FM-1.5/FM-3.1: Termination awareness (FATAL)
34
+ - FM-1.4: Memory/Context loss (FATAL)
35
+ - FM-2.6: Reasoning-action mismatch (FATAL)
36
  """
37
 
38
  retry_max: int = 0
 
45
  min_review_depth: int = 1
46
  consistency_check: bool = False
47
 
48
+ # MAST FC1: System Design (Easy task - spec ambiguity)
49
+ spec_fix: str = ""
50
+ explicit_role_spec: bool = False
51
+
52
+ # MAST FC2: Inter-Agent Misalignment (Medium task - format mismatch)
53
+ format_translator: bool = False
54
+
55
+ # MAST FC3: Task Verification (Hard task - verification failure)
56
+ # (uses consistency_check + min_review_depth)
57
+
58
+ # IBM 2026: FC1 - Termination Awareness (FATAL)
59
+ explicit_termination: bool = False
60
+ max_iterations: int = 0
61
+
62
+ # IBM 2026: FC1 - Memory/Context Management (FATAL)
63
+ context_summarization: bool = False
64
+ sliding_window: bool = False
65
+
66
+ # IBM 2026: FC2 - Reasoning-Action Alignment (FATAL)
67
+ action_validation: bool = False
68
+ reasoning_consistency_check: bool = False
69
+
70
+ # Agent's diagnosis of the failure mode
71
+ diagnosis: str = ""
72
 
73
 
74
  class StressTestObservation(Observation):
openenv.yaml CHANGED
@@ -9,13 +9,13 @@ tasks:
9
  - id: easy
10
  name: "Specification Ambiguity Fix"
11
  difficulty: easy
12
- category: "MAST: Specification & System Design (41.8% of failures)"
13
  description: |
14
  The researcher agent has a vague role definition ('You are a helpful assistant').
15
  This causes task misinterpretation - the agent doesn't know what to research.
16
  Your task: Provide an explicit role specification JSON with clear capabilities,
17
  constraints, and success criteria.
18
- failure_mode: "Specification ambiguity - vague role definition causes task misinterpretation"
19
  grader:
20
  type: programmatic
21
  score_range: [0.0, 1.0]
@@ -30,12 +30,12 @@ tasks:
30
  - id: medium
31
  name: "Format Mismatch Fix"
32
  difficulty: medium
33
- category: "MAST: Inter-Agent Misalignment (36.9% of failures)"
34
  description: |
35
  Multi-agent workflow where the planner outputs YAML but the executor expects JSON.
36
  This format mismatch causes the executor to fail (cannot parse input).
37
  Your task: Add a format translation layer/middleware to convert YAML to JSON.
38
- failure_mode: "Format mismatch - planner outputs YAML, executor expects JSON"
39
  grader:
40
  type: programmatic
41
  score_range: [0.0, 1.0]
@@ -50,20 +50,21 @@ tasks:
50
  - id: hard
51
  name: "Verification Failure Fix"
52
  difficulty: hard
53
- category: "MAST: Task Verification (21.3% of failures)"
54
  description: |
55
  Multi-agent pipeline with verification failure. The writer produces content
56
  with contradictions (30% rate), and the reviewer prematurely approves (60% rate)
57
  without proper verification. This combines premature termination with incorrect verification.
58
  Your task: Implement multi-level verification - unit checks per agent,
59
  integration checks across outputs, and final validation against success criteria.
60
- failure_mode: "Verification failure - premature termination + incorrect verification"
61
  grader:
62
  type: programmatic
63
  score_range: [0.0, 1.0]
64
  criteria: |
65
  Based on MAST research: 21.3% of failures come from verification issues
66
  (6.2% premature, 8.2% no verification, 9.1% incorrect).
 
67
  The agent must add deep verification with explicit success criteria.
68
  - +0.15 for enabling consistency_check
69
  - +0.15 for setting min_review_depth >= 3
@@ -71,16 +72,92 @@ tasks:
71
  - +0.10 for achieving 50%+ success rate
72
  - +0.20 max for diagnosis keywords (partial credit)
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  difficulty_progression:
75
  - easy: "Single spec issue (vague role definition) - solution: explicit spec"
76
  - medium: "Inter-agent format mismatch (YAML vs JSON) - solution: translator"
77
  - hard: "Verification failure (premature + incorrect) - solution: multi-level checks"
 
 
 
78
 
79
  research_basis:
80
  - name: "MAST: Multi-Agent System Failure Taxonomy"
81
  source: "NeurIPS 2025 (Berkeley)"
82
  url: "https://arxiv.org/abs/2503.13657"
83
  key_finding: "Multi-agent LLM systems fail 41-86.7% of the time in production"
 
 
 
 
84
  - name: "Why Do Multi-Agent LLM Systems Fail?"
85
  source: "Future AGI 2026 Guide"
86
  url: "https://futureagi.substack.com/p/why-do-multi-agent-llm-systems-fail"
@@ -96,10 +173,17 @@ metadata:
96
  - mast-research
97
  - specification
98
  - verification
 
 
 
 
99
  created: 2026-04-08
100
- version: 1.1.0
101
  author: OpenEnv Hackathon
102
  benchmark_scores:
103
  easy: "Expected 0.85+ for strong LLM with explicit spec"
104
  medium: "Expected 0.60-0.75 for strong LLM with translator"
105
- hard: "Expected 0.35-0.50 for strong LLM with deep verification"
 
 
 
 
9
  - id: easy
10
  name: "Specification Ambiguity Fix"
11
  difficulty: easy
12
+ category: "MAST FC1: System Design (41.8% of failures)"
13
  description: |
14
  The researcher agent has a vague role definition ('You are a helpful assistant').
15
  This causes task misinterpretation - the agent doesn't know what to research.
16
  Your task: Provide an explicit role specification JSON with clear capabilities,
17
  constraints, and success criteria.
18
+ failure_mode: "FM-1.1: Specification ambiguity - vague role definition causes task misinterpretation"
19
  grader:
20
  type: programmatic
21
  score_range: [0.0, 1.0]
 
30
  - id: medium
31
  name: "Format Mismatch Fix"
32
  difficulty: medium
33
+ category: "MAST FC2: Inter-Agent Misalignment (36.9% of failures)"
34
  description: |
35
  Multi-agent workflow where the planner outputs YAML but the executor expects JSON.
36
  This format mismatch causes the executor to fail (cannot parse input).
37
  Your task: Add a format translation layer/middleware to convert YAML to JSON.
38
+ failure_mode: "FM-2.x: Format mismatch - planner outputs YAML, executor expects JSON"
39
  grader:
40
  type: programmatic
41
  score_range: [0.0, 1.0]
 
50
  - id: hard
51
  name: "Verification Failure Fix"
52
  difficulty: hard
53
+ category: "MAST FC3: Task Verification (21.3% of failures)"
54
  description: |
55
  Multi-agent pipeline with verification failure. The writer produces content
56
  with contradictions (30% rate), and the reviewer prematurely approves (60% rate)
57
  without proper verification. This combines premature termination with incorrect verification.
58
  Your task: Implement multi-level verification - unit checks per agent,
59
  integration checks across outputs, and final validation against success criteria.
60
+ failure_mode: "FM-3.1/FM-3.3: Verification failure - premature termination + incorrect verification"
61
  grader:
62
  type: programmatic
63
  score_range: [0.0, 1.0]
64
  criteria: |
65
  Based on MAST research: 21.3% of failures come from verification issues
66
  (6.2% premature, 8.2% no verification, 9.1% incorrect).
67
+ IBM 2026 update: FM-3.3 (Incorrect Verification) is the STRONGEST predictor of failure.
68
  The agent must add deep verification with explicit success criteria.
69
  - +0.15 for enabling consistency_check
70
  - +0.15 for setting min_review_depth >= 3
 
72
  - +0.10 for achieving 50%+ success rate
73
  - +0.20 max for diagnosis keywords (partial credit)
74
 
75
+ - id: termination
76
+ name: "Termination Awareness Fix"
77
+ difficulty: medium
78
+ category: "MAST FC1: System Design - FATAL FAILURE"
79
+ description: |
80
+ The agent struggles to recognize when a task is complete. It either:
81
+ - Loops indefinitely (FM-1.3 Step Repetition)
82
+ - Prematurely exits without confirming success (FM-3.1)
83
+ - Is unaware of termination conditions (FM-1.5)
84
+
85
+ Based on IBM 2026: Kimi-K2 shows +46% spike in termination issues.
86
+ Your task: Implement explicit termination conditions with success criteria verification.
87
+ failure_mode: "FM-1.5/FM-3.1: Unaware of termination + premature termination"
88
+ grader:
89
+ type: programmatic
90
+ score_range: [0.0, 1.0]
91
+ criteria: |
92
+ FATAL FAILURE: When these modes appear, success probability drops precipitously.
93
+ IBM 2026: Use deterministic state machine to enforce termination.
94
+ - +0.25 for enabling explicit termination detection
95
+ - +0.20 for implementing max_iterations limit
96
+ - +0.30 * success_rate from 10 simulation trials
97
+ - +0.15 for achieving 60%+ success rate
98
+ - +0.15 max for diagnosis keywords (partial credit)
99
+
100
+ - id: memory
101
+ name: "Conversation History Fix"
102
+ difficulty: hard
103
+ category: "MAST FC1: System Design - FATAL FAILURE"
104
+ description: |
105
+ As conversation history grows, the agent loses context and derails.
106
+ This is FM-1.4 (Loss of Conversation History) - unique fatal flaw.
107
+ Based on IBM 2026: GPT-OSS-120B shows 24% memory loss in long traces.
108
+ Your task: Implement context management - sliding window, summarization, or state machine.
109
+ failure_mode: "FM-1.4: Loss of conversation history - agent forgets original task"
110
+ grader:
111
+ type: programmatic
112
+ score_range: [0.0, 1.0]
113
+ criteria: |
114
+ FATAL FAILURE: Memory loss in long traces leads to total task derailment.
115
+ IBM 2026: Implement aggressive context hygiene and early error detection.
116
+ - +0.20 for enabling context summarization
117
+ - +0.20 for implementing sliding window
118
+ - +0.35 * success_rate from 10 simulation trials (with long context)
119
+ - +0.15 for achieving 50%+ success rate in long traces
120
+ - +0.15 max for diagnosis keywords (partial credit)
121
+
122
+ - id: reasoning
123
+ name: "Reasoning-Action Alignment Fix"
124
+ difficulty: hard
125
+ category: "MAST FC2: Inter-Agent Misalignment - FATAL FAILURE"
126
+ description: |
127
+ The agent identifies the correct next step but executes a redundant or irrelevant command.
128
+ FM-2.6: Reasoning-Action Mismatch - describes correct plan but executes unrelated tool call.
129
+ Based on IBM 2026: 92% of Kimi-K2 failures and 94% of GPT-OSS-120B failures show this.
130
+ Your task: Implement action validation layer that checks execution against reasoning.
131
+ failure_mode: "FM-2.6: Reasoning-action mismatch - correct thinking, wrong execution"
132
+ grader:
133
+ type: programmatic
134
+ score_range: [0.0, 1.0]
135
+ criteria: |
136
+ FATAL FAILURE: Decoupling of reasoning and action causes cascading collapse.
137
+ IBM 2026: Small reasoning mismatches early poison entire task history.
138
+ - +0.20 for enabling action validation
139
+ - +0.20 for implementing reasoning-execution consistency check
140
+ - +0.35 * success_rate from 10 simulation trials
141
+ - +0.15 for achieving 45%+ success rate
142
+ - +0.15 max for diagnosis keywords (partial credit)
143
+
144
  difficulty_progression:
145
  - easy: "Single spec issue (vague role definition) - solution: explicit spec"
146
  - medium: "Inter-agent format mismatch (YAML vs JSON) - solution: translator"
147
  - hard: "Verification failure (premature + incorrect) - solution: multi-level checks"
148
+ - termination: "Termination awareness (loops/premature exit) - solution: state machine"
149
+ - memory: "Conversation history loss (forgets context) - solution: context management"
150
+ - reasoning: "Reasoning-action mismatch (wrong execution) - solution: validation layer"
151
 
152
  research_basis:
153
  - name: "MAST: Multi-Agent System Failure Taxonomy"
154
  source: "NeurIPS 2025 (Berkeley)"
155
  url: "https://arxiv.org/abs/2503.13657"
156
  key_finding: "Multi-agent LLM systems fail 41-86.7% of the time in production"
157
+ - name: "IBM and UC Berkeley: Enterprise Agents Fail with IT-Bench and MAST"
158
+ source: "Hugging Face Blog (Feb 2026)"
159
+ url: "https://huggingface.co/blog/ibm-research/itbenchandmast"
160
+ key_finding: "FM-3.3 (Incorrect Verification) is strongest failure predictor; fatal vs non-fatal distinction critical"
161
  - name: "Why Do Multi-Agent LLM Systems Fail?"
162
  source: "Future AGI 2026 Guide"
163
  url: "https://futureagi.substack.com/p/why-do-multi-agent-llm-systems-fail"
 
173
  - mast-research
174
  - specification
175
  - verification
176
+ - termination-awareness
177
+ - memory-management
178
+ - reasoning-alignment
179
+ - ibm-research
180
  created: 2026-04-08
181
+ version: 1.2.0
182
  author: OpenEnv Hackathon
183
  benchmark_scores:
184
  easy: "Expected 0.85+ for strong LLM with explicit spec"
185
  medium: "Expected 0.60-0.75 for strong LLM with translator"
186
+ hard: "Expected 0.35-0.50 for strong LLM with deep verification"
187
+ termination: "Expected 0.50-0.65 for LLM with state machine"
188
+ memory: "Expected 0.40-0.55 for LLM with context management"
189
+ reasoning: "Expected 0.35-0.50 for LLM with validation layer"
server/graders.py CHANGED
@@ -57,6 +57,12 @@ class Grader:
57
  "verify": 0.0,
58
  "check": 0.0,
59
  "review": 0.0,
 
 
 
 
 
 
60
  }
61
 
62
  # Specification keywords
@@ -87,6 +93,24 @@ class Grader:
87
  if "contradict" in diagnosis_lower:
88
  scores["contradiction"] = 0.10
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  return scores
91
 
92
  def grade(
@@ -406,5 +430,273 @@ def get_grader(task_id: str) -> Grader:
406
  "easy": EasyGrader(),
407
  "medium": MediumGrader(),
408
  "hard": HardGrader(),
 
 
 
409
  }
410
  return graders.get(task_id, EasyGrader())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  "verify": 0.0,
58
  "check": 0.0,
59
  "review": 0.0,
60
+ "termination": 0.0,
61
+ "loop": 0.0,
62
+ "memory": 0.0,
63
+ "context": 0.0,
64
+ "reasoning": 0.0,
65
+ "action": 0.0,
66
  }
67
 
68
  # Specification keywords
 
93
  if "contradict" in diagnosis_lower:
94
  scores["contradiction"] = 0.10
95
 
96
+ # Termination keywords (IBM 2026 - FATAL)
97
+ if "terminat" in diagnosis_lower or "loop" in diagnosis_lower:
98
+ scores["termination"] = 0.10
99
+ if "infinite" in diagnosis_lower or "repeat" in diagnosis_lower:
100
+ scores["loop"] = 0.10
101
+
102
+ # Memory/Context keywords (IBM 2026 - FATAL)
103
+ if "memory" in diagnosis_lower or "forget" in diagnosis_lower:
104
+ scores["memory"] = 0.10
105
+ if "context" in diagnosis_lower or "history" in diagnosis_lower:
106
+ scores["context"] = 0.10
107
+
108
+ # Reasoning-Action keywords (IBM 2026 - FATAL)
109
+ if "reason" in diagnosis_lower or "think" in diagnosis_lower:
110
+ scores["reasoning"] = 0.10
111
+ if "action" in diagnosis_lower or "execut" in diagnosis_lower:
112
+ scores["action"] = 0.10
113
+
114
  return scores
115
 
116
  def grade(
 
430
  "easy": EasyGrader(),
431
  "medium": MediumGrader(),
432
  "hard": HardGrader(),
433
+ "termination": TerminationGrader(),
434
+ "memory": MemoryGrader(),
435
+ "reasoning": ReasoningGrader(),
436
  }
437
  return graders.get(task_id, EasyGrader())
438
+
439
+
440
class TerminationGrader(Grader):
    """
    Grader for the termination task: FM-1.5/FM-3.1 (IBM 2026 - FATAL FAILURE).

    Scenario: the agent fails to recognize task completion, so it loops or
    exits prematurely. The expected fix is explicit termination conditions.

    Score components (final score is clamped to [0.0, 1.0]):
        +0.25  ``explicit_termination`` is truthy in the agent config
        +0.20  ``max_iterations`` is greater than zero
        +0.30 * success_rate over 10 simulated workflow runs
        +0.15  success_rate is at least 0.6
        up to +0.15 for termination/loop keywords in the diagnosis
    """

    def __init__(self):
        super().__init__("termination", "medium")

    def grade(
        self,
        agent_config: dict[str, Any],
        task_description: str,
        failure_mode: str,
        diagnosis: str,
    ) -> tuple[float, dict[str, Any]]:
        """
        Score the agent's configuration against the termination scenario.

        Args:
            agent_config: Resilience settings proposed by the agent.
            task_description: Not read by this grader.
            failure_mode: Not read by this grader.
            diagnosis: The agent's free-text diagnosis.

        Returns:
            ``(score, details)`` where ``details`` records the success rate,
            which bonuses applied, the diagnosis points, and echoes the
            config and diagnosis.
        """
        from .workflow_simulator import create_termination_task

        scenario_nodes, _, _ = create_termination_task()
        resilience = self._parse_config(agent_config, diagnosis)
        simulator = WorkflowSimulator(scenario_nodes, seed=42)

        # Ten runs on the same simulator instance, in sequence.
        outcomes = [simulator.run_workflow(resilience).success for _ in range(10)]
        success_rate = sum(outcomes) / len(outcomes)

        has_termination_detection = agent_config.get("explicit_termination", False)
        has_max_iterations = agent_config.get("max_iterations", 0) > 0

        keyword_scores = self._parse_diagnosis(diagnosis)
        diagnosis_points = min(
            0.15, keyword_scores["termination"] + keyword_scores["loop"]
        )

        # Bonuses are accumulated in this fixed order so the floating-point
        # sum is reproducible.
        bonuses = (
            (has_termination_detection, 0.25),
            (has_max_iterations, 0.20),
            (success_rate > 0, success_rate * 0.30),
            (success_rate >= 0.6, 0.15),
        )
        score = 0.0
        for earned, bonus in bonuses:
            if earned:
                score += bonus
        score += diagnosis_points
        score = min(1.0, max(0.0, score))

        details = {
            "success_rate": success_rate,
            "has_termination_detection": has_termination_detection,
            "has_max_iterations": has_max_iterations,
            "diagnosis_points": diagnosis_points,
            "config": agent_config,
            "diagnosis": diagnosis,
        }
        return score, details

    def _parse_config(
        self, agent_config: dict[str, Any], diagnosis: str
    ) -> ResilienceConfig:
        """
        Translate the agent's config dict into a simulator ResilienceConfig.

        ``max_iterations`` from the agent config is fed to ``retry_max``; the
        remaining keys are copied under their own names, each with a fallback
        default when absent.
        """
        # NOTE(review): the fallback here is 50, whereas grade() treats a
        # missing max_iterations as 0 - confirm the difference is intended.
        retry_budget = agent_config.get("max_iterations", 50)

        # Keys copied through unchanged, paired with their fallback values.
        passthrough_defaults = {
            "retry_delay_ms": 0,
            "timeout_ms": 30000,
            "fallback": "abort",
            "circuit_breaker_threshold": 1.0,
            "context_strategy": "truncate",
            "context_summarization_threshold": 500,
            "min_review_depth": 1,
            "consistency_check": False,
            "explicit_termination": False,
        }
        settings = {
            key: agent_config.get(key, default)
            for key, default in passthrough_defaults.items()
        }
        return ResilienceConfig(
            retry_max=retry_budget,
            diagnosis=diagnosis,
            **settings,
        )
525
+
526
+
527
class MemoryGrader(Grader):
    """
    Scores fixes for the memory task: FM-1.4 (IBM 2026 - FATAL FAILURE).

    Scenario: in long traces the agent loses conversation history and
    forgets the original task.
    Expected fix: context management (sliding window, summarization,
    state machine).
    """

    def __init__(self):
        super().__init__("memory", "hard")

    def grade(
        self,
        agent_config: dict[str, Any],
        task_description: str,
        failure_mode: str,
        diagnosis: str,
    ) -> tuple[float, dict[str, Any]]:
        """
        Simulate the memory workflow ten times and score the agent's fix.

        Score components, summed in this order and clamped to [0.0, 1.0]:
          * 0.20 if ``context_summarization`` is truthy in ``agent_config``
          * 0.20 if ``sliding_window`` is truthy in ``agent_config``
          * ``success_rate * 0.35`` when any run succeeded
          * 0.15 if at least half of the runs succeeded
          * up to 0.15 for the "memory" + "context" diagnosis scores

        ``task_description`` and ``failure_mode`` are accepted but not read
        by this method.

        Returns:
            ``(score, details)`` where ``details`` echoes the success rate,
            the two flags, the diagnosis points, the config and the diagnosis.
        """
        from .workflow_simulator import create_memory_task

        nodes, _, _ = create_memory_task()
        resilience = self._parse_config(agent_config, diagnosis)
        simulator = WorkflowSimulator(nodes, seed=42)

        outcomes = [simulator.run_workflow(resilience).success for _ in range(10)]
        success_rate = sum(outcomes) / len(outcomes)

        has_summarization = agent_config.get("context_summarization", False)
        has_sliding_window = agent_config.get("sliding_window", False)

        diagnosis_scores = self._parse_diagnosis(diagnosis)
        diagnosis_points = min(
            0.15, diagnosis_scores["memory"] + diagnosis_scores["context"]
        )

        # (earned?, points) pairs, applied in the original accumulation order.
        bonuses = (
            (has_summarization, 0.20),
            (has_sliding_window, 0.20),
            (success_rate > 0, success_rate * 0.35),
            (success_rate >= 0.5, 0.15),
        )
        score = 0.0
        for earned, points in bonuses:
            if earned:
                score += points
        score += diagnosis_points

        floored = max(0.0, score)
        score = min(1.0, floored)

        details = {
            "success_rate": success_rate,
            "has_summarization": has_summarization,
            "has_sliding_window": has_sliding_window,
            "diagnosis_points": diagnosis_points,
            "config": agent_config,
            "diagnosis": diagnosis,
        }
        return score, details

    def _parse_config(
        self, agent_config: dict[str, Any], diagnosis: str
    ) -> ResilienceConfig:
        """
        Build the ResilienceConfig for the memory simulation.

        Missing keys fall back to the defaults given below; note that this
        grader defaults ``context_strategy`` to "summarize" and the
        summarization threshold to 200.

        NOTE(review): verify that the ResilienceConfig in scope still
        declares ``diagnosis``; the workflow_simulator dataclass appears to
        have dropped it, in which case this call raises TypeError.
        """
        option = agent_config.get
        return ResilienceConfig(
            retry_max=option("retry_max", 0),
            retry_delay_ms=option("retry_delay_ms", 0),
            timeout_ms=option("timeout_ms", 30000),
            fallback=option("fallback", "abort"),
            circuit_breaker_threshold=option("circuit_breaker_threshold", 1.0),
            context_strategy=option("context_strategy", "summarize"),
            context_summarization_threshold=option(
                "context_summarization_threshold", 200
            ),
            min_review_depth=option("min_review_depth", 1),
            consistency_check=option("consistency_check", False),
            context_summarization=option("context_summarization", False),
            sliding_window=option("sliding_window", False),
            diagnosis=diagnosis,
        )
613
+
614
+
615
class ReasoningGrader(Grader):
    """
    Scores fixes for reasoning-action alignment: FM-2.6 (IBM 2026 - FATAL FAILURE).

    Scenario: the agent describes the correct plan but executes an
    unrelated or redundant command.
    Expected fix: an action validation layer that checks execution against
    reasoning.
    """

    def __init__(self):
        super().__init__("reasoning", "hard")

    def grade(
        self,
        agent_config: dict[str, Any],
        task_description: str,
        failure_mode: str,
        diagnosis: str,
    ) -> tuple[float, dict[str, Any]]:
        """
        Simulate the reasoning workflow ten times and score the agent's fix.

        Score components, summed in this order and clamped to [0.0, 1.0]:
          * 0.20 if ``action_validation`` is truthy in ``agent_config``
          * 0.20 if ``reasoning_consistency_check`` is truthy
          * ``success_rate * 0.35`` when any run succeeded
          * 0.15 if the success rate reaches 0.45
          * up to 0.15 for the "reasoning" + "action" diagnosis scores

        ``task_description`` and ``failure_mode`` are accepted but not read
        by this method.

        Returns:
            ``(score, details)`` where ``details`` echoes the success rate,
            the two flags, the diagnosis points, the config and the diagnosis.
        """
        from .workflow_simulator import create_reasoning_task

        nodes, _, _ = create_reasoning_task()
        resilience = self._parse_config(agent_config, diagnosis)
        simulator = WorkflowSimulator(nodes, seed=42)

        outcomes = [simulator.run_workflow(resilience).success for _ in range(10)]
        success_rate = sum(outcomes) / len(outcomes)

        has_action_validation = agent_config.get("action_validation", False)
        has_consistency_check = agent_config.get("reasoning_consistency_check", False)

        diagnosis_scores = self._parse_diagnosis(diagnosis)
        diagnosis_points = min(
            0.15, diagnosis_scores["reasoning"] + diagnosis_scores["action"]
        )

        # (earned?, points) pairs, applied in the original accumulation order.
        bonuses = (
            (has_action_validation, 0.20),
            (has_consistency_check, 0.20),
            (success_rate > 0, success_rate * 0.35),
            (success_rate >= 0.45, 0.15),
        )
        score = 0.0
        for earned, points in bonuses:
            if earned:
                score += points
        score += diagnosis_points

        floored = max(0.0, score)
        score = min(1.0, floored)

        details = {
            "success_rate": success_rate,
            "has_action_validation": has_action_validation,
            "has_consistency_check": has_consistency_check,
            "diagnosis_points": diagnosis_points,
            "config": agent_config,
            "diagnosis": diagnosis,
        }
        return score, details

    def _parse_config(
        self, agent_config: dict[str, Any], diagnosis: str
    ) -> ResilienceConfig:
        """
        Build the ResilienceConfig for the reasoning simulation.

        Missing keys fall back to the defaults given below.

        NOTE(review): verify that the ResilienceConfig in scope still
        declares ``diagnosis``; the workflow_simulator dataclass appears to
        have dropped it, in which case this call raises TypeError.
        """
        option = agent_config.get
        return ResilienceConfig(
            retry_max=option("retry_max", 0),
            retry_delay_ms=option("retry_delay_ms", 0),
            timeout_ms=option("timeout_ms", 30000),
            fallback=option("fallback", "abort"),
            circuit_breaker_threshold=option("circuit_breaker_threshold", 1.0),
            context_strategy=option("context_strategy", "truncate"),
            context_summarization_threshold=option(
                "context_summarization_threshold", 500
            ),
            min_review_depth=option("min_review_depth", 1),
            consistency_check=option("consistency_check", False),
            action_validation=option("action_validation", False),
            reasoning_consistency_check=option(
                "reasoning_consistency_check", False
            ),
            diagnosis=diagnosis,
        )
server/stress_test_environment.py CHANGED
@@ -23,6 +23,9 @@ try:
23
  create_easy_task,
24
  create_hard_task,
25
  create_medium_task,
 
 
 
26
  )
27
  except ImportError:
28
  from openenv.core.env_server.interfaces import (
@@ -48,23 +51,44 @@ TASK_DEFINITIONS = {
48
  "easy": {
49
  "id": "easy",
50
  "difficulty": "easy",
51
- "category": "MAST: Specification & System Design (41.8% of failures)",
52
  "description": "The researcher agent has a vague role definition ('You are a helpful assistant'). This causes task misinterpretation. Your task: Provide an explicit role specification JSON with clear capabilities, constraints, and success criteria.",
53
- "failure_mode": "Specification ambiguity - vague role definition causes task misinterpretation",
54
  },
55
  "medium": {
56
  "id": "medium",
57
  "difficulty": "medium",
58
- "category": "MAST: Inter-Agent Misalignment (36.9% of failures)",
59
  "description": "Multi-agent workflow where the planner outputs YAML but the executor expects JSON. This format mismatch causes the executor to fail. Your task: Add a format translation layer/middleware.",
60
- "failure_mode": "Format mismatch - planner outputs YAML, executor expects JSON",
61
  },
62
  "hard": {
63
  "id": "hard",
64
  "difficulty": "hard",
65
- "category": "MAST: Task Verification (21.3% of failures)",
66
- "description": "Multi-agent pipeline with verification failure. Writer produces contradictions (30%), reviewer prematurely approves (60%) without checks. Your task: Implement multi-level verification.",
67
- "failure_mode": "Verification failure - premature termination + incorrect verification",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  },
69
  }
70
 
@@ -116,6 +140,14 @@ class StressTestEnvironment(
116
  step_count=0,
117
  )
118
  self._current_task_index = 0
 
 
 
 
 
 
 
 
119
 
120
  task = TASK_DEFINITIONS["easy"]
121
 
@@ -207,33 +239,31 @@ class StressTestEnvironment(
207
  self._state.total_score = combined_score
208
  self._state.step_count += 1
209
 
210
- # Return combined result for the easy task (as reference)
211
  task_id = "all_tasks"
212
  task = {
213
- "description": "All 3 tasks (Easy: Spec, Medium: Format, Hard: Verification)",
214
- "failure_mode": "Combined MAST failure modes",
215
  "category": "MAST: All categories",
216
  }
217
 
218
  obs = StressTestObservation(
219
  task_id="all_tasks",
220
- task_description=f"Easy: {all_scores[0]:.2f}, Medium: {all_scores[1]:.2f}, Hard: {all_scores[2]:.2f} | Combined: {combined_score:.2f}",
221
- scenario_setup="All 3 MAST failure categories evaluated",
222
- failure_category="MAST: Spec (41.8%) + Inter-Agent (36.9%) + Verification (21.3%)",
223
  failure_mode_detected=True,
224
- failure_mode_description="Specification, Format Mismatch, and Verification failures",
225
  resilience_applied=True,
226
  applied_config=json.dumps(agent_config),
227
  test_passed=combined_score >= 0.5,
228
- test_completions=int(
229
- all_scores[0] * 10
230
- ), # Report easy task completions
231
- test_total_trials=30, # Total across all tasks
232
  test_latency_ms=0,
233
  diagnosis=f"Task scores: {all_scores}",
234
  diagnosis_points=0.0,
235
  reward=combined_score,
236
- done=True, # All tasks done in one step
237
  )
238
 
239
  return obs
@@ -302,6 +332,24 @@ class StressTestEnvironment(
302
  # Hard: Verification fix
303
  config["consistency_check"] = agent_config.get("consistency_check", False)
304
  config["min_review_depth"] = agent_config.get("min_review_depth", 1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
 
306
  return config
307
 
 
23
  create_easy_task,
24
  create_hard_task,
25
  create_medium_task,
26
+ create_termination_task,
27
+ create_memory_task,
28
+ create_reasoning_task,
29
  )
30
  except ImportError:
31
  from openenv.core.env_server.interfaces import (
 
51
  "easy": {
52
  "id": "easy",
53
  "difficulty": "easy",
54
+ "category": "MAST FC1: System Design (41.8% of failures)",
55
  "description": "The researcher agent has a vague role definition ('You are a helpful assistant'). This causes task misinterpretation. Your task: Provide an explicit role specification JSON with clear capabilities, constraints, and success criteria.",
56
+ "failure_mode": "FM-1.1: Specification ambiguity - vague role definition causes task misinterpretation",
57
  },
58
  "medium": {
59
  "id": "medium",
60
  "difficulty": "medium",
61
+ "category": "MAST FC2: Inter-Agent Misalignment (36.9% of failures)",
62
  "description": "Multi-agent workflow where the planner outputs YAML but the executor expects JSON. This format mismatch causes the executor to fail. Your task: Add a format translation layer/middleware.",
63
+ "failure_mode": "FM-2.x: Format mismatch - planner outputs YAML, executor expects JSON",
64
  },
65
  "hard": {
66
  "id": "hard",
67
  "difficulty": "hard",
68
+ "category": "MAST FC3: Task Verification (21.3% of failures)",
69
+ "description": "Multi-agent pipeline with verification failure. Writer produces contradictions (30%), reviewer prematurely approves (60%) without checks. Your task: Implement multi-level verification. IBM 2026: FM-3.3 is strongest failure predictor.",
70
+ "failure_mode": "FM-3.1/FM-3.3: Verification failure - premature termination + incorrect verification",
71
+ },
72
+ "termination": {
73
+ "id": "termination",
74
+ "difficulty": "medium",
75
+ "category": "MAST FC1: System Design - FATAL FAILURE",
76
+ "description": "The agent struggles to recognize when a task is complete. It loops indefinitely or prematurely exits. Based on IBM 2026: Kimi-K2 shows +46% spike in termination issues. Your task: Implement explicit termination conditions with success criteria.",
77
+ "failure_mode": "FM-1.5/FM-3.1: Unaware of termination + premature termination",
78
+ },
79
+ "memory": {
80
+ "id": "memory",
81
+ "difficulty": "hard",
82
+ "category": "MAST FC1: System Design - FATAL FAILURE",
83
+ "description": "As conversation history grows, the agent loses context and derails. Based on IBM 2026: GPT-OSS-120B shows 24% memory loss in long traces. Your task: Implement context management - sliding window, summarization, or state machine.",
84
+ "failure_mode": "FM-1.4: Loss of conversation history - agent forgets original task",
85
+ },
86
+ "reasoning": {
87
+ "id": "reasoning",
88
+ "difficulty": "hard",
89
+ "category": "MAST FC2: Inter-Agent Misalignment - FATAL FAILURE",
90
+ "description": "The agent describes correct plan but executes unrelated command. Based on IBM 2026: 92% of Kimi-K2 failures and 94% of GPT-OSS-120B failures show this. Your task: Implement action validation layer checking execution against reasoning.",
91
+ "failure_mode": "FM-2.6: Reasoning-action mismatch - correct thinking, wrong execution",
92
  },
93
  }
94
 
 
140
  step_count=0,
141
  )
142
  self._current_task_index = 0
143
+ self._task_ids = [
144
+ "easy",
145
+ "medium",
146
+ "hard",
147
+ "termination",
148
+ "memory",
149
+ "reasoning",
150
+ ]
151
 
152
  task = TASK_DEFINITIONS["easy"]
153
 
 
239
  self._state.total_score = combined_score
240
  self._state.step_count += 1
241
 
242
+ # Return combined result for all tasks
243
  task_id = "all_tasks"
244
  task = {
245
+ "description": "All 6 tasks (Easy/Medium/Hard + Termination/Memory/Reasoning)",
246
+ "failure_mode": "Combined MAST failure modes including IBM 2026 FATAL failures",
247
  "category": "MAST: All categories",
248
  }
249
 
250
  obs = StressTestObservation(
251
  task_id="all_tasks",
252
+ task_description=f"Easy: {all_scores[0]:.2f}, Medium: {all_scores[1]:.2f}, Hard: {all_scores[2]:.2f}, Term: {all_scores[3]:.2f}, Mem: {all_scores[4]:.2f}, Reas: {all_scores[5]:.2f} | Combined: {combined_score:.2f}",
253
+ scenario_setup="All 6 MAST failure categories evaluated including IBM 2026 fatal failures",
254
+ failure_category="MAST: Spec (41.8%) + Inter-Agent (36.9%) + Verification (21.3%) + IBM FATAL (termination, memory, reasoning)",
255
  failure_mode_detected=True,
256
+ failure_mode_description="Specification, Format Mismatch, Verification, Termination, Memory, and Reasoning-Action failures",
257
  resilience_applied=True,
258
  applied_config=json.dumps(agent_config),
259
  test_passed=combined_score >= 0.5,
260
+ test_completions=int(all_scores[0] * 10),
261
+ test_total_trials=60, # Total across all 6 tasks
 
 
262
  test_latency_ms=0,
263
  diagnosis=f"Task scores: {all_scores}",
264
  diagnosis_points=0.0,
265
  reward=combined_score,
266
+ done=True,
267
  )
268
 
269
  return obs
 
332
  # Hard: Verification fix
333
  config["consistency_check"] = agent_config.get("consistency_check", False)
334
  config["min_review_depth"] = agent_config.get("min_review_depth", 1)
335
+ elif task_id == "termination":
336
+ # Termination: FM-1.5/FM-3.1 (IBM 2026 - FATAL)
337
+ config["explicit_termination"] = agent_config.get(
338
+ "explicit_termination", False
339
+ )
340
+ config["max_iterations"] = agent_config.get("max_iterations", 0)
341
+ elif task_id == "memory":
342
+ # Memory: FM-1.4 (IBM 2026 - FATAL)
343
+ config["context_summarization"] = agent_config.get(
344
+ "context_summarization", False
345
+ )
346
+ config["sliding_window"] = agent_config.get("sliding_window", False)
347
+ elif task_id == "reasoning":
348
+ # Reasoning: FM-2.6 (IBM 2026 - FATAL)
349
+ config["action_validation"] = agent_config.get("action_validation", False)
350
+ config["reasoning_consistency_check"] = agent_config.get(
351
+ "reasoning_consistency_check", False
352
+ )
353
 
354
  return config
355
 
server/workflow_simulator.py CHANGED
@@ -53,10 +53,17 @@ class ResilienceConfig:
53
  context_summarization_threshold: int = 500
54
  min_review_depth: int = 1
55
  consistency_check: bool = False
56
- diagnosis: str = "" # Agent's diagnosis of the failure
57
- spec_fix: str = "" # Agent's spec improvement (for spec failures)
58
- explicit_role_spec: bool = False # Flag: provided explicit spec
59
- format_translator: bool = False # For format mismatch failures
 
 
 
 
 
 
 
60
 
61
 
62
  @dataclass
@@ -430,3 +437,127 @@ def create_hard_task() -> tuple[list[NodeConfig], str, str]:
430
  "Verification failure - premature termination + incorrect verification"
431
  )
432
  return nodes, description, failure_mode
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  context_summarization_threshold: int = 500
54
  min_review_depth: int = 1
55
  consistency_check: bool = False
56
+ # IBM 2026: FC1 - Termination Awareness (FATAL)
57
+ explicit_termination: bool = False
58
+ max_iterations: int = 0
59
+
60
+ # IBM 2026: FC1 - Memory/Context Management (FATAL)
61
+ context_summarization: bool = False
62
+ sliding_window: bool = False
63
+
64
+ # IBM 2026: FC2 - Reasoning-Action Alignment (FATAL)
65
+ action_validation: bool = False
66
+ reasoning_consistency_check: bool = False
67
 
68
 
69
  @dataclass
 
437
  "Verification failure - premature termination + incorrect verification"
438
  )
439
  return nodes, description, failure_mode
440
+
441
+
442
def create_termination_task() -> tuple[list[NodeConfig], str, str]:
    """
    Build the termination scenario: FM-1.5/FM-3.1 (IBM 2026 - FATAL FAILURE).

    Research cited: Kimi-K2 shows +46% spike in termination issues.
    Scenario: the agent cannot tell when the task is complete, so it loops
    or exits prematurely.
    Expected fix: explicit termination conditions with success criteria.

    Returns:
        ``(nodes, description, failure_mode)`` -- three nodes in list order
        (researcher, worker1, worker2), the task text shown to the agent,
        and the failure-mode label.
    """
    researcher = NodeConfig(
        node_id="researcher",
        role="researcher",
        role_definition="Research and produce a detailed report",
        latency_ms=100,
    )
    # Both workers are configured with a 0.2 fail_rate (occasional failures).
    worker1 = NodeConfig(
        node_id="worker1",
        role="worker",
        role_definition="Process research findings",
        fail_rate=0.2,
        latency_ms=100,
    )
    worker2 = NodeConfig(
        node_id="worker2",
        role="worker",
        role_definition="Process worker1 output",
        fail_rate=0.2,
        latency_ms=100,
    )

    description = (
        "The agent struggles to recognize when a task is complete. It either: "
        "- Loops indefinitely (FM-1.3 Step Repetition) "
        "- Prematurely exits without confirming success (FM-3.1) "
        "- Is unaware of termination conditions (FM-1.5) "
        "Based on IBM 2026: Kimi-K2 shows +46% spike in termination issues. "
        "Your task: Implement explicit termination conditions with success criteria verification."
    )
    failure_mode = "FM-1.5/FM-3.1: Unaware of termination + premature termination"

    return [researcher, worker1, worker2], description, failure_mode
482
+
483
+
484
def create_memory_task() -> tuple[list[NodeConfig], str, str]:
    """
    Build the memory scenario: FM-1.4 (IBM 2026 - FATAL FAILURE).

    Research cited: GPT-OSS-120B shows 24% memory loss in long traces.
    Scenario: as conversation history grows, the agent loses context and
    derails.
    Expected fix: context management (sliding window, summarization,
    state machine).

    Returns:
        ``(nodes, description, failure_mode)`` -- three analyzer nodes in
        list order, the task text shown to the agent, and the failure-mode
        label.
    """
    # The analyzers differ only in id and role definition.
    stages = (
        ("analyzer1", "Analyze data and produce findings"),
        ("analyzer2", "Analyze analyzer1 output with original context"),
        ("analyzer3", "Synthesize all previous findings"),
    )
    nodes = [
        NodeConfig(
            node_id=node_id,
            role="analyzer",
            role_definition=role_definition,
            context_limit=200,  # small context to trigger memory issues
            latency_ms=100,
        )
        for node_id, role_definition in stages
    ]

    description = (
        "As conversation history grows, the agent loses context and derails. "
        "This is FM-1.4 (Loss of Conversation History) - unique fatal flaw. "
        "Based on IBM 2026: GPT-OSS-120B shows 24% memory loss in long traces. "
        "Your task: Implement context management - sliding window, summarization, or state machine."
    )
    failure_mode = "FM-1.4: Loss of conversation history - agent forgets original task"

    return nodes, description, failure_mode
523
+
524
+
525
def create_reasoning_task() -> tuple[list[NodeConfig], str, str]:
    """
    Build the reasoning-action scenario: FM-2.6 (IBM 2026 - FATAL FAILURE).

    Research cited: 92% of Kimi-K2 failures and 94% of GPT-OSS-120B
    failures show this.
    Scenario: the agent identifies the correct next step but executes a
    redundant or irrelevant command.
    Expected fix: an action validation layer checking execution against
    reasoning.

    Returns:
        ``(nodes, description, failure_mode)`` -- three nodes in list order
        (planner, executor, verifier), the task text shown to the agent,
        and the failure-mode label.
    """
    planner = NodeConfig(
        node_id="planner",
        role="planner",
        role_definition="Plan the next action based on current state",
        latency_ms=100,
    )
    executor = NodeConfig(
        node_id="executor",
        role="executor",
        role_definition="Execute the planned action",
        output_corruption_rate=0.4,  # 40% chance of executing wrong action
        latency_ms=100,
    )
    verifier = NodeConfig(
        node_id="verifier",
        role="verifier",
        role_definition="Verify execution matches plan",
        latency_ms=100,
    )

    description = (
        "The agent identifies the correct next step but executes a redundant or irrelevant command. "
        "FM-2.6: Reasoning-Action Mismatch - describes correct plan but executes unrelated tool call. "
        "Based on IBM 2026: 92% of Kimi-K2 failures and 94% of GPT-OSS-120B failures show this. "
        "Your task: Implement action validation layer that checks execution against reasoning."
    )
    failure_mode = "FM-2.6: Reasoning-action mismatch - correct thinking, wrong execution"

    return [planner, executor, verifier], description, failure_mode