{ "benchmarks": { "gsm8k": { "name": "Grade School Math 8K", "dataset": "openai/gsm8k", "lower_is_better": false, "models": [ { "model_id": "XiaomiMiMo/MiMo-V2.5-Pro", "short_name": "MiMo-V2.5-Pro", "provider": "XiaomiMiMo", "score": 99.6, "date": "2026-04-27" }, { "model_id": "meta-llama/Llama-3.1-405B", "short_name": "Llama-3.1-405B", "provider": "meta-llama", "score": 96.8, "date": "2024-07-16" }, { "model_id": "ibm-granite/granite-4.1-30b", "short_name": "granite-4.1-30b", "provider": "ibm-granite", "score": 94.16, "date": "2026-04-06" }, { "model_id": "deepseek-ai/DeepSeek-V4-Pro", "short_name": "DeepSeek-V4-Pro", "provider": "deepseek-ai", "score": 92.6, "date": "2026-04-22" }, { "model_id": "ibm-granite/granite-4.1-8b", "short_name": "granite-4.1-8b", "provider": "ibm-granite", "score": 92.49, "date": "2026-04-06" }, { "model_id": "microsoft/Phi-3-medium-4k-instruct", "short_name": "Phi-3-medium-4k-instruct", "provider": "microsoft", "score": 91.0, "date": "2024-05-07" }, { "model_id": "prism-ml/Ternary-Bonsai-8B-mlx-2bit", "short_name": "Ternary-Bonsai-8B-mlx-2bit", "provider": "prism-ml", "score": 91.0, "date": "2026-04-13" }, { "model_id": "prism-ml/Ternary-Bonsai-4B-mlx-2bit", "short_name": "Ternary-Bonsai-4B-mlx-2bit", "provider": "prism-ml", "score": 90.5, "date": "2026-04-13" }, { "model_id": "Qwen/Qwen2-72B", "short_name": "Qwen2-72B", "provider": "Qwen", "score": 89.5, "date": "2024-05-22" }, { "model_id": "deepseek-ai/DeepSeek-V3", "short_name": "DeepSeek-V3", "provider": "deepseek-ai", "score": 89.3, "date": "2024-12-25" }, { "model_id": "prism-ml/Bonsai-8B-mlx-1bit", "short_name": "Bonsai-8B-mlx-1bit", "provider": "prism-ml", "score": 88.0, "date": "2026-03-18" }, { "model_id": "ibm-granite/granite-4.1-3b", "short_name": "granite-4.1-3b", "provider": "ibm-granite", "score": 86.88, "date": "2026-04-06" }, { "model_id": "microsoft/Phi-3.5-mini-instruct", "short_name": "Phi-3.5-mini-instruct", "provider": "microsoft", "score": 86.2, "date": "2024-08-16" }, { "model_id": "internlm/internlm2_5-7b-chat", "short_name": "internlm2_5-7b-chat", "provider": "internlm", "score": 86.0, "date": "2024-06-27" }, { "model_id": "microsoft/Phi-3-mini-4k-instruct", "short_name": "Phi-3-mini-4k-instruct", "provider": "microsoft", "score": 85.7, "date": "2024-04-22" }, { "model_id": "meta-llama/Llama-3.1-8B-Instruct", "short_name": "Llama-3.1-8B-Instruct", "provider": "meta-llama", "score": 84.5, "date": "2024-07-18" }, { "model_id": "Qwen/Qwen2-7B", "short_name": "Qwen2-7B", "provider": "Qwen", "score": 79.9, "date": "2024-06-04" }, { "model_id": "internlm/internlm2-chat-20b", "short_name": "internlm2-chat-20b", "provider": "internlm", "score": 79.6, "date": "2024-01-10" }, { "model_id": "deepseek-ai/DeepSeek-V2", "short_name": "DeepSeek-V2", "provider": "deepseek-ai", "score": 79.2, "date": "2024-04-22" }, { "model_id": "prism-ml/Ternary-Bonsai-1.7B-mlx-2bit", "short_name": "Ternary-Bonsai-1.7B-mlx-2bit", "provider": "prism-ml", "score": 74.2, "date": "2026-04-14" }, { "model_id": "Xerv-AI/MAXWELL", "short_name": "MAXWELL", "provider": "Xerv-AI", "score": 70.0, "date": "2026-05-04" } ] }, "ParseBench": { "name": "ParseBench", "dataset": "llamaindex/ParseBench", "lower_is_better": false, "models": [ { "model_id": "infly/Infinity-Parser2-Pro", "short_name": "Infinity-Parser2-Pro", "provider": "infly", "score": 74.3, "date": "2026-04-08" }, { "model_id": "infly/Infinity-Parser2-Flash", "short_name": "Infinity-Parser2-Flash", "provider": "infly", "score": 73.25, "date": "2026-02-27" }, { "model_id": "datalab-to/chandra-ocr-2", "short_name": "chandra-ocr-2", "provider": "datalab-to", "score": 70.1, "date": "2026-03-16" }, { "model_id": "PaddlePaddle/PaddleOCR-VL-1.5", "short_name": "PaddleOCR-VL-1.5", "provider": "PaddlePaddle", "score": 65.95, "date": "2026-01-28" }, { "model_id": "google/gemma-4-31B-it", "short_name": "gemma-4-31B-it", "provider": "google", "score": 62.4, "date": "2026-03-11" }, { "model_id": "google/gemma-4-26B-A4B-it", "short_name": "gemma-4-26B-A4B-it", "provider": "google", "score": 58.5, "date": "2026-03-11" }, { "model_id": "rednote-hilab/dots.mocr", "short_name": "dots.mocr", "provider": "rednote-hilab", "score": 55.8, "date": "2026-03-19" }, { "model_id": "tiiuae/Falcon-OCR", "short_name": "Falcon-OCR", "provider": "tiiuae", "score": 53.08, "date": "2026-02-22" }, { "model_id": "docling-project/docling-models", "short_name": "docling-models", "provider": "docling-project", "score": 50.6, "date": "2024-07-02" }, { "model_id": "lightonai/LightOnOCR-2-1B", "short_name": "LightOnOCR-2-1B", "provider": "lightonai", "score": 48.0, "date": "2026-01-16" }, { "model_id": "Qwen/Qwen3-VL-8B-Instruct", "short_name": "Qwen3-VL-8B-Instruct", "provider": "Qwen", "score": 46.8, "date": "2025-10-11" }, { "model_id": "baidu/Qianfan-OCR", "short_name": "Qianfan-OCR", "provider": "baidu", "score": 46.2, "date": "2026-03-18" }, { "model_id": "opendatalab/MinerU2.5-2509-1.2B", "short_name": "MinerU2.5-2509-1.2B", "provider": "opendatalab", "score": 45.9, "date": "2025-09-17" }, { "model_id": "Qwen/Qwen3.6-35B-A3B", "short_name": "Qwen3.6-35B-A3B", "provider": "Qwen", "score": 44.1, "date": "2026-04-15" }, { "model_id": "deepseek-ai/DeepSeek-OCR-2", "short_name": "DeepSeek-OCR-2", "provider": "deepseek-ai", "score": 41.2, "date": "2026-01-27" }, { "model_id": "PaddlePaddle/PaddleOCR-VL", "short_name": "PaddleOCR-VL", "provider": "PaddlePaddle", "score": 40.9, "date": "2025-10-16" }, { "model_id": "google/gemma-4-E4B-it", "short_name": "gemma-4-E4B-it", "provider": "google", "score": 40.5, "date": "2026-03-02" }, { "model_id": "ibm-granite/granite-vision-4.1-4b", "short_name": "granite-vision-4.1-4b", "provider": "ibm-granite", "score": 39.45, "date": "2026-04-16" }, { "model_id": "Qwen/Qwen3.5-4B", "short_name": "Qwen3.5-4B", "provider": "Qwen", "score": 35.4, "date": "2026-02-27" }, { "model_id": "Qwen/Qwen3.5-9B", "short_name": "Qwen3.5-9B", "provider": "Qwen", "score": 31.9, "date": "2026-02-27" }, { "model_id": "zai-org/GLM-OCR", "short_name": "GLM-OCR", "provider": "zai-org", "score": 29.6, "date": "2026-01-30" }, { "model_id": "Qwen/Qwen3.5-0.8B", "short_name": "Qwen3.5-0.8B", "provider": "Qwen", "score": 28.4, "date": "2026-02-28" }, { "model_id": "Qwen/Qwen3.5-2B", "short_name": "Qwen3.5-2B", "provider": "Qwen", "score": 27.3, "date": "2026-02-28" } ] }, "hle": { "name": "HLE", "dataset": "cais/hle", "lower_is_better": false, "models": [ { "model_id": "moonshotai/Kimi-K2.6", "short_name": "Kimi-K2.6", "provider": "moonshotai", "score": 54.0, "date": "2026-04-14" }, { "model_id": "zai-org/GLM-5.1", "short_name": "GLM-5.1", "provider": "zai-org", "score": 52.3, "date": "2026-04-03" }, { "model_id": "zai-org/GLM-5", "short_name": "GLM-5", "provider": "zai-org", "score": 50.4, "date": "2026-02-11" }, { "model_id": "moonshotai/Kimi-K2.5", "short_name": "Kimi-K2.5", "provider": "moonshotai", "score": 50.2, "date": "2026-01-01" }, { "model_id": "Qwen/Qwen3.5-27B", "short_name": "Qwen3.5-27B", "provider": "Qwen", "score": 48.5, "date": "2026-02-24" }, { "model_id": "Qwen/Qwen3.5-397B-A17B", "short_name": "Qwen3.5-397B-A17B", "provider": "Qwen", "score": 48.3, "date": "2026-02-16" }, { "model_id": "XiaomiMiMo/MiMo-V2.5-Pro", "short_name": "MiMo-V2.5-Pro", "provider": "XiaomiMiMo", "score": 48.0, "date": "2026-04-27" }, { "model_id": "Qwen/Qwen3.5-122B-A10B", "short_name": "Qwen3.5-122B-A10B", "provider": "Qwen", "score": 47.5, "date": "2026-02-24" }, { "model_id": "moonshotai/Kimi-K2-Thinking", "short_name": "Kimi-K2-Thinking", "provider": "moonshotai", "score": 44.9, "date": "2025-11-04" }, { "model_id": "zai-org/GLM-4.7", "short_name": "GLM-4.7", "provider": "zai-org", "score": 42.8, "date": "2025-12-22" }, { "model_id": "deepseek-ai/DeepSeek-V3.2", "short_name": "DeepSeek-V3.2", "provider": "deepseek-ai", "score": 40.8, "date": "2025-12-01" }, { "model_id": "miromind-ai/MiroThinker-v1.5-235B", "short_name": "MiroThinker-v1.5-235B", "provider": "miromind-ai", "score": 39.2, "date": "2026-01-04" }, { "model_id": "deepseek-ai/DeepSeek-V4-Pro", "short_name": "DeepSeek-V4-Pro", "provider": "deepseek-ai", "score": 37.7, "date": "2026-04-22" }, { "model_id": "nvidia/Nemotron-Orchestrator-8B", "short_name": "Nemotron-Orchestrator-8B", "provider": "nvidia", "score": 37.1, "date": "2025-11-25" }, { "model_id": "deepseek-ai/DeepSeek-V4-Flash", "short_name": "DeepSeek-V4-Flash", "provider": "deepseek-ai", "score": 34.8, "date": "2026-04-22" }, { "model_id": "PolarSeeker/OpenSeeker-v2-30B-SFT", "short_name": "OpenSeeker-v2-30B-SFT", "provider": "PolarSeeker", "score": 34.6, "date": "2026-05-05" }, { "model_id": "miromind-ai/MiroThinker-v1.5-30B", "short_name": "MiroThinker-v1.5-30B", "provider": "miromind-ai", "score": 31.0, "date": "2026-01-04" }, { "model_id": "tencent/Hy3-preview", "short_name": "Hy3-preview", "provider": "tencent", "score": 30.0, "date": "2026-04-13" }, { "model_id": "google/gemma-4-31B-it", "short_name": "gemma-4-31B-it", "provider": "google", "score": 26.5, "date": "2026-03-11" }, { "model_id": "Qwen/Qwen3.6-27B", "short_name": "Qwen3.6-27B", "provider": "Qwen", "score": 24.0, "date": "2026-04-21" }, { "model_id": "stepfun-ai/Step-3.5-Flash", "short_name": "Step-3.5-Flash", "provider": "stepfun-ai", "score": 23.1, "date": "2026-02-01" }, { "model_id": "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "short_name": "NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "provider": "nvidia", "score": 22.82, "date": "2026-03-10" }, { "model_id": "RedHatAI/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "short_name": "NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "provider": "RedHatAI", "score": 22.82, "date": "2026-03-26" }, { "model_id": "Qwen/Qwen3.5-35B-A3B", "short_name": "Qwen3.5-35B-A3B", "provider": "Qwen", "score": 22.4, "date": "2026-02-24" }, { "model_id": "Nanbeige/Nanbeige4.1-3B", "short_name": "Nanbeige4.1-3B", "provider": "Nanbeige", "score": 22.29, "date": "2026-02-10" }, { "model_id": "MiniMaxAI/MiniMax-M2.1", "short_name": "MiniMax-M2.1", "provider": "MiniMaxAI", "score": 22.2, "date": "2025-12-20" }, { "model_id": "XiaomiMiMo/MiMo-V2-Flash", "short_name": "MiMo-V2-Flash", "provider": "XiaomiMiMo", "score": 22.1, "date": "2025-12-16" }, { "model_id": "internlm/Intern-S2-Preview", "short_name": "Intern-S2-Preview", "provider": "internlm", "score": 21.94, "date": "2026-05-15" }, { "model_id": "Qwen/Qwen3.6-35B-A3B", "short_name": "Qwen3.6-35B-A3B", "provider": "Qwen", "score": 21.4, "date": "2026-04-15" }, { "model_id": "MiniMaxAI/MiniMax-M2.5", "short_name": "MiniMax-M2.5", "provider": "MiniMaxAI", "score": 19.4, "date": "2026-02-12" }, { "model_id": "openai/gpt-oss-120b", "short_name": "gpt-oss-120b", "provider": "openai", "score": 19.0, "date": "2025-08-04" }, { "model_id": "openai/gpt-oss-20b", "short_name": "gpt-oss-20b", "provider": "openai", "score": 17.3, "date": "2025-08-04" }, { "model_id": "google/gemma-4-26B-A4B-it", "short_name": "gemma-4-26B-A4B-it", "provider": "google", "score": 17.2, "date": "2026-03-11" }, { "model_id": "zai-org/GLM-4.7-Flash", "short_name": "GLM-4.7-Flash", "provider": "zai-org", "score": 14.4, "date": "2026-01-19" }, { "model_id": "LGAI-EXAONE/K-EXAONE-236B-A23B", "short_name": "K-EXAONE-236B-A23B", "provider": "LGAI-EXAONE", "score": 13.6, "date": "2025-12-26" }, { "model_id": "MiniMaxAI/MiniMax-M2", "short_name": "MiniMax-M2", "provider": "MiniMaxAI", "score": 12.5, "date": "2025-10-22" }, { "model_id": "HelpingAI/Dhanishtha-2.0-0126", "short_name": "Dhanishtha-2.0-0126", "provider": "HelpingAI", "score": 9.92, "date": "2026-01-01" } ] }, "results": { "name": "Open Agent Leaderboard Results", "dataset": "open-agent-leaderboard/results", "lower_is_better": false, "models": [ { "model_id": "open-agent-leaderboard/openai-solo", "short_name": "openai-solo", "provider": "open-agent-leaderboard", "score": 0.73, "date": "2026-05-18" }, { "model_id": "open-agent-leaderboard/claude-code", "short_name": "claude-code", "provider": "open-agent-leaderboard", "score": 0.67, "date": "2026-05-18" }, { "model_id": "open-agent-leaderboard/smolagent", "short_name": "smolagent", "provider": "open-agent-leaderboard", "score": 0.66, "date": "2026-05-18" }, { "model_id": "open-agent-leaderboard/react-shortlisting", "short_name": "react-shortlisting", "provider": "open-agent-leaderboard", "score": 0.62, "date": "2026-05-18" }, { "model_id": "open-agent-leaderboard/react", "short_name": "react", "provider": "open-agent-leaderboard", "score": 0.61, "date": "2026-05-18" }, { "model_id": "deepseek-ai/DeepSeek-V3.2", "short_name": "DeepSeek-V3.2", "provider": "deepseek-ai", "score": 0.46, "date": "2025-12-01" }, { "model_id": "moonshotai/Kimi-K2.5", "short_name": "Kimi-K2.5", "provider": "moonshotai", "score": 0.43, "date": "2026-01-01" } ] }, "gpqa": { "name": "GPQA Diamond", "dataset": "Idavidrein/gpqa", "lower_is_better": false, "models": [ { "model_id": "moonshotai/Kimi-K2.6", "short_name": "Kimi-K2.6", "provider": "moonshotai", "score": 90.5, "date": "2026-04-14" }, { "model_id": "deepseek-ai/DeepSeek-V4-Pro", "short_name": "DeepSeek-V4-Pro", "provider": "deepseek-ai", "score": 90.1, "date": "2026-04-22" }, { "model_id": "FINAL-Bench/Darwin-28B-REASON", "short_name": "Darwin-28B-REASON", "provider": "FINAL-Bench", "score": 89.39, "date": "2026-05-17" }, { "model_id": "OrionLLM/GRM-2.6-Opus", "short_name": "GRM-2.6-Opus", "provider": "OrionLLM", "score": 89.2, "date": "2026-05-07" }, { "model_id": "FINAL-Bench/Darwin-28B-Opus", "short_name": "Darwin-28B-Opus", "provider": "FINAL-Bench", "score": 88.89, "date": "2026-04-24" }, { "model_id": "Qwen/Qwen3.5-397B-A17B", "short_name": "Qwen3.5-397B-A17B", "provider": "Qwen", "score": 88.4, "date": "2026-02-16" }, { "model_id": "FINAL-Bench/Darwin-36B-Opus", "short_name": "Darwin-36B-Opus", "provider": "FINAL-Bench", "score": 88.4, "date": "2026-04-22" }, { "model_id": "OrionLLM/GRM-2.6-Plus", "short_name": "GRM-2.6-Plus", "provider": "OrionLLM", "score": 88.3, "date": "2026-04-23" }, { "model_id": "inclusionAI/Ring-2.6-1T", "short_name": "Ring-2.6-1T", "provider": "inclusionAI", "score": 88.27, "date": "2026-05-14" }, { "model_id": "deepseek-ai/DeepSeek-V4-Flash", "short_name": "DeepSeek-V4-Flash", "provider": "deepseek-ai", "score": 88.1, "date": "2026-04-22" }, { "model_id": "Qwen/Qwen3.6-27B", "short_name": "Qwen3.6-27B", "provider": "Qwen", "score": 87.8, "date": "2026-04-21" }, { "model_id": "moonshotai/Kimi-K2.5", "short_name": "Kimi-K2.5", "provider": "moonshotai", "score": 87.6, "date": "2026-01-01" }, { "model_id": "tencent/Hy3-preview", "short_name": "Hy3-preview", "provider": "tencent", "score": 87.2, "date": "2026-04-13" }, { "model_id": "FINAL-Bench/Darwin-27B-Opus", "short_name": "Darwin-27B-Opus", "provider": "FINAL-Bench", "score": 86.9, "date": "2026-04-12" }, { "model_id": "Qwen/Qwen3.5-122B-A10B", "short_name": "Qwen3.5-122B-A10B", "provider": "Qwen", "score": 86.6, "date": "2026-02-24" }, { "model_id": "zai-org/GLM-5.1", "short_name": "GLM-5.1", "provider": "zai-org", "score": 86.2, "date": "2026-04-03" }, { "model_id": "zai-org/GLM-5", "short_name": "GLM-5", "provider": "zai-org", "score": 86.0, "date": "2026-02-11" }, { "model_id": "Qwen/Qwen3.6-35B-A3B", "short_name": "Qwen3.6-35B-A3B", "provider": "Qwen", "score": 86.0, "date": "2026-04-15" }, { "model_id": "FINAL-Bench/Darwin-31B-Opus", "short_name": "Darwin-31B-Opus", "provider": "FINAL-Bench", "score": 85.9, "date": "2026-04-06" }, { "model_id": "zai-org/GLM-4.7", "short_name": "GLM-4.7", "provider": "zai-org", "score": 85.7, "date": "2025-12-22" }, { "model_id": "Qwen/Qwen3.5-27B", "short_name": "Qwen3.5-27B", "provider": "Qwen", "score": 85.5, "date": "2026-02-24" }, { "model_id": "MiniMaxAI/MiniMax-M2.5", "short_name": "MiniMax-M2.5", "provider": "MiniMaxAI", "score": 85.2, "date": "2026-02-12" }, { "model_id": "moonshotai/Kimi-K2-Thinking", "short_name": "Kimi-K2-Thinking", "provider": "moonshotai", "score": 84.5, "date": "2025-11-04" }, { "model_id": "FINAL-Bench/Darwin-9B-NEG", "short_name": "Darwin-9B-NEG", "provider": "FINAL-Bench", "score": 84.34, "date": "2026-04-24" }, { "model_id": "google/gemma-4-31B-it", "short_name": "gemma-4-31B-it", "provider": "google", "score": 84.3, "date": "2026-03-11" }, { "model_id": "Qwen/Qwen3.5-35B-A3B", "short_name": "Qwen3.5-35B-A3B", "provider": "Qwen", "score": 84.2, "date": "2026-02-24" }, { "model_id": "Nanbeige/Nanbeige4.1-3B", "short_name": "Nanbeige4.1-3B", "provider": "Nanbeige", "score": 83.8, "date": "2026-02-10" }, { "model_id": "stepfun-ai/Step-3.5-Flash", "short_name": "Step-3.5-Flash", "provider": "stepfun-ai", "score": 83.5, "date": "2026-02-01" }, { "model_id": "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "short_name": "NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "provider": "nvidia", "score": 82.7, "date": "2026-03-10" }, { "model_id": "RedHatAI/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "short_name": "NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "provider": "RedHatAI", "score": 82.7, "date": "2026-03-26" }, { "model_id": "deepseek-ai/DeepSeek-V3.2", "short_name": "DeepSeek-V3.2", "provider": "deepseek-ai", "score": 82.4, "date": "2025-12-01" }, { "model_id": "google/gemma-4-26B-A4B-it", "short_name": "gemma-4-26B-A4B-it", "provider": "google", "score": 82.3, "date": "2026-03-11" }, { "model_id": "Qwen/Qwen3.5-9B", "short_name": "Qwen3.5-9B", "provider": "Qwen", "score": 81.7, "date": "2026-02-27" }, { "model_id": "openai/gpt-oss-120b", "short_name": "gpt-oss-120b", "provider": "openai", "score": 80.9, "date": "2025-08-04" }, { "model_id": "meituan-longcat/LongCat-Flash-Thinking-2601", "short_name": "LongCat-Flash-Thinking-2601", "provider": "meituan-longcat", "score": 80.5, "date": "2026-01-14" }, { "model_id": "LGAI-EXAONE/EXAONE-4.5-33B", "short_name": "EXAONE-4.5-33B", "provider": "LGAI-EXAONE", "score": 80.5, "date": "2026-04-04" }, { "model_id": "LGAI-EXAONE/K-EXAONE-236B-A23B", "short_name": "K-EXAONE-236B-A23B", "provider": "LGAI-EXAONE", "score": 79.1, "date": "2025-12-26" }, { "model_id": "OrionLLM/GRM-2.5", "short_name": "GRM-2.5", "provider": "OrionLLM", "score": 76.7, "date": "2026-04-07" }, { "model_id": "arcee-ai/Trinity-Large-Thinking", "short_name": "Trinity-Large-Thinking", "provider": "arcee-ai", "score": 76.3, "date": "2026-04-01" }, { "model_id": "Qwen/Qwen3.5-4B", "short_name": "Qwen3.5-4B", "provider": "Qwen", "score": 76.2, "date": "2026-02-27" }, { "model_id": "nvidia/Nemotron-Cascade-2-30B-A3B", "short_name": "Nemotron-Cascade-2-30B-A3B", "provider": "nvidia", "score": 76.1, "date": "2026-03-18" }, { "model_id": "zai-org/GLM-4.7-Flash", "short_name": "GLM-4.7-Flash", "provider": "zai-org", "score": 75.2, "date": "2026-01-19" }, { "model_id": "jdopensource/JoyAI-LLM-Flash", "short_name": "JoyAI-LLM-Flash", "provider": "jdopensource", "score": 74.43, "date": "2026-02-14" }, { "model_id": "openai/gpt-oss-20b", "short_name": "gpt-oss-20b", "provider": "openai", "score": 74.2, "date": "2025-08-04" }, { "model_id": "deepseek-ai/DeepSeek-R1", "short_name": "DeepSeek-R1", "provider": "deepseek-ai", "score": 71.5, "date": "2025-01-20" }, { "model_id": "mistralai/Mistral-Small-4-119B-2603", "short_name": "Mistral-Small-4-119B-2603", "provider": "mistralai", "score": 71.2, "date": "2026-01-23" }, { "model_id": "Zyphra/ZAYA1-8B", "short_name": "ZAYA1-8B", "provider": "Zyphra", "score": 71.0, "date": "2026-05-04" }, { "model_id": "XiaomiMiMo/MiMo-V2.5-Pro", "short_name": "MiMo-V2.5-Pro", "provider": "XiaomiMiMo", "score": 66.7, "date": "2026-04-27" }, { "model_id": "Qwen/Qwen3-4B-Thinking-2507", "short_name": "Qwen3-4B-Thinking-2507", "provider": "Qwen", "score": 65.8, "date": "2025-08-05" }, { "model_id": "Qwen/Qwen3-4B-Instruct-2507", "short_name": "Qwen3-4B-Instruct-2507", "provider": "Qwen", "score": 62.0, "date": "2025-08-05" }, { "model_id": "google/gemma-4-E4B-it", "short_name": "gemma-4-E4B-it", "provider": "google", "score": 58.6, "date": "2026-03-02" }, { "model_id": "OrionLLM/GRM-7b", "short_name": "GRM-7b", "provider": "OrionLLM", "score": 53.7, "date": "2026-03-13" }, { "model_id": "ibm-granite/granite-4.1-30b", "short_name": "granite-4.1-30b", "provider": "ibm-granite", "score": 45.76, "date": "2026-04-06" }, { "model_id": "google/gemma-4-E2B-it", "short_name": "gemma-4-E2B-it", "provider": "google", "score": 43.4, "date": "2026-03-02" }, { "model_id": "ibm-granite/granite-4.1-8b", "short_name": "granite-4.1-8b", "provider": "ibm-granite", "score": 41.96, "date": "2026-04-06" }, { "model_id": "LiquidAI/LFM2.5-1.2B-Instruct", "short_name": "LFM2.5-1.2B-Instruct", "provider": "LiquidAI", "score": 38.89, "date": "2026-01-06" }, { "model_id": "ibm-granite/granite-4.1-3b", "short_name": "granite-4.1-3b", "provider": "ibm-granite", "score": 31.7, "date": "2026-04-06" }, { "model_id": "LiquidAI/LFM2.5-350M", "short_name": "LFM2.5-350M", "provider": "LiquidAI", "score": 30.64, "date": "2026-03-31" }, { "model_id": "meta-llama/Llama-3.1-8B-Instruct", "short_name": "Llama-3.1-8B-Instruct", "provider": "meta-llama", "score": 30.4, "date": "2024-07-18" }, { "model_id": "OrionLLM/GRM-1.5b", "short_name": "GRM-1.5b", "provider": "OrionLLM", "score": 29.5, "date": "2026-03-13" }, { "model_id": "OrionLLM/GRM-2.5-Air", "short_name": "GRM-2.5-Air", "provider": "OrionLLM", "score": 12.5, "date": "2026-04-07" }, { "model_id": "Qwen/Qwen3.5-0.8B", "short_name": "Qwen3.5-0.8B", "provider": "Qwen", "score": 11.9, "date": "2026-02-28" } ] }, "open_asr_leaderboard": { "name": "open-asr-leaderboard", "dataset": "hf-audio/open-asr-leaderboard", "lower_is_better": true, "models": [ { "model_id": "ibm-granite/granite-speech-4.1-2b", "short_name": "granite-speech-4.1-2b", "provider": "ibm-granite", "score": 5.33, "date": "2026-04-16" }, { "model_id": "CohereLabs/cohere-transcribe-03-2026", "short_name": "cohere-transcribe-03-2026", "provider": "CohereLabs", "score": 5.42, "date": "2026-03-24" }, { "model_id": "ibm-granite/granite-4.0-1b-speech", "short_name": "granite-4.0-1b-speech", "provider": "ibm-granite", "score": 5.52, "date": "2026-02-27" }, { "model_id": "nvidia/canary-qwen-2.5b", "short_name": "canary-qwen-2.5b", "provider": "nvidia", "score": 5.63, "date": "2025-06-26" }, { "model_id": "ibm-granite/granite-speech-3.3-8b", "short_name": "granite-speech-3.3-8b", "provider": "ibm-granite", "score": 5.74, "date": "2025-04-14" }, { "model_id": "Qwen/Qwen3-ASR-1.7B", "short_name": "Qwen3-ASR-1.7B", "provider": "Qwen", "score": 5.76, "date": "2026-01-28" }, { "model_id": "okestro-ai-lab/SYMPHONY-ASR", "short_name": "SYMPHONY-ASR", "provider": "okestro-ai-lab", "score": 5.91, "date": "2026-01-12" }, { "model_id": "ibm-granite/granite-speech-3.3-2b", "short_name": "granite-speech-3.3-2b", "provider": "ibm-granite", "score": 6.0, "date": "2025-04-28" }, { "model_id": "microsoft/Phi-4-multimodal-instruct", "short_name": "Phi-4-multimodal-instruct", "provider": "microsoft", "score": 6.02, "date": "2025-02-24" }, { "model_id": "nvidia/parakeet-tdt-0.6b-v2", "short_name": "parakeet-tdt-0.6b-v2", "provider": "nvidia", "score": 6.05, "date": "2025-04-15" }, { "model_id": "nvidia/parakeet-tdt-0.6b-v3", "short_name": "parakeet-tdt-0.6b-v3", "provider": "nvidia", "score": 6.32, "date": "2025-08-04" }, { "model_id": "nvidia/canary-1b-flash", "short_name": "canary-1b-flash", "provider": "nvidia", "score": 6.35, "date": "2025-03-07" }, { "model_id": "kyutai/stt-2.6b-en", "short_name": "stt-2.6b-en", "provider": "kyutai", "score": 6.4, "date": "2025-06-06" }, { "model_id": "Qwen/Qwen3-ASR-0.6B", "short_name": "Qwen3-ASR-0.6B", "provider": "Qwen", "score": 6.42, "date": "2026-01-28" }, { "model_id": "okestro-ai-lab/SYMPHONY", "short_name": "SYMPHONY", "provider": "okestro-ai-lab", "score": 6.48, "date": "2025-10-27" }, { "model_id": "nvidia/canary-1b", "short_name": "canary-1b", "provider": "nvidia", "score": 6.5, "date": "2024-02-07" }, { "model_id": "mistralai/Voxtral-Small-24B-2507", "short_name": "Voxtral-Small-24B-2507", "provider": "mistralai", "score": 6.62, "date": "2025-07-01" }, { "model_id": "UsefulSensors/moonshine-streaming-medium", "short_name": "moonshine-streaming-medium", "provider": "UsefulSensors", "score": 6.66, "date": "2026-01-06" }, { "model_id": "nyrahealth/CrisperWhisper", "short_name": "CrisperWhisper", "provider": "nyrahealth", "score": 6.67, "date": "2024-08-29" }, { "model_id": "nvidia/parakeet-tdt-1.1b", "short_name": "parakeet-tdt-1.1b", "provider": "nvidia", "score": 7.01, "date": "2024-01-25" }, { "model_id": "zai-org/GLM-ASR-Nano-2512", "short_name": "GLM-ASR-Nano-2512", "provider": "zai-org", "score": 7.03, "date": "2025-12-09" }, { "model_id": "mistralai/Voxtral-Mini-3B-2507", "short_name": "Voxtral-Mini-3B-2507", "provider": "mistralai", "score": 7.05, "date": "2025-07-01" }, { "model_id": "soundsgoodai/Zipformer-transducer-XL-290M", "short_name": "Zipformer-transducer-XL-290M", "provider": "soundsgoodai", "score": 7.06, "date": "2026-05-12" }, { "model_id": "nvidia/canary-180m-flash", "short_name": "canary-180m-flash", "provider": "nvidia", "score": 7.12, "date": "2025-03-11" }, { "model_id": "nvidia/parakeet-rnnt-1.1b", "short_name": "parakeet-rnnt-1.1b", "provider": "nvidia", "score": 7.12, "date": "2023-12-27" }, { "model_id": "nvidia/canary-1b-v2", "short_name": "canary-1b-v2", "provider": "nvidia", "score": 7.15, "date": "2025-08-04" }, { "model_id": "distil-whisper/distil-large-v3.5", "short_name": "distil-large-v3.5", "provider": "distil-whisper", "score": 7.21, "date": "2024-12-05" }, { "model_id": "efficient-speech/lite-whisper-large-v3-acc", "short_name": "lite-whisper-large-v3-acc", "provider": "efficient-speech", "score": 7.23, "date": "2025-02-26" }, { "model_id": "nvidia/parakeet-ctc-1.1b", "short_name": "parakeet-ctc-1.1b", "provider": "nvidia", "score": 7.4, "date": "2023-12-28" }, { "model_id": "espnet/owsm_ctc_v4_1B", "short_name": "owsm_ctc_v4_1B", "provider": "espnet", "score": 7.42, "date": "2025-01-16" }, { "model_id": "efficient-speech/lite-whisper-large-v3", "short_name": "lite-whisper-large-v3", "provider": "efficient-speech", "score": 7.43, "date": "2025-02-26" }, { "model_id": "openai/whisper-large-v3", "short_name": "whisper-large-v3", "provider": "openai", "score": 7.44, "date": "2023-11-07" }, { "model_id": "nvidia/parakeet-tdt_ctc-110m", "short_name": "parakeet-tdt_ctc-110m", "provider": "nvidia", "score": 7.49, "date": "2024-09-17" }, { "model_id": "nvidia/parakeet-rnnt-0.6b", "short_name": "parakeet-rnnt-0.6b", "provider": "nvidia", "score": 7.5, "date": "2023-12-28" }, { "model_id": "distil-whisper/distil-large-v3", "short_name": "distil-large-v3", "provider": "distil-whisper", "score": 7.52, "date": "2024-03-21" }, { "model_id": "mistralai/Voxtral-Mini-4B-Realtime-2602", "short_name": "Voxtral-Mini-4B-Realtime-2602", "provider": "mistralai", "score": 7.68, "date": "2026-01-21" }, { "model_id": "nvidia/parakeet-ctc-0.6b", "short_name": "parakeet-ctc-0.6b", "provider": "nvidia", "score": 7.69, "date": "2023-12-28" }, { "model_id": "efficient-speech/lite-whisper-large-v3-turbo-acc", "short_name": "lite-whisper-large-v3-turbo-acc", "provider": "efficient-speech", "score": 7.77, "date": "2025-02-26" }, { "model_id": "microsoft/VibeVoice-ASR-HF", "short_name": "VibeVoice-ASR-HF", "provider": "microsoft", "score": 7.77, "date": "2026-03-02" }, { "model_id": "openai/whisper-large-v3-turbo", "short_name": "whisper-large-v3-turbo", "provider": "openai", "score": 7.83, "date": "2024-10-01" }, { "model_id": "openai/whisper-large-v2", "short_name": "whisper-large-v2", "provider": "openai", "score": 7.83, "date": "2022-12-05" }, { "model_id": "UsefulSensors/moonshine-streaming-small", "short_name": "moonshine-streaming-small", "provider": "UsefulSensors", "score": 7.84, "date": "2026-01-06" }, { "model_id": "distil-whisper/distil-large-v2", "short_name": "distil-large-v2", "provider": "distil-whisper", "score": 7.92, "date": "2023-10-24" }, { "model_id": "openai/whisper-large", "short_name": "whisper-large", "provider": "openai", "score": 7.94, "date": "2022-09-26" }, { "model_id": "openai/whisper-medium.en", "short_name": "whisper-medium.en", "provider": "openai", "score": 8.09, "date": "2022-09-26" }, { "model_id": "espnet/owsm_ctc_v3.1_1B", "short_name": "owsm_ctc_v3.1_1B", "provider": "espnet", "score": 8.12, "date": "2024-02-23" }, { "model_id": "efficient-speech/lite-whisper-large-v3-fast", "short_name": "lite-whisper-large-v3-fast", "provider": "efficient-speech", "score": 8.16, "date": "2025-02-26" }, { "model_id": "nvidia/stt_en_conformer_ctc_large", "short_name": "stt_en_conformer_ctc_large", "provider": "nvidia", "score": 8.32, "date": "2022-04-09" }, { "model_id": "speechbrain/asr-conformer-loquacious", "short_name": "asr-conformer-loquacious", "provider": "speechbrain", "score": 8.48, "date": "2025-02-06" }, { "model_id": "distil-whisper/distil-small.en", "short_name": "distil-small.en", "provider": "distil-whisper", "score": 8.57, "date": "2023-12-06" }, { "model_id": "openai/whisper-small.en", "short_name": "whisper-small.en", "provider": "openai", "score": 8.59, "date": "2022-09-26" }, { "model_id": "distil-whisper/distil-medium.en", "short_name": "distil-medium.en", "provider": "distil-whisper", "score": 8.77, "date": "2023-10-24" }, { "model_id": "abr-ai/niagara-38m-batch.en", "short_name": "niagara-38m-batch.en", "provider": "abr-ai", "score": 8.91, "date": "2026-02-19" }, { "model_id": "nvidia/stt_en_fastconformer_ctc_large", "short_name": "stt_en_fastconformer_ctc_large", "provider": "nvidia", "score": 8.96, "date": "2023-06-08" }, { "model_id": "nvidia/stt_en_fastconformer_transducer_large", "short_name": "stt_en_fastconformer_transducer_large", "provider": "nvidia", "score": 9.06, "date": "2023-06-08" }, { "model_id": "UsefulSensors/moonshine-base", "short_name": "moonshine-base", "provider": "UsefulSensors", "score": 9.99, "date": "2024-11-02" }, { "model_id": "openai/whisper-base.en", "short_name": "whisper-base.en", "provider": "openai", "score": 10.32, "date": "2022-09-26" }, { "model_id": "abr-ai/niagara-19m-batch.en", "short_name": "niagara-19m-batch.en", "provider": "abr-ai", "score": 10.47, "date": "2025-11-13" }, { "model_id": "nvidia/stt_en_conformer_ctc_small", "short_name": "stt_en_conformer_ctc_small", "provider": "nvidia", "score": 11.16, "date": "2023-06-12" }, { "model_id": "UsefulSensors/moonshine-streaming-tiny", "short_name": "moonshine-streaming-tiny", "provider": "UsefulSensors", "score": 12.0, "date": "2026-01-06" }, { "model_id": "UsefulSensors/moonshine-tiny", "short_name": "moonshine-tiny", "provider": "UsefulSensors", "score": 12.65, "date": "2024-10-30" }, { "model_id": "openai/whisper-tiny.en", "short_name": "whisper-tiny.en", "provider": "openai", "score": 12.81, "date": "2022-09-26" }, { "model_id": "speechbrain/asr-wav2vec2-librispeech", "short_name": "asr-wav2vec2-librispeech", "provider": "speechbrain", "score": 14.35, "date": "2022-06-05" }, { "model_id": "facebook/wav2vec2-large-960h-lv60-self", "short_name": "wav2vec2-large-960h-lv60-self", "provider": "facebook", "score": 21.27, "date": "2022-03-02" }, { "model_id": "facebook/mms-1b-all", "short_name": "mms-1b-all", "provider": "facebook", "score": 22.54, "date": "2023-05-27" }, { "model_id": "facebook/hubert-xlarge-ls960-ft", "short_name": "hubert-xlarge-ls960-ft", "provider": "facebook", "score": 22.55, "date": "2022-03-02" }, { "model_id": "facebook/hubert-large-ls960-ft", "short_name": "hubert-large-ls960-ft", "provider": "facebook", "score": 22.69, "date": "2022-03-02" }, { "model_id": "facebook/wav2vec2-large-robust-ft-libri-960h", "short_name": "wav2vec2-large-robust-ft-libri-960h", "provider": "facebook", "score": 22.93, "date": "2022-03-02" }, { "model_id": "facebook/data2vec-audio-large-960h", "short_name": "data2vec-audio-large-960h", "provider": "facebook", "score": 23.21, "date": "2022-04-02" }, { "model_id": "facebook/wav2vec2-conformer-rope-large-960h-ft", "short_name": "wav2vec2-conformer-rope-large-960h-ft", "provider": "facebook", "score": 23.28, "date": "2022-04-18" }, { "model_id": "facebook/wav2vec2-conformer-rel-pos-large-960h-ft", "short_name": "wav2vec2-conformer-rel-pos-large-960h-ft", "provider": "facebook", "score": 23.29, "date": "2022-04-18" }, { "model_id": "facebook/wav2vec2-large-960h", "short_name": "wav2vec2-large-960h", "provider": "facebook", "score": 26.77, "date": "2022-03-02" }, { "model_id": "facebook/data2vec-audio-base-960h", "short_name": "data2vec-audio-base-960h", "provider": "facebook", "score": 28.3, "date": "2022-03-02" }, { "model_id": "facebook/wav2vec2-base-960h", "short_name": "wav2vec2-base-960h", "provider": "facebook", "score": 29.4, "date": "2022-03-02" }, { "model_id": "facebook/mms-1b-fl102", "short_name": "mms-1b-fl102", "provider": "facebook", "score": 39.8, "date": "2023-05-27" } ] }, "olmOcr": { "name": "olmOCR-bench", "dataset": "allenai/olmOCR-bench", "lower_is_better": false, "models": [ { "model_id": "infly/Infinity-Parser2-Pro", "short_name": "Infinity-Parser2-Pro", "provider": "infly", "score": 87.6, "date": "2026-04-08" }, { "model_id": "datalab-to/chandra-ocr-2", "short_name": "chandra-ocr-2", "provider": "datalab-to", "score": 85.9, "date": "2026-03-16" }, { "model_id": "rednote-hilab/dots.mocr", "short_name": "dots.mocr", "provider": "rednote-hilab", "score": 83.9, "date": "2026-03-19" }, { "model_id": "lightonai/LightOnOCR-2-1B", "short_name": "LightOnOCR-2-1B", "provider": "lightonai", "score": 83.2, "date": "2026-01-16" }, { "model_id": "datalab-to/chandra", "short_name": "chandra", "provider": "datalab-to", "score": 83.1, "date": "2025-10-21" }, { "model_id": "infly/Infinity-Parser-7B", "short_name": "Infinity-Parser-7B", "provider": "infly", "score": 82.5, "date": "2025-10-17" }, { "model_id": "tiiuae/Falcon-OCR", "short_name": "Falcon-OCR", "provider": "tiiuae", "score": 80.3, "date": "2026-02-22" }, { "model_id": "PaddlePaddle/PaddleOCR-VL", "short_name": "PaddleOCR-VL", "provider": "PaddlePaddle", "score": 80.0, "date": "2025-10-16" }, { "model_id": "baidu/Qianfan-OCR", "short_name": "Qianfan-OCR", "provider": "baidu", "score": 79.8, "date": "2026-03-18" }, { "model_id": "rednote-hilab/dots.ocr", "short_name": "dots.ocr", "provider": "rednote-hilab", "score": 79.1, "date": "2025-07-30" }, { "model_id": "deepseek-ai/DeepSeek-OCR-2", "short_name": "DeepSeek-OCR-2", "provider": "deepseek-ai", "score": 76.3, "date": "2026-01-27" }, { "model_id": "lightonai/LightOnOCR-1B-1025", "short_name": "LightOnOCR-1B-1025", "provider": "lightonai", "score": 76.1, "date": "2025-10-20" }, { "model_id": "deepseek-ai/DeepSeek-OCR", "short_name": "DeepSeek-OCR", "provider": "deepseek-ai", "score": 75.7, "date": "2025-10-17" }, { "model_id": "opendatalab/MinerU2.5-2509-1.2B", "short_name": "MinerU2.5-2509-1.2B", "provider": "opendatalab", "score": 75.2, "date": "2025-09-17" }, { "model_id": "zai-org/GLM-OCR", "short_name": "GLM-OCR", "provider": "zai-org", "score": 75.2, "date": "2026-01-30" }, { "model_id": "FireRedTeam/FireRed-OCR", "short_name": "FireRed-OCR", "provider": "FireRedTeam", "score": 70.2, "date": "2026-02-28" }, { "model_id": "nanonets/Nanonets-OCR2-3B", "short_name": "Nanonets-OCR2-3B", "provider": "nanonets", "score": 69.5, "date": "2025-10-13" } ] }, "sweVerified": { "name": "SWE-bench Verified", "dataset": "SWE-bench/SWE-bench_Verified", "lower_is_better": false, "models": [ { "model_id": "deepseek-ai/DeepSeek-V4-Pro", "short_name": "DeepSeek-V4-Pro", "provider": "deepseek-ai", "score": 80.6, "date": "2026-04-22" }, { "model_id": "moonshotai/Kimi-K2.6", "short_name": "Kimi-K2.6", "provider": "moonshotai", "score": 80.2, "date": "2026-04-14" }, { "model_id": "deepseek-ai/DeepSeek-V4-Flash", "short_name": "DeepSeek-V4-Flash", "provider": "deepseek-ai", "score": 79.0, "date": "2026-04-22" }, { "model_id": "XiaomiMiMo/MiMo-V2.5-Pro", "short_name": "MiMo-V2.5-Pro", "provider": "XiaomiMiMo", "score": 78.9, "date": "2026-04-27" }, { "model_id": "zai-org/GLM-5", "short_name": "GLM-5", "provider": "zai-org", "score": 77.8, "date": "2026-02-11" }, { "model_id": "OrionLLM/GRM-2.6-Plus", "short_name": "GRM-2.6-Plus", "provider": "OrionLLM", "score": 77.7, "date": "2026-04-23" }, { "model_id": "mistralai/Mistral-Medium-3.5-128B", "short_name": "Mistral-Medium-3.5-128B", "provider": "mistralai", "score": 77.6, "date": "2026-03-31" }, { "model_id": "Qwen/Qwen3.6-27B", "short_name": "Qwen3.6-27B", "provider": "Qwen", "score": 77.2, "date": "2026-04-21" }, { "model_id": "Qwen/Qwen3.5-397B-A17B", "short_name": "Qwen3.5-397B-A17B", "provider": "Qwen", "score": 76.4, "date": "2026-02-16" }, { "model_id": "MiniMaxAI/MiniMax-M2.5", "short_name": "MiniMax-M2.5", "provider": "MiniMaxAI", "score": 75.8, "date": "2026-02-12" }, { "model_id": "Multilingual-Multimodal-NLP/IndustrialCoder", "short_name": "IndustrialCoder", "provider": "Multilingual-Multimodal-NLP", "score": 74.8, "date": "2026-03-13" }, { "model_id": "stepfun-ai/Step-3.5-Flash", "short_name": "Step-3.5-Flash", "provider": "stepfun-ai", "score": 74.4, "date": "2026-02-01" }, { "model_id": "tencent/Hy3-preview", "short_name": "Hy3-preview", "provider": "tencent", "score": 74.4, "date": "2026-04-13" }, { "model_id": "MiniMaxAI/MiniMax-M2.1", "short_name": "MiniMax-M2.1", "provider": "MiniMaxAI", "score": 74.0, "date": "2025-12-20" }, { "model_id": "inclusionAI/Ring-2.6-1T", "short_name": "Ring-2.6-1T", "provider": "inclusionAI", "score": 74.0, "date": "2026-05-14" }, { "model_id": "zai-org/GLM-4.7", "short_name": "GLM-4.7", "provider": "zai-org", "score": 73.8, "date": "2025-12-22" }, { "model_id": "Qwen/Qwen3.6-35B-A3B", "short_name": "Qwen3.6-35B-A3B", "provider": "Qwen", "score": 73.4, "date": "2026-04-15" }, { "model_id": "Qwen/Qwen3.5-27B", "short_name": "Qwen3.5-27B", "provider": "Qwen", "score": 72.4, "date": "2026-02-24" }, { "model_id": "inclusionAI/Ling-2.6-1T", "short_name": "Ling-2.6-1T", "provider": "inclusionAI", "score": 72.2, "date": "2026-04-29" }, { "model_id": "Qwen/Qwen3.5-122B-A10B", "short_name": "Qwen3.5-122B-A10B", "provider": "Qwen", "score": 72.0, "date": "2026-02-24" }, { "model_id": "moonshotai/Kimi-K2-Thinking", "short_name": "Kimi-K2-Thinking", "provider": "moonshotai", "score": 71.3, "date": "2025-11-04" }, { "model_id": "moonshotai/Kimi-K2.5", "short_name": "Kimi-K2.5", "provider": "moonshotai", "score": 70.8, "date": "2026-01-01" }, { "model_id": "Qwen/Qwen3-Coder-Next", "short_name": "Qwen3-Coder-Next", "provider": "Qwen", "score": 70.6, "date": "2026-01-30" }, { "model_id": "deepseek-ai/DeepSeek-V3.2", "short_name": "DeepSeek-V3.2", "provider": "deepseek-ai", "score": 70.0, "date": "2025-12-01" }, { "model_id": "MiniMaxAI/MiniMax-M2", "short_name": "MiniMax-M2", "provider": "MiniMaxAI", "score": 69.4, "date": "2025-10-22" }, { "model_id": "Qwen/Qwen3.5-35B-A3B", "short_name": "Qwen3.5-35B-A3B", "provider": "Qwen", "score": 69.2, "date": "2026-02-24" }, { "model_id": "poolside/Laguna-XS.2", "short_name": "Laguna-XS.2", "provider": "poolside", "score": 68.2, "date": "2026-04-23" }, { "model_id": "GAIR/OpenSWE-72B", "short_name": "OpenSWE-72B", "provider": "GAIR", "score": 66.0, "date": "2026-03-15" }, { "model_id": "internlm/Intern-S2-Preview", "short_name": "Intern-S2-Preview", "provider": "internlm", "score": 64.0, "date": "2026-05-15" }, { "model_id": "arcee-ai/Trinity-Large-Thinking", "short_name": "Trinity-Large-Thinking", "provider": "arcee-ai", "score": 63.2, "date": "2026-04-01" }, { "model_id": "openai/gpt-oss-120b", "short_name": "gpt-oss-120b", "provider": "openai", "score": 62.4, "date": "2025-08-04" }, { "model_id": "GAIR/OpenSWE-32B", "short_name": "OpenSWE-32B", "provider": "GAIR", "score": 62.4, "date": "2026-03-15" }, { "model_id": "inclusionAI/Ling-2.6-flash", "short_name": "Ling-2.6-flash", "provider": "inclusionAI", "score": 61.2, "date": "2026-04-28" }, { "model_id": "openai/gpt-oss-20b", "short_name": "gpt-oss-20b", "provider": "openai", "score": 60.7, "date": "2025-08-04" }, { "model_id": "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "short_name": "NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "provider": "nvidia", "score": 60.47, "date": "2026-03-10" }, { "model_id": "RedHatAI/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "short_name": "NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "provider": "RedHatAI", "score": 60.47, "date": "2026-03-26" }, { "model_id": "zai-org/GLM-4.7-Flash", "short_name": "GLM-4.7-Flash", "provider": "zai-org", "score": 59.2, "date": "2026-01-19" }, { "model_id": "facebook/cwm", "short_name": "cwm", "provider": "facebook", "score": 53.9, "date": "2025-08-25" }, { "model_id": "SWE-Lego/SWE-Lego-Qwen3-32B", "short_name": "SWE-Lego-Qwen3-32B", "provider": "SWE-Lego", "score": 52.6, "date": "2026-01-05" }, { "model_id": "SWE-Lego/SWE-Lego-Qwen3-8B", "short_name": "SWE-Lego-Qwen3-8B", "provider": "SWE-Lego", "score": 42.2, "date": "2025-12-29" }, { "model_id": "OrionLLM/Terminus-Qwen3-8b", "short_name": "Terminus-Qwen3-8b", "provider": "OrionLLM", "score": 15.7, "date": "2026-03-06" } ] }, "mmluPro": { "name": "MMLU-Pro", "dataset": "TIGER-Lab/MMLU-Pro", "lower_is_better": false, "models": [ { "model_id": "internlm/Intern-S2-Preview", "short_name": "Intern-S2-Preview", "provider": "internlm", "score": 88.0, "date": "2026-05-15" }, { "model_id": "Qwen/Qwen3.5-397B-A17B", "short_name": "Qwen3.5-397B-A17B", "provider": "Qwen", "score": 87.8, "date": "2026-02-16" }, { "model_id": "deepseek-ai/DeepSeek-V4-Pro", "short_name": "DeepSeek-V4-Pro", "provider": "deepseek-ai", "score": 87.5, "date": "2026-04-22" }, { "model_id": "moonshotai/Kimi-K2.5", "short_name": "Kimi-K2.5", "provider": "moonshotai", "score": 87.1, "date": "2026-01-01" }, { "model_id": "OrionLLM/GRM-2.6-Plus", "short_name": "GRM-2.6-Plus", "provider": "OrionLLM", "score": 86.8, "date": "2026-04-23" }, { "model_id": "deepseek-ai/DeepSeek-V4-Flash", "short_name": "DeepSeek-V4-Flash", "provider": "deepseek-ai", "score": 86.4, "date": "2026-04-22" }, { "model_id": "Qwen/Qwen3.6-27B", "short_name": "Qwen3.6-27B", "provider": "Qwen", "score": 86.2, "date": "2026-04-21" }, { "model_id": "google/gemma-4-31B-it", "short_name": "gemma-4-31B-it", "provider": "google", "score": 85.2, "date": "2026-03-11" }, { "model_id": "Qwen/Qwen3.6-35B-A3B", "short_name": "Qwen3.6-35B-A3B", "provider": "Qwen", "score": 85.2, "date": "2026-04-15" }, { "model_id": "deepseek-ai/DeepSeek-R1-0528", "short_name": "DeepSeek-R1-0528", "provider": "deepseek-ai", "score": 85.0, "date": "2025-05-28" }, { "model_id": "deepseek-ai/DeepSeek-V3.2", "short_name": "DeepSeek-V3.2", "provider": "deepseek-ai", "score": 85.0, "date": "2025-12-01" }, { "model_id": "stepfun-ai/Step-3.5-Flash", "short_name": "Step-3.5-Flash", "provider": "stepfun-ai", "score": 84.4, "date": "2026-02-01" }, { "model_id": "deepseek-ai/DeepSeek-R1", "short_name": "DeepSeek-R1", "provider": "deepseek-ai", "score": 84.0, "date": "2025-01-20" }, { "model_id": "LGAI-EXAONE/K-EXAONE-236B-A23B", "short_name": "K-EXAONE-236B-A23B", "provider": "LGAI-EXAONE", "score": 83.8, "date": "2025-12-26" }, { "model_id": "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "short_name": "NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "provider": "nvidia", "score": 83.73, "date": "2026-03-10" }, { "model_id": "RedHatAI/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "short_name": "NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "provider": "RedHatAI", "score": 83.73, "date": "2026-03-26" }, { "model_id": "arcee-ai/Trinity-Large-Thinking", "short_name": "Trinity-Large-Thinking", "provider": "arcee-ai", "score": 83.4, "date": "2026-04-01" }, { "model_id": "LGAI-EXAONE/EXAONE-4.5-33B", "short_name": "EXAONE-4.5-33B", "provider": "LGAI-EXAONE", "score": 83.3, "date": "2026-04-04" }, { "model_id": "google/gemma-4-26B-A4B-it", "short_name": "gemma-4-26B-A4B-it", "provider": "google", "score": 82.6, "date": "2026-03-11" }, { "model_id": "Qwen/Qwen3.5-9B", "short_name": "Qwen3.5-9B", "provider": "Qwen", "score": 82.5, "date": "2026-02-27" }, { "model_id": "MiniMaxAI/MiniMax-M2", "short_name": "MiniMax-M2", "provider": "MiniMaxAI", "score": 82.0, "date": "2025-10-22" }, { "model_id": "deepseek-ai/DeepSeek-V3-0324", "short_name": "DeepSeek-V3-0324", "provider": "deepseek-ai", "score": 81.2, "date": "2025-03-24" }, { "model_id": "jdopensource/JoyAI-LLM-Flash", "short_name": "JoyAI-LLM-Flash", "provider": "jdopensource", "score": 81.02, "date": "2026-02-14" }, { "model_id": "OrionLLM/GRM-2.5", "short_name": "GRM-2.5", "provider": "OrionLLM", "score": 80.1, "date": "2026-04-07" }, { "model_id": "Qwen/Qwen3.5-4B", "short_name": "Qwen3.5-4B", "provider": "Qwen", "score": 79.1, "date": "2026-02-27" }, { "model_id": "meituan-longcat/LongCat-Flash-Lite", "short_name": "LongCat-Flash-Lite", "provider": "meituan-longcat", "score": 78.29, "date": "2026-01-27" }, { "model_id": "arcee-ai/Trinity-Large-Preview", "short_name": "Trinity-Large-Preview", "provider": "arcee-ai", "score": 75.2, "date": "2026-01-27" }, { "model_id": "Zyphra/ZAYA1-8B", "short_name": "ZAYA1-8B", "provider": "Zyphra", "score": 74.2, "date": "2026-05-04" }, { "model_id": "Qwen/Qwen3-4B-Instruct-2507", "short_name": "Qwen3-4B-Instruct-2507", "provider": "Qwen", "score": 69.6, "date": "2025-08-05" }, { "model_id": "google/gemma-4-E4B-it", "short_name": "gemma-4-E4B-it", "provider": "google", "score": 69.4, "date": "2026-03-02" }, { "model_id": "XiaomiMiMo/MiMo-V2.5-Pro", "short_name": "MiMo-V2.5-Pro", "provider": "XiaomiMiMo", "score": 68.5, "date": "2026-04-27" }, { "model_id": "deepseek-ai/DeepSeek-V3", "short_name": "DeepSeek-V3", "provider": "deepseek-ai", "score": 64.4, "date": "2024-12-25" }, { "model_id": "ibm-granite/granite-4.1-30b", "short_name": "granite-4.1-30b", "provider": "ibm-granite", "score": 64.09, "date": "2026-04-06" }, { "model_id": "google/gemma-4-E2B-it", "short_name": "gemma-4-E2B-it", "provider": "google", "score": 60.0, "date": "2026-03-02" }, { "model_id": "ibm-granite/granite-4.1-8b", "short_name": "granite-4.1-8b", "provider": "ibm-granite", "score": 55.99, "date": "2026-04-06" }, { "model_id": "Qwen/Qwen3.5-2B", "short_name": "Qwen3.5-2B", "provider": "Qwen", "score": 55.3, "date": "2026-02-28" }, { "model_id": "ibm-granite/granite-4.1-3b", "short_name": "granite-4.1-3b", "provider": "ibm-granite", "score": 49.83, "date": "2026-04-06" }, { "model_id": "Xerv-AI/MAXWELL", "short_name": "MAXWELL", "provider": "Xerv-AI", "score": 45.0, "date": "2026-05-04" }, { "model_id": "OrionLLM/GRM-2.5-Air", "short_name": "GRM-2.5-Air", "provider": "OrionLLM", "score": 43.6, "date": "2026-04-07" }, { "model_id": "Qwen/Qwen3.5-0.8B", "short_name": "Qwen3.5-0.8B", "provider": "Qwen", "score": 29.7, "date": "2026-02-28" }, { "model_id": "LiquidAI/LFM2.5-350M", "short_name": "LFM2.5-350M", "provider": "LiquidAI", "score": 20.01, "date": "2026-03-31" } ] }, "swePro": { "name": "SWE-bench Pro", "dataset": "ScaleAI/SWE-bench_Pro", "lower_is_better": false, "models": [ { "model_id": "moonshotai/Kimi-K2.6", "short_name": "Kimi-K2.6", "provider": "moonshotai", "score": 58.6, "date": "2026-04-14" }, { "model_id": "zai-org/GLM-5.1", "short_name": "GLM-5.1", "provider": "zai-org", "score": 58.4, "date": "2026-04-03" }, { "model_id": "XiaomiMiMo/MiMo-V2.5-Pro", "short_name": "MiMo-V2.5-Pro", "provider": "XiaomiMiMo", "score": 57.2, "date": "2026-04-27" }, { "model_id": "MiniMaxAI/MiniMax-M2.7", "short_name": "MiniMax-M2.7", "provider": "MiniMaxAI", "score": 56.2, "date": "2026-04-09" }, { "model_id": "XiaomiMiMo/MiMo-V2.5", "short_name": "MiMo-V2.5", "provider": "XiaomiMiMo", "score": 56.1, "date": "2026-04-27" }, { "model_id": "MiniMaxAI/MiniMax-M2.5", "short_name": "MiniMax-M2.5", "provider": "MiniMaxAI", "score": 55.4, "date": "2026-02-12" }, { "model_id": "deepseek-ai/DeepSeek-V4-Pro", "short_name": "DeepSeek-V4-Pro", "provider": "deepseek-ai", "score": 55.4, "date": "2026-04-22" }, { "model_id": "OrionLLM/GRM-2.6-Plus", "short_name": "GRM-2.6-Plus", "provider": "OrionLLM", "score": 54.0, "date": "2026-04-23" }, { "model_id": "Qwen/Qwen3.6-27B", "short_name": "Qwen3.6-27B", "provider": "Qwen", "score": 53.5, "date": "2026-04-21" }, { "model_id": "moonshotai/Kimi-K2.5", "short_name": "Kimi-K2.5", "provider": "moonshotai", "score": 50.7, "date": "2026-01-01" }, { "model_id": "Qwen/Qwen3.6-35B-A3B", "short_name": "Qwen3.6-35B-A3B", "provider": "Qwen", "score": 49.5, "date": "2026-04-15" }, { "model_id": "poolside/Laguna-XS.2", "short_name": "Laguna-XS.2", "provider": "poolside", "score": 44.5, "date": "2026-04-23" }, { "model_id": "Qwen/Qwen3-Coder-Next", "short_name": "Qwen3-Coder-Next", "provider": "Qwen", "score": 44.3, "date": "2026-01-30" }, { "model_id": "Qwen/Qwen3-Coder-480B-A35B-Instruct", "short_name": "Qwen3-Coder-480B-A35B-Instruct", "provider": "Qwen", "score": 38.7, "date": "2025-07-22" }, { "model_id": "MiniMaxAI/MiniMax-M2.1", "short_name": "MiniMax-M2.1", "provider": "MiniMaxAI", "score": 36.81, "date": "2025-12-20" }, { "model_id": "moonshotai/Kimi-K2-Instruct", "short_name": "Kimi-K2-Instruct", "provider": "moonshotai", "score": 27.67, "date": "2025-07-11" }, { "model_id": "Qwen/Qwen3-235B-A22B", "short_name": "Qwen3-235B-A22B", "provider": "Qwen", "score": 21.41, "date": "2025-04-27" }, { "model_id": "openai/gpt-oss-120b", "short_name": "gpt-oss-120b", "provider": "openai", "score": 16.2, "date": "2025-08-04" }, { "model_id": "deepseek-ai/DeepSeek-V3.2", "short_name": "DeepSeek-V3.2", "provider": "deepseek-ai", "score": 15.56, "date": "2025-12-01" }, { "model_id": "google/gemma-3-27b-it", "short_name": "gemma-3-27b-it", "provider": "google", "score": 11.38, "date": "2025-03-01" }, { "model_id": "meta-llama/Llama-3.1-405B-Instruct", "short_name": "Llama-3.1-405B-Instruct", "provider": "meta-llama", "score": 11.18, "date": "2024-07-16" }, { "model_id": "zai-org/GLM-4.6", "short_name": "GLM-4.6", "provider": "zai-org", "score": 9.67, "date": "2025-09-29" }, { "model_id": "meta-llama/Llama-4-Maverick-17B-128E-Instruct", "short_name": "Llama-4-Maverick-17B-128E-Instruct", "provider": "meta-llama", "score": 5.24, "date": "2025-04-01" } ] }, "MMMU_Pro": { "name": "MMMU_Pro", "dataset": "MMMU/MMMU_Pro", "lower_is_better": false, "models": [ { "model_id": "moonshotai/Kimi-K2.6", "short_name": "Kimi-K2.6", "provider": "moonshotai", "score": 79.4, "date": "2026-04-14" }, { "model_id": "moonshotai/Kimi-K2.5", "short_name": "Kimi-K2.5", "provider": "moonshotai", "score": 78.5, "date": "2026-01-01" }, { "model_id": "google/gemma-4-31B-it", "short_name": "gemma-4-31B-it", "provider": "google", "score": 76.9, "date": "2026-03-11" }, { "model_id": "internlm/Intern-S2-Preview", "short_name": "Intern-S2-Preview", "provider": "internlm", "score": 76.88, "date": "2026-05-15" }, { "model_id": "Qwen/Qwen3.6-27B", "short_name": "Qwen3.6-27B", "provider": "Qwen", "score": 75.8, "date": "2026-04-21" }, { "model_id": "Qwen/Qwen3.6-35B-A3B", "short_name": "Qwen3.6-35B-A3B", "provider": "Qwen", "score": 75.3, "date": "2026-04-15" }, { "model_id": "Qwen/Qwen3.5-35B-A3B", "short_name": "Qwen3.5-35B-A3B", "provider": "Qwen", "score": 75.1, "date": "2026-02-24" }, { "model_id": "Qwen/Qwen3.5-27B", "short_name": "Qwen3.5-27B", "provider": "Qwen", "score": 75.0, "date": "2026-02-24" }, { "model_id": "google/gemma-4-26B-A4B-it", "short_name": "gemma-4-26B-A4B-it", "provider": "google", "score": 73.8, "date": "2026-03-11" }, { "model_id": "Qwen/Qwen3.5-9B", "short_name": "Qwen3.5-9B", "provider": "Qwen", "score": 70.1, "date": "2026-02-27" }, { "model_id": "Qwen/Qwen3.5-4B", "short_name": "Qwen3.5-4B", "provider": "Qwen", "score": 66.3, "date": "2026-02-27" }, { "model_id": "AIDC-AI/Ovis2.6-80B-A3B", "short_name": "Ovis2.6-80B-A3B", "provider": "AIDC-AI", "score": 66.3, "date": "2026-05-11" }, { "model_id": "google/gemma-4-E4B-it", "short_name": "gemma-4-E4B-it", "provider": "google", "score": 52.6, "date": "2026-03-02" }, { "model_id": "Qwen/Qwen3.5-2B", "short_name": "Qwen3.5-2B", "provider": "Qwen", "score": 50.3, "date": "2026-02-28" }, { "model_id": "google/gemma-4-E2B-it", "short_name": "gemma-4-E2B-it", "provider": "google", "score": 44.2, "date": "2026-03-02" }, { "model_id": "Qwen/Qwen2.5-VL-7B-Instruct", "short_name": "Qwen2.5-VL-7B-Instruct", "provider": "Qwen", "score": 34.3, "date": "2025-01-26" }, { "model_id": "Qwen/Qwen2.5-VL-3B-Instruct", "short_name": "Qwen2.5-VL-3B-Instruct", "provider": "Qwen", "score": 32.7, "date": "2025-01-26" }, { "model_id": "Qwen/Qwen3.5-0.8B", "short_name": "Qwen3.5-0.8B", "provider": "Qwen", "score": 31.2, "date": "2026-02-28" } ] }, "ScreenSpot_Pro": { "name": "SS-Pro", "dataset": "likaixin/ScreenSpot-Pro", "lower_is_better": false, "models": [ { "model_id": "Hcompany/Holo2-235B-A22B", "short_name": "Holo2-235B-A22B", "provider": "Hcompany", "score": 78.5, "date": "2026-01-28" }, { "model_id": "Hcompany/Holo2-30B-A3B", "short_name": "Holo2-30B-A3B", "provider": "Hcompany", "score": 75.2, "date": "2025-11-10" }, { "model_id": "Hcompany/Holo2-8B", "short_name": "Holo2-8B", "provider": "Hcompany", "score": 71.4, "date": "2025-11-10" }, { "model_id": "Qwen/Qwen3.5-122B-A10B", "short_name": "Qwen3.5-122B-A10B", "provider": "Qwen", "score": 70.4, "date": "2026-02-24" }, { "model_id": "Qwen/Qwen3.5-27B", "short_name": "Qwen3.5-27B", "provider": "Qwen", "score": 70.3, "date": "2026-02-24" }, { "model_id": "inclusionAI/UI-Venus-1.5-30B-A3B", "short_name": "UI-Venus-1.5-30B-A3B", "provider": "inclusionAI", "score": 69.6, "date": "2026-02-09" }, { "model_id": "Hcompany/Holo2-4B", "short_name": "Holo2-4B", "provider": "Hcompany", "score": 68.6, "date": "2025-11-10" }, { "model_id": "Qwen/Qwen3.5-35B-A3B", "short_name": "Qwen3.5-35B-A3B", "provider": "Qwen", "score": 68.6, "date": "2026-02-24" }, { "model_id": "inclusionAI/UI-Venus-1.5-8B", "short_name": "UI-Venus-1.5-8B", "provider": "inclusionAI", "score": 68.4, "date": "2026-02-09" }, { "model_id": "Qwen/Qwen3.5-397B-A17B", "short_name": "Qwen3.5-397B-A17B", "provider": "Qwen", "score": 65.6, "date": "2026-02-16" }, { "model_id": "Qwen/Qwen3.5-9B", "short_name": "Qwen3.5-9B", "provider": "Qwen", "score": 65.2, "date": "2026-02-27" }, { "model_id": "Salesforce/GTA1-32B", "short_name": "GTA1-32B", "provider": "Salesforce", "score": 63.6, "date": "2025-09-25" }, { "model_id": "Hcompany/Holo1.5-72B", "short_name": "Holo1.5-72B", "provider": "Hcompany", "score": 63.3, "date": "2025-09-11" }, { "model_id": "inclusionAI/UI-Venus-Ground-72B", "short_name": "UI-Venus-Ground-72B", "provider": "inclusionAI", "score": 61.9, "date": "2025-08-16" }, { "model_id": "Qwen/Qwen3.5-4B", "short_name": "Qwen3.5-4B", "provider": "Qwen", "score": 60.3, "date": "2026-02-27" }, { "model_id": "Hcompany/Holo1.5-7B", "short_name": "Holo1.5-7B", "provider": "Hcompany", "score": 57.9, "date": "2025-09-11" }, { "model_id": "inclusionAI/UI-Venus-1.5-2B", "short_name": "UI-Venus-1.5-2B", "provider": "inclusionAI", "score": 57.7, "date": "2026-02-09" }, { "model_id": "Qwen/Qwen3.5-2B", "short_name": "Qwen3.5-2B", "provider": "Qwen", "score": 54.5, "date": "2026-02-28" }, { "model_id": "Qwen/Qwen2.5-VL-72B-Instruct", "short_name": "Qwen2.5-VL-72B-Instruct", "provider": "Qwen", "score": 53.3, "date": "2025-01-27" }, { "model_id": "moonshotai/Kimi-VL-A3B-Thinking-2506", "short_name": "Kimi-VL-A3B-Thinking-2506", "provider": "moonshotai", "score": 51.0, "date": "2025-06-21" }, { "model_id": "inclusionAI/UI-Venus-Ground-7B", "short_name": "UI-Venus-Ground-7B", "provider": "inclusionAI", "score": 50.8, "date": "2025-08-15" }, { "model_id": "Qwen/Qwen2.5-VL-32B-Instruct", "short_name": "Qwen2.5-VL-32B-Instruct", "provider": "Qwen", "score": 48.0, "date": "2025-03-21" }, { "model_id": "Qwen/Qwen3.5-0.8B", "short_name": "Qwen3.5-0.8B", "provider": "Qwen", "score": 46.5, "date": "2026-02-28" }, { "model_id": "KDEGroup/UI-AGILE-3B", "short_name": "UI-AGILE-3B", "provider": "KDEGroup", "score": 45.0, "date": "2025-08-07" }, { "model_id": "microsoft/GUI-Actor-7B-Qwen2.5-VL", "short_name": "GUI-Actor-7B-Qwen2.5-VL", "provider": "microsoft", "score": 44.6, "date": "2025-06-01" }, { "model_id": "microsoft/GUI-Actor-3B-Qwen2.5-VL", "short_name": "GUI-Actor-3B-Qwen2.5-VL", "provider": "microsoft", "score": 42.2, "date": "2025-06-01" }, { "model_id": "ByteDance-Seed/UI-TARS-72B-SFT", "short_name": "UI-TARS-72B-SFT", "provider": "ByteDance-Seed", "score": 38.1, "date": "2025-01-20" }, { "model_id": "microsoft/GUI-Actor-2B-Qwen2-VL", "short_name": "GUI-Actor-2B-Qwen2-VL", "provider": "microsoft", "score": 36.7, "date": "2025-06-01" }, { "model_id": "ByteDance-Seed/UI-TARS-7B-SFT", "short_name": "UI-TARS-7B-SFT", "provider": "ByteDance-Seed", "score": 35.7, "date": "2025-01-20" }, { "model_id": "ByteDance-Seed/UI-TARS-2B-SFT", "short_name": "UI-TARS-2B-SFT", "provider": "ByteDance-Seed", "score": 27.7, "date": "2025-01-20" }, { "model_id": "Qwen/Qwen2.5-VL-7B-Instruct", "short_name": "Qwen2.5-VL-7B-Instruct", "provider": "Qwen", "score": 26.8, "date": "2025-01-26" }, { "model_id": "xlangai/Aguvis-7B-720P", "short_name": "Aguvis-7B-720P", "provider": "xlangai", "score": 22.9, "date": "2025-01-07" }, { "model_id": "OS-Copilot/OS-Atlas-Pro-7B", "short_name": "OS-Atlas-Pro-7B", "provider": "OS-Copilot", "score": 18.9, "date": "2024-11-15" }, { "model_id": "osunlp/UGround-V1-7B", "short_name": "UGround-V1-7B", "provider": "osunlp", "score": 16.5, "date": "2025-01-03" }, { "model_id": "showlab/ShowUI-2B", "short_name": "ShowUI-2B", "provider": "showlab", "score": 7.7, "date": "2024-11-16" }, { "model_id": "OS-Copilot/OS-Atlas-Pro-4B", "short_name": "OS-Atlas-Pro-4B", "provider": "OS-Copilot", "score": 3.7, "date": "2024-11-15" }, { "model_id": "openbmb/MiniCPM-V-2", "short_name": "MiniCPM-V-2", "provider": "openbmb", "score": 3.0, "date": "2024-04-09" }, { "model_id": "Qwen/Qwen2-VL-7B-Instruct", "short_name": "Qwen2-VL-7B-Instruct", "provider": "Qwen", "score": 1.6, "date": "2024-08-28" }, { "model_id": "Qwen/Qwen2-VL-72B-Instruct", "short_name": "Qwen2-VL-72B-Instruct", "provider": "Qwen", "score": 1.0, "date": "2024-09-17" }, { "model_id": "Qwen/Qwen-VL", "short_name": "Qwen-VL", "provider": "Qwen", "score": 0.1, "date": "2023-08-18" } ] }, "APEX_v1_extended": { "name": "APEX-v1-extended", "dataset": "mercor/APEX-v1-extended", "lower_is_better": false, "models": [ { "model_id": "zai-org/GLM-4.7", "short_name": "GLM-4.7", "provider": "zai-org", "score": 51.7, "date": "2025-12-22" }, { "model_id": "zai-org/GLM-5", "short_name": "GLM-5", "provider": "zai-org", "score": 49.0, "date": "2026-02-11" } ] }, "terminalBench": { "name": "Terminal-Bench 2.0", "dataset": "harborframework/terminal-bench-2.0", "lower_is_better": false, "models": [ { "model_id": "zai-org/GLM-5.1", "short_name": "GLM-5.1", "provider": "zai-org", "score": 69.0, "date": "2026-04-03" }, { "model_id": "XiaomiMiMo/MiMo-V2.5-Pro", "short_name": "MiMo-V2.5-Pro", "provider": "XiaomiMiMo", "score": 68.4, "date": "2026-04-27" }, { "model_id": "deepseek-ai/DeepSeek-V4-Pro", "short_name": "DeepSeek-V4-Pro", "provider": "deepseek-ai", "score": 67.9, "date": "2026-04-22" }, { "model_id": "moonshotai/Kimi-K2.6", "short_name": "Kimi-K2.6", "provider": "moonshotai", "score": 66.7, "date": "2026-04-14" }, { "model_id": "XiaomiMiMo/MiMo-V2.5", "short_name": "MiMo-V2.5", "provider": "XiaomiMiMo", "score": 65.8, "date": "2026-04-27" }, { "model_id": "OrionLLM/GRM-2.6-Plus", "short_name": "GRM-2.6-Plus", "provider": "OrionLLM", "score": 59.8, "date": "2026-04-23" }, { "model_id": "Qwen/Qwen3.6-27B", "short_name": "Qwen3.6-27B", "provider": "Qwen", "score": 59.3, "date": "2026-04-21" }, { "model_id": "MiniMaxAI/MiniMax-M2.7", "short_name": "MiniMax-M2.7", "provider": "MiniMaxAI", "score": 57.0, "date": "2026-04-09" }, { "model_id": "deepseek-ai/DeepSeek-V4-Flash", "short_name": "DeepSeek-V4-Flash", "provider": "deepseek-ai", "score": 56.9, "date": "2026-04-22" }, { "model_id": "tencent/Hy3-preview", "short_name": "Hy3-preview", "provider": "tencent", "score": 54.4, "date": "2026-04-13" }, { "model_id": "Qwen/Qwen3.5-397B-A17B", "short_name": "Qwen3.5-397B-A17B", "provider": "Qwen", "score": 52.5, "date": "2026-02-16" }, { "model_id": "zai-org/GLM-5", "short_name": "GLM-5", "provider": "zai-org", "score": 52.4, "date": "2026-02-11" }, { "model_id": "Qwen/Qwen3.6-35B-A3B", "short_name": "Qwen3.6-35B-A3B", "provider": "Qwen", "score": 51.5, "date": "2026-04-15" }, { "model_id": "stepfun-ai/Step-3.5-Flash", "short_name": "Step-3.5-Flash", "provider": "stepfun-ai", "score": 51.0, "date": "2026-02-01" }, { "model_id": "Qwen/Qwen3.5-122B-A10B", "short_name": "Qwen3.5-122B-A10B", "provider": "Qwen", "score": 49.4, "date": "2026-02-24" }, { "model_id": "moonshotai/Kimi-K2.5", "short_name": "Kimi-K2.5", "provider": "moonshotai", "score": 43.2, "date": "2026-01-01" }, { "model_id": "Qwen/Qwen3.5-27B", "short_name": "Qwen3.5-27B", "provider": "Qwen", "score": 41.6, "date": "2026-02-24" }, { "model_id": "Qwen/Qwen3.5-35B-A3B", "short_name": "Qwen3.5-35B-A3B", "provider": "Qwen", "score": 40.5, "date": "2026-02-24" }, { "model_id": "deepseek-ai/DeepSeek-V3.2", "short_name": "DeepSeek-V3.2", "provider": "deepseek-ai", "score": 39.6, "date": "2025-12-01" }, { "model_id": "Qwen/Qwen3-Coder-Next", "short_name": "Qwen3-Coder-Next", "provider": "Qwen", "score": 36.2, "date": "2026-01-30" }, { "model_id": "moonshotai/Kimi-K2-Thinking", "short_name": "Kimi-K2-Thinking", "provider": "moonshotai", "score": 35.7, "date": "2025-11-04" }, { "model_id": "zai-org/GLM-4.7", "short_name": "GLM-4.7", "provider": "zai-org", "score": 33.4, "date": "2025-12-22" }, { "model_id": "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "short_name": "NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "provider": "nvidia", "score": 31.0, "date": "2026-03-10" }, { "model_id": "RedHatAI/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "short_name": "NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "provider": "RedHatAI", "score": 31.0, "date": "2026-03-26" }, { "model_id": "poolside/Laguna-XS.2", "short_name": "Laguna-XS.2", "provider": "poolside", "score": 30.1, "date": "2026-04-23" }, { "model_id": "MiniMaxAI/MiniMax-M2", "short_name": "MiniMax-M2", "provider": "MiniMaxAI", "score": 30.0, "date": "2025-10-22" }, { "model_id": "MiniMaxAI/MiniMax-M2.1", "short_name": "MiniMax-M2.1", "provider": "MiniMaxAI", "score": 29.2, "date": "2025-12-20" }, { "model_id": "moonshotai/Kimi-K2-Instruct", "short_name": "Kimi-K2-Instruct", "provider": "moonshotai", "score": 27.8, "date": "2025-07-11" }, { "model_id": "nvidia/Nemotron-Terminal-32B", "short_name": "Nemotron-Terminal-32B", "provider": "nvidia", "score": 27.4, "date": "2026-02-17" }, { "model_id": "zai-org/GLM-4.6", "short_name": "GLM-4.6", "provider": "zai-org", "score": 24.5, "date": "2025-09-29" }, { "model_id": "Qwen/Qwen3-Coder-480B-A35B-Instruct", "short_name": "Qwen3-Coder-480B-A35B-Instruct", "provider": "Qwen", "score": 23.9, "date": "2025-07-22" }, { "model_id": "nvidia/Nemotron-Terminal-14B", "short_name": "Nemotron-Terminal-14B", "provider": "nvidia", "score": 20.2, "date": "2026-02-17" }, { "model_id": "nvidia/Nemotron-Terminal-8B", "short_name": "Nemotron-Terminal-8B", "provider": "nvidia", "score": 13.0, "date": "2026-02-17" }, { "model_id": "OrionLLM/Terminus-Qwen3-8b", "short_name": "Terminus-Qwen3-8b", "provider": "OrionLLM", "score": 4.9, "date": "2026-03-06" } ] }, "hmmt2026": { "name": "HMMT Feb 2026", "dataset": "MathArena/hmmt_feb_2026", "lower_is_better": false, "models": [ { "model_id": "moonshotai/Kimi-K2.6", "short_name": "Kimi-K2.6", "provider": "moonshotai", "score": 92.7, "date": "2026-04-14" }, { "model_id": "Qwen/Qwen3.5-397B-A17B", "short_name": "Qwen3.5-397B-A17B", "provider": "Qwen", "score": 87.88, "date": "2026-02-16" }, { "model_id": "internlm/Intern-S2-Preview", "short_name": "Intern-S2-Preview", "provider": "internlm", "score": 87.31, "date": "2026-05-15" }, { "model_id": "moonshotai/Kimi-K2.5", "short_name": "Kimi-K2.5", "provider": "moonshotai", "score": 87.12, "date": "2026-01-01" }, { "model_id": "stepfun-ai/Step-3.5-Flash", "short_name": "Step-3.5-Flash", "provider": "stepfun-ai", "score": 86.36, "date": "2026-02-01" }, { "model_id": "zai-org/GLM-5", "short_name": "GLM-5", "provider": "zai-org", "score": 86.36, "date": "2026-02-11" }, { "model_id": "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "short_name": "NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "provider": "nvidia", "score": 84.85, "date": "2026-03-10" }, { "model_id": "OrionLLM/GRM-2.6-Plus", "short_name": "GRM-2.6-Plus", "provider": "OrionLLM", "score": 84.8, "date": "2026-04-23" }, { "model_id": "Qwen/Qwen3.6-27B", "short_name": "Qwen3.6-27B", "provider": "Qwen", "score": 84.3, "date": "2026-04-21" }, { "model_id": "deepseek-ai/DeepSeek-V3.2", "short_name": "DeepSeek-V3.2", "provider": "deepseek-ai", "score": 84.09, "date": "2025-12-01" }, { "model_id": "Qwen/Qwen3.6-35B-A3B", "short_name": "Qwen3.6-35B-A3B", "provider": "Qwen", "score": 83.6, "date": "2026-04-15" }, { "model_id": "zai-org/GLM-5.1", "short_name": "GLM-5.1", "provider": "zai-org", "score": 82.6, "date": "2026-04-03" }, { "model_id": "Qwen/Qwen3.5-35B-A3B", "short_name": "Qwen3.5-35B-A3B", "provider": "Qwen", "score": 81.82, "date": "2026-02-24" }, { "model_id": "Qwen/Qwen3.5-27B", "short_name": "Qwen3.5-27B", "provider": "Qwen", "score": 81.06, "date": "2026-02-24" }, { "model_id": "Qwen/Qwen3-30B-A3B-Thinking-2507", "short_name": "Qwen3-30B-A3B-Thinking-2507", "provider": "Qwen", "score": 78.79, "date": "2025-07-29" }, { "model_id": "Qwen/Qwen3.5-9B", "short_name": "Qwen3.5-9B", "provider": "Qwen", "score": 71.21, "date": "2026-02-27" }, { "model_id": "lm-provers/QED-Nano", "short_name": "QED-Nano", "provider": "lm-provers", "score": 62.88, "date": "2026-02-12" }, { "model_id": "Qwen/Qwen3-4B-Thinking-2507", "short_name": "Qwen3-4B-Thinking-2507", "provider": "Qwen", "score": 53.03, "date": "2025-08-05" }, { "model_id": "inclusionAI/Ling-2.6-flash", "short_name": "Ling-2.6-flash", "provider": "inclusionAI", "score": 49.29, "date": "2026-04-28" } ] }, "Video_MME_v2": { "name": "Video-MME-v2", "dataset": "MME-Benchmarks/Video-MME-v2", "lower_is_better": false, "models": [ { "model_id": "moonshotai/Kimi-K2.5", "short_name": "Kimi-K2.5", "provider": "moonshotai", "score": 61.1, "date": "2026-01-01" }, { "model_id": "Qwen/Qwen3.5-397B-A17B", "short_name": "Qwen3.5-397B-A17B", "provider": "Qwen", "score": 55.9, "date": "2026-02-16" } ] }, "arguana": { "name": "arguana", "dataset": "mteb/arguana", "lower_is_better": false, "models": [ { "model_id": "google/embeddinggemma-300m", "short_name": "embeddinggemma-300m", "provider": "google", "score": 71.53, "date": "2025-07-17" }, { "model_id": "GritLM/GritLM-7B", "short_name": "GritLM-7B", "provider": "GritLM", "score": 63.17, "date": "2024-02-11" }, { "model_id": "Snowflake/snowflake-arctic-embed-l-v2.0", "short_name": "snowflake-arctic-embed-l-v2.0", "provider": "Snowflake", "score": 59.11, "date": "2024-11-08" }, { "model_id": "ibm-granite/granite-embedding-125m-english", "short_name": "granite-embedding-125m-english", "provider": "ibm-granite", "score": 58.4, "date": "2024-12-04" }, { "model_id": "Snowflake/snowflake-arctic-embed-m-v2.0", "short_name": "snowflake-arctic-embed-m-v2.0", "provider": "Snowflake", "score": 57.88, "date": "2024-11-08" }, { "model_id": "HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v2", "short_name": "KaLM-embedding-multilingual-mini-instruct-v2", "provider": "HIT-TMG", "score": 57.42, "date": "2025-06-24" }, { "model_id": "BAAI/bge-m3", "short_name": "bge-m3", "provider": "BAAI", "score": 54.04, "date": "2024-01-27" }, { "model_id": "nomic-ai/nomic-embed-text-v1.5", "short_name": "nomic-embed-text-v1.5", "provider": "nomic-ai", "score": 52.02, "date": "2024-02-10" }, { "model_id": "sentence-transformers/all-MiniLM-L6-v2", "short_name": "all-MiniLM-L6-v2", "provider": "sentence-transformers", "score": 50.17, "date": "2022-03-02" }, { "model_id": "mteb/baseline-bm25s", "short_name": "baseline-bm25s", "provider": "mteb", "score": 49.28, "date": "2026-02-19" }, { "model_id": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2", "short_name": "paraphrase-multilingual-mpnet-base-v2", "provider": "sentence-transformers", "score": 48.91, "date": "2022-03-02" }, { "model_id": "jinaai/jina-embeddings-v3", "short_name": "jina-embeddings-v3", "provider": "jinaai", "score": 43.29, "date": "2024-09-05" }, { "model_id": "sentence-transformers/LaBSE", "short_name": "LaBSE", "provider": "sentence-transformers", "score": 34.18, "date": "2022-03-02" } ] }, "vlabench_primitive_ft_lerobot_video": { "name": "vlabench_primitive_ft_lerobot_video", "dataset": "VLABench/vlabench_primitive_ft_lerobot_video", "lower_is_better": false, "models": [ { "model_id": "lerobot/pi0_base", "short_name": "pi0_base", "provider": "lerobot", "score": 44.5, "date": "2025-09-09" }, { "model_id": "lerobot/pi05_base", "short_name": "pi05_base", "provider": "lerobot", "score": 42.0, "date": "2025-09-09" }, { "model_id": "nvidia/GR00T-N1-2B", "short_name": "GR00T-N1-2B", "provider": "nvidia", "score": 39.7, "date": "2025-03-05" }, { "model_id": "lerobot/pi0fast-base", "short_name": "pi0fast-base", "provider": "lerobot", "score": 34.1, "date": "2026-01-09" } ] }, "evasionBench": { "name": "EvasionBench", "dataset": "FutureMa/EvasionBench", "lower_is_better": false, "models": [ { "model_id": "zai-org/GLM-4.7", "short_name": "GLM-4.7", "provider": "zai-org", "score": 82.91, "date": "2025-12-22" }, { "model_id": "Qwen/Qwen3-Coder-480B-A35B-Instruct", "short_name": "Qwen3-Coder-480B-A35B-Instruct", "provider": "Qwen", "score": 78.16, "date": "2025-07-22" }, { "model_id": "MiniMaxAI/MiniMax-M2.1", "short_name": "MiniMax-M2.1", "provider": "MiniMaxAI", "score": 71.31, "date": "2025-12-20" }, { "model_id": "deepseek-ai/DeepSeek-V3.2", "short_name": "DeepSeek-V3.2", "provider": "deepseek-ai", "score": 66.88, "date": "2025-12-01" }, { "model_id": "moonshotai/Kimi-K2-Instruct-0905", "short_name": "Kimi-K2-Instruct-0905", "provider": "moonshotai", "score": 66.68, "date": "2025-09-03" } ] }, "aime2026": { "name": "AIME 2026", "dataset": "MathArena/aime_2026", "lower_is_better": false, "models": [ { "model_id": "stepfun-ai/Step-3.5-Flash", "short_name": "Step-3.5-Flash", "provider": "stepfun-ai", "score": 96.67, "date": "2026-02-01" }, { "model_id": "moonshotai/Kimi-K2.6", "short_name": "Kimi-K2.6", "provider": "moonshotai", "score": 96.4, "date": "2026-04-14" }, { "model_id": "moonshotai/Kimi-K2.5", "short_name": "Kimi-K2.5", "provider": "moonshotai", "score": 95.83, "date": "2026-01-01" }, { "model_id": "zai-org/GLM-5", "short_name": "GLM-5", "provider": "zai-org", "score": 95.83, "date": "2026-02-11" }, { "model_id": "inclusionAI/Ring-2.6-1T", "short_name": "Ring-2.6-1T", "provider": "inclusionAI", "score": 95.83, "date": "2026-05-14" }, { "model_id": "zai-org/GLM-5.1", "short_name": "GLM-5.1", "provider": "zai-org", "score": 95.3, "date": "2026-04-03" }, { "model_id": "OrionLLM/GRM-2.6-Plus", "short_name": "GRM-2.6-Plus", "provider": "OrionLLM", "score": 95.1, "date": "2026-04-23" }, { "model_id": "deepseek-ai/DeepSeek-V3.2", "short_name": "DeepSeek-V3.2", "provider": "deepseek-ai", "score": 94.17, "date": "2025-12-01" }, { "model_id": "Qwen/Qwen3.6-27B", "short_name": "Qwen3.6-27B", "provider": "Qwen", "score": 94.1, "date": "2026-04-21" }, { "model_id": "Qwen/Qwen3.5-397B-A17B", "short_name": "Qwen3.5-397B-A17B", "provider": "Qwen", "score": 93.33, "date": "2026-02-16" }, { "model_id": "Qwen/Qwen3.5-35B-A3B", "short_name": "Qwen3.5-35B-A3B", "provider": "Qwen", "score": 93.33, "date": "2026-02-24" }, { "model_id": "Qwen/Qwen3.6-35B-A3B", "short_name": "Qwen3.6-35B-A3B", "provider": "Qwen", "score": 92.7, "date": "2026-04-15" }, { "model_id": "LGAI-EXAONE/EXAONE-4.5-33B", "short_name": "EXAONE-4.5-33B", "provider": "LGAI-EXAONE", "score": 92.6, "date": "2026-04-04" }, { "model_id": "Qwen/Qwen3.5-9B", "short_name": "Qwen3.5-9B", "provider": "Qwen", "score": 92.5, "date": "2026-02-27" }, { "model_id": "Qwen/Qwen3.5-27B", "short_name": "Qwen3.5-27B", "provider": "Qwen", "score": 90.83, "date": "2026-02-24" }, { "model_id": "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "short_name": "NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "provider": "nvidia", "score": 90.0, "date": "2026-03-10" }, { "model_id": "google/gemma-4-31B-it", "short_name": "gemma-4-31B-it", "provider": "google", "score": 89.2, "date": "2026-03-11" }, { "model_id": "google/gemma-4-26B-A4B-it", "short_name": "gemma-4-26B-A4B-it", "provider": "google", "score": 88.3, "date": "2026-03-11" }, { "model_id": "Qwen/Qwen3-30B-A3B-Thinking-2507", "short_name": "Qwen3-30B-A3B-Thinking-2507", "provider": "Qwen", "score": 87.5, "date": "2025-07-29" }, { "model_id": "Qwen/Qwen3-4B-Thinking-2507", "short_name": "Qwen3-4B-Thinking-2507", "provider": "Qwen", "score": 82.5, "date": "2025-08-05" }, { "model_id": "lm-provers/QED-Nano", "short_name": "QED-Nano", "provider": "lm-provers", "score": 82.5, "date": "2026-02-12" }, { "model_id": "inclusionAI/Ling-2.6-flash", "short_name": "Ling-2.6-flash", "provider": "inclusionAI", "score": 73.85, "date": "2026-04-28" }, { "model_id": "google/gemma-4-E4B-it", "short_name": "gemma-4-E4B-it", "provider": "google", "score": 42.5, "date": "2026-03-02" }, { "model_id": "google/gemma-4-E2B-it", "short_name": "gemma-4-E2B-it", "provider": "google", "score": 37.5, "date": "2026-03-02" } ] }, "yc_bench": { "name": "YC-Bench", "dataset": "collinear-ai/yc-bench", "lower_is_better": false, "models": [ { "model_id": "zai-org/GLM-5.1", "short_name": "GLM-5.1", "provider": "zai-org", "score": 1510772.0, "date": "2026-04-03" }, { "model_id": "zai-org/GLM-5", "short_name": "GLM-5", "provider": "zai-org", "score": 1208190.0, "date": "2026-02-11" }, { "model_id": "deepseek-ai/DeepSeek-V4-Pro", "short_name": "DeepSeek-V4-Pro", "provider": "deepseek-ai", "score": 1066426.0, "date": "2026-04-22" }, { "model_id": "moonshotai/Kimi-K2.6", "short_name": "Kimi-K2.6", "provider": "moonshotai", "score": 511137.0, "date": "2026-04-14" }, { "model_id": "moonshotai/Kimi-K2.5", "short_name": "Kimi-K2.5", "provider": "moonshotai", "score": 408822.0, "date": "2026-01-01" }, { "model_id": "zai-org/GLM-4.7", "short_name": "GLM-4.7", "provider": "zai-org", "score": 398410.0, "date": "2025-12-22" }, { "model_id": "MiniMaxAI/MiniMax-M2.5", "short_name": "MiniMax-M2.5", "provider": "MiniMaxAI", "score": 230465.0, "date": "2026-02-12" }, { "model_id": "deepseek-ai/DeepSeek-V3.2", "short_name": "DeepSeek-V3.2", "provider": "deepseek-ai", "score": 125263.0, "date": "2025-12-01" }, { "model_id": "Qwen/Qwen3.5-397B-A17B", "short_name": "Qwen3.5-397B-A17B", "provider": "Qwen", "score": 90787.0, "date": "2026-02-16" }, { "model_id": "arcee-ai/Trinity-Large-Thinking", "short_name": "Trinity-Large-Thinking", "provider": "arcee-ai", "score": 32667.0, "date": "2026-04-01" }, { "model_id": "Qwen/Qwen3.5-122B-A10B", "short_name": "Qwen3.5-122B-A10B", "provider": "Qwen", "score": 0.0, "date": "2026-02-24" } ] }, "Claw_Eval": { "name": "Claw-Eval", "dataset": "claw-eval/Claw-Eval", "lower_is_better": false, "models": [ { "model_id": "XiaomiMiMo/MiMo-V2.5-Pro", "short_name": "MiMo-V2.5-Pro", "provider": "XiaomiMiMo", "score": 64.0, "date": "2026-04-27" }, { "model_id": "inclusionAI/Ring-2.6-1T", "short_name": "Ring-2.6-1T", "provider": "inclusionAI", "score": 63.82, "date": "2026-05-14" }, { "model_id": "zai-org/GLM-5.1", "short_name": "GLM-5.1", "provider": "zai-org", "score": 62.7, "date": "2026-04-03" }, { "model_id": "XiaomiMiMo/MiMo-V2.5", "short_name": "MiMo-V2.5", "provider": "XiaomiMiMo", "score": 62.1, "date": "2026-04-27" }, { "model_id": "moonshotai/Kimi-K2.6", "short_name": "Kimi-K2.6", "provider": "moonshotai", "score": 61.5, "date": "2026-04-14" }, { "model_id": "deepseek-ai/DeepSeek-V4-Pro", "short_name": "DeepSeek-V4-Pro", "provider": "deepseek-ai", "score": 58.4, "date": "2026-04-22" }, { "model_id": "Qwen/Qwen3.5-397B-A17B", "short_name": "Qwen3.5-397B-A17B", "provider": "Qwen", "score": 57.8, "date": "2026-02-16" }, { "model_id": "deepseek-ai/DeepSeek-V4-Flash", "short_name": "DeepSeek-V4-Flash", "provider": "deepseek-ai", "score": 57.8, "date": "2026-04-22" }, { "model_id": "moonshotai/Kimi-K2.5", "short_name": "Kimi-K2.5", "provider": "moonshotai", "score": 52.8, "date": "2026-01-01" }, { "model_id": "MiniMaxAI/MiniMax-M2.7", "short_name": "MiniMax-M2.7", "provider": "MiniMaxAI", "score": 49.7, "date": "2026-04-09" }, { "model_id": "deepseek-ai/DeepSeek-V3.2", "short_name": "DeepSeek-V3.2", "provider": "deepseek-ai", "score": 42.2, "date": "2025-12-01" }, { "model_id": "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "short_name": "NVIDIA-Nemotron-3-Super-120B-A12B-BF16", "provider": "nvidia", "score": 6.8, "date": "2026-03-10" } ] }, "MDPBench": { "name": "MDPBench", "dataset": "Delores-Lin/MDPBench", "lower_is_better": false, "models": [ { "model_id": "rednote-hilab/dots.mocr", "short_name": "dots.mocr", "provider": "rednote-hilab", "score": 80.5, "date": "2026-03-19" }, { "model_id": "PaddlePaddle/PaddleOCR-VL-1.5", "short_name": "PaddleOCR-VL-1.5", "provider": "PaddlePaddle", "score": 78.3, "date": "2026-01-28" }, { "model_id": "rednote-hilab/dots.ocr", "short_name": "dots.ocr", "provider": "rednote-hilab", "score": 76.5, "date": "2025-07-30" }, { "model_id": "allenai/olmOCR-7B-0225-preview", "short_name": "olmOCR-7B-0225-preview", "provider": "allenai", "score": 70.4, "date": "2025-01-15" }, { "model_id": "PaddlePaddle/PaddleOCR-VL", "short_name": "PaddleOCR-VL", "provider": "PaddlePaddle", "score": 69.6, "date": "2025-10-16" }, { "model_id": "Qwen/Qwen3-VL-8B-Instruct", "short_name": "Qwen3-VL-8B-Instruct", "provider": "Qwen", "score": 68.3, "date": "2025-10-11" }, { "model_id": "zai-org/GLM-OCR", "short_name": "GLM-OCR", "provider": "zai-org", "score": 67.3, "date": "2026-01-30" }, { "model_id": "Qwen/Qwen3.5-9B", "short_name": "Qwen3.5-9B", "provider": "Qwen", "score": 65.7, "date": "2026-02-27" }, { "model_id": "nanonets/Nanonets-OCR2-3B", "short_name": "Nanonets-OCR2-3B", "provider": "nanonets", "score": 64.2, "date": "2025-10-13" }, { "model_id": "lightonai/LightOnOCR-2-1B", "short_name": "LightOnOCR-2-1B", "provider": "lightonai", "score": 63.9, "date": "2026-01-16" }, { "model_id": "nanonets/Nanonets-OCR-s", "short_name": "Nanonets-OCR-s", "provider": "nanonets", "score": 63.7, "date": "2025-06-10" }, { "model_id": "deepseek-ai/DeepSeek-OCR", "short_name": "DeepSeek-OCR", "provider": "deepseek-ai", "score": 51.8, "date": "2025-10-17" }, { "model_id": "opendatalab/MinerU2.5-2509-1.2B", "short_name": "MinerU2.5-2509-1.2B", "provider": "opendatalab", "score": 46.3, "date": "2025-09-17" }, { "model_id": "OpenGVLab/InternVL3_5-8B", "short_name": "InternVL3_5-8B", "provider": "OpenGVLab", "score": 42.7, "date": "2025-08-25" } ] } }, "logos": { "PolarSeeker": "https://www.gravatar.com/avatar/55d9cc59db4e30206a307c186cc6d5bb?d=retro&size=100", "tiiuae": "https://cdn-avatars.huggingface.co/v1/production/uploads/61a8d1aac664736898ffc84f/AT6cAB5ZNwCcqFMal71WD.jpeg", "FINAL-Bench": "https://cdn-avatars.huggingface.co/v1/production/uploads/6905bc786cb49b1f11d32728/VZmuKH-liifeL2GCXlwka.jpeg", "showlab": "https://cdn-avatars.huggingface.co/v1/production/uploads/1671779505215-63a55320ce5763e06f78519c.png", "meituan-longcat": "https://cdn-avatars.huggingface.co/v1/production/uploads/68a2a29ab9d4c5698e02c747/CDCAx7X7rXDt7xjI-DoxG.png", "KDEGroup": "https://cdn-avatars.huggingface.co/v1/production/uploads/688dc279b2d5ebe029e8aafe/GZis_Qxofgb67RkJsllJ6.png", "nvidia": "https://cdn-avatars.huggingface.co/v1/production/uploads/65df9200dc3292a8983e5017/Vs5FPVCH-VZBipV3qKTuy.png", "arcee-ai": "https://cdn-avatars.huggingface.co/v1/production/uploads/6435718aaaef013d1aec3b8b/GZPnGkfMn8Ino6JbkL4fJ.png", "opendatalab": "https://cdn-avatars.huggingface.co/v1/production/uploads/639c3afa7432f2f5d16b7296/yqxxBknyeqkGnYsjoaR4M.png", "microsoft": "https://cdn-avatars.huggingface.co/v1/production/uploads/1583646260758-5e64858c87403103f9f1055d.png", "Snowflake": "https://cdn-avatars.huggingface.co/v1/production/uploads/64dc52cf858f8a41c12fc819/O9-MWzRjWzbNP_DQlMb-7.png", "PaddlePaddle": "https://cdn-avatars.huggingface.co/v1/production/uploads/1654942635336-5f3ff69679c1ba4c353d0c5a.png", "XiaomiMiMo": "https://cdn-avatars.huggingface.co/v1/production/uploads/680cb7d1233834890a64acee/5w_4aLfF-7MAyaIPOV498.jpeg", "jdopensource": "https://cdn-avatars.huggingface.co/v1/production/uploads/68c0e2ab44ea28a974e3074b/g-4gTubd16qUtwmGZ0n4h.png", "Xerv-AI": "https://cdn-avatars.huggingface.co/v1/production/uploads/66a49c4207dd6d0ab485d3c2/6DScbGXvA8qqJ0KzpOkUW.png", "deepseek-ai": "https://cdn-avatars.huggingface.co/v1/production/uploads/6538815d1bdb3c40db94fbfa/xMBly9PUMphrFVMxLX4kq.png", "meta-llama": "https://cdn-avatars.huggingface.co/v1/production/uploads/646cf8084eefb026fb8fd8bc/oCTqufkdTkjyGodsx1vo1.png", "google": "https://cdn-avatars.huggingface.co/v1/production/uploads/5dd96eb166059660ed1ee413/WtA3YYitedOr9n02eHfJe.png", "kyutai": "https://cdn-avatars.huggingface.co/v1/production/uploads/6355a3c1805be5a8f30fea49/8xGdIOlfkopZfhbMitw_k.jpeg", "AIDC-AI": "https://cdn-avatars.huggingface.co/v1/production/uploads/666a9d46a638e57bb7907929/CRc-9MCuH2q9hjTScyTPE.png", "tencent": "https://cdn-avatars.huggingface.co/v1/production/uploads/5dd96eb166059660ed1ee413/Lp3m-XLpjQGwBItlvn69q.png", "mistralai": "https://cdn-avatars.huggingface.co/v1/production/uploads/634c17653d11eaedd88b314d/9OgyfKstSZtbmsmuG8MbU.png", "ByteDance-Seed": "https://cdn-avatars.huggingface.co/v1/production/uploads/6535c9e88bde2fae19b6fb25/flkDUqd_YEuFsjeNET3r-.png", "BAAI": "https://cdn-avatars.huggingface.co/v1/production/uploads/1664511063789-632c234f42c386ebd2710434.png", "lerobot": "https://cdn-avatars.huggingface.co/v1/production/uploads/631ce4b244503b72277fc89f/pcLUTLsvMQiR-ujlTgLYF.png", "Nanbeige": "https://cdn-avatars.huggingface.co/v1/production/uploads/646f0d118ff94af23bc44aab/GXHCollpMRgvYqUXQ2BQ7.png", "moonshotai": "https://cdn-avatars.huggingface.co/v1/production/uploads/641c1e77c3983aa9490f8121/X1yT2rsaIbR9cdYGEVu0X.jpeg", "OS-Copilot": "https://cdn-avatars.huggingface.co/v1/production/uploads/6280e830e99dccaac4bbfde5/SpDHebsUH88Eo03JCjvch.jpeg", "GAIR": "https://cdn-avatars.huggingface.co/v1/production/uploads/6144a0c4ff1146bbd84d9865/NqAuVddq2ci-AsFcFNbav.png", "Hcompany": "https://cdn-avatars.huggingface.co/v1/production/uploads/677d3f355f847864bb644112/OQyAJ33sssiTDIQEQ7oH_.png", "FireRedTeam": "https://cdn-avatars.huggingface.co/v1/production/uploads/66ec07ef12bd743cfe91004e/PK3bgl6aF2RzW1QFKkq8R.png", "Salesforce": "https://cdn-avatars.huggingface.co/v1/production/uploads/1602756670970-noauth.jpeg", "abr-ai": "https://cdn-avatars.huggingface.co/v1/production/uploads/68e56cb442da034c65305b1b/DYHvx48l1-zt0G5VRj95j.png", "infly": "https://cdn-avatars.huggingface.co/v1/production/uploads/63ed9862679c2cc40abb55d2/0n6g0jngiKkRjaEoAvPmM.png", "allenai": "https://cdn-avatars.huggingface.co/v1/production/uploads/652db071b62cf1f8463221e2/CxxwFiaomTa1MCX_B7-pT.png", "nyrahealth": "https://cdn-avatars.huggingface.co/v1/production/uploads/66ba1e19c485be2eb64e43fe/POBDyKOXfBAYRIpCkTuk4.png", "CohereLabs": "https://cdn-avatars.huggingface.co/v1/production/uploads/1678549441248-5e70f6048ce3c604d78fe133.png", "internlm": "https://cdn-avatars.huggingface.co/v1/production/uploads/6432683407bad11484a68457/Q3Y0dL79GcsnaBCGRMooZ.png", "datalab-to": "https://cdn-avatars.huggingface.co/v1/production/uploads/67ab6afe315e622f597bf9e8/YOgg0gVYVXZC1PDIHFTWK.png", "osunlp": "https://cdn-avatars.huggingface.co/v1/production/uploads/6477a323dbc2a416f8b852b3/oiPPBo_knuDrz0YN9slKj.png", "zai-org": "https://cdn-avatars.huggingface.co/v1/production/uploads/62dc173789b4cf157d36ebee/i_pxzM2ZDo3Ub-BEgIkE9.png", "baidu": "https://cdn-avatars.huggingface.co/v1/production/uploads/64f187a2cc1c03340ac30498/TYYUxK8xD1AxExFMWqbZD.png", "LGAI-EXAONE": "https://cdn-avatars.huggingface.co/v1/production/uploads/66a899a72f11aaf66001a8dc/UfdrP3GMo9pNT62BaMnhw.png", "okestro-ai-lab": "https://cdn-avatars.huggingface.co/v1/production/uploads/66d5316f18b92f31af962058/wRnaLGYzmSUAQaMFTdA_V.png", "speechbrain": "https://cdn-avatars.huggingface.co/v1/production/uploads/1663000279893-60243f18c1f3c79f98e4b382.png", "rednote-hilab": "https://cdn-avatars.huggingface.co/v1/production/uploads/6807a1d6504547b3554b9c73/WgnnQDsz7FqnyTtv8mmRO.png", "UsefulSensors": "https://cdn-avatars.huggingface.co/v1/production/uploads/noauth/TXG6u2PtGnohUXBQwj2Ks.png", "nanonets": "https://cdn-avatars.huggingface.co/v1/production/uploads/641fc216a390e539522d511f/Xtxh40e8zSzkuKtCr58DH.jpeg", "SWE-Lego": "https://cdn-avatars.huggingface.co/v1/production/uploads/60fc2fcca6bdebbe52dfdaf4/AeuYwUH-CQCt893qnmAGa.png", "stepfun-ai": "https://cdn-avatars.huggingface.co/v1/production/uploads/644f7e6233ac8f46fa0b9e26/CmF2ocXhkr2UtHXgmwq7-.png", "inclusionAI": "https://cdn-avatars.huggingface.co/v1/production/uploads/662e1f9da266499277937d33/fyKuazRifqiaIO34xrhhm.jpeg", "MiniMaxAI": "https://cdn-avatars.huggingface.co/v1/production/uploads/676e38ad04af5bec20bc9faf/dUd-LsZEX0H_d4qefO_g6.jpeg", "mteb": "https://cdn-avatars.huggingface.co/v1/production/uploads/5ff5943752c26e9bc240bada/OrZxdlg8doDNO2TZ6Q58G.png", "espnet": "https://cdn-avatars.huggingface.co/v1/production/uploads/1625224006560-60d28bba010d938bba5c6ae9.png", "prism-ml": "https://cdn-avatars.huggingface.co/v1/production/uploads/6920b71747acb07530915d41/mEaEo0tgAYZn-NB6S-nuP.png", "open-agent-leaderboard": "https://cdn-avatars.huggingface.co/v1/production/uploads/5fc0292de45c5468456e022b/Q_OTP--dfvdPiEoYThLUY.png", "OrionLLM": "https://cdn-avatars.huggingface.co/v1/production/uploads/685ea8ff7b4139b6845ce395/WDRcHfV5War3OgSPmXFve.png", "xlangai": "https://cdn-avatars.huggingface.co/v1/production/uploads/628f6e5ab90dde28ef57d293/PHIYPvE_tCM94SgoUzd7l.jpeg", "lightonai": "https://cdn-avatars.huggingface.co/v1/production/uploads/1651597775471-62715572ab9243b5d40cbb1d.png", "openai": "https://cdn-avatars.huggingface.co/v1/production/uploads/68783facef79a05727260de3/UPX5RQxiPGA-ZbBmArIKq.png", "Multilingual-Multimodal-NLP": "https://www.gravatar.com/avatar/cdabcf4c0ac6a92af4940fe0eb6924eb?d=retro&size=100", "efficient-speech": "https://www.gravatar.com/avatar/d7c3856fff536efb76921d957d060400?d=retro&size=100", "Zyphra": "https://cdn-avatars.huggingface.co/v1/production/uploads/noauth/jxR_DdbmmulkyLubYqfMv.png", "docling-project": "https://cdn-avatars.huggingface.co/v1/production/uploads/63c64dd877caf00391004e20/aWC70TyF2UhxyaUh1alpu.png", "HIT-TMG": "https://cdn-avatars.huggingface.co/v1/production/uploads/64b7679a08e2452d18db9a9e/uk4QHGGYqcrEHmiweTFWy.png", "HelpingAI": "https://cdn-avatars.huggingface.co/v1/production/uploads/6612aedf09f16e7347dfa7e1/jHRLPBTlyykFwrd6-Mak_.png", "RedHatAI": "https://cdn-avatars.huggingface.co/v1/production/uploads/60466e4b4f40b01b66151416/cdABRow21BL0sl1vSVTPk.png", "poolside": "https://cdn-avatars.huggingface.co/v1/production/uploads/699484cbe85a4b61cbc5ee0f/GpYWuz-CovEFgbPOW21dZ.png", "GritLM": "https://cdn-avatars.huggingface.co/v1/production/uploads/5f1eb362eec0ad2a071ad6e2/K3bi31cKoKtVDWAKu9trI.png", "sentence-transformers": "https://cdn-avatars.huggingface.co/v1/production/uploads/1609621322398-5eff4688ff69163f6f59e66c.png", "openbmb": "https://cdn-avatars.huggingface.co/v1/production/uploads/1670387859384-633fe7784b362488336bbfad.png", "ibm-granite": "https://cdn-avatars.huggingface.co/v1/production/uploads/639bcaa2445b133a4e942436/CEW-OjXkRkDNmTxSu8Egh.png", "facebook": "https://cdn-avatars.huggingface.co/v1/production/uploads/1592839207516-noauth.png", "OpenGVLab": "https://cdn-avatars.huggingface.co/v1/production/uploads/64006c09330a45b03605bba3/FvdxiTkTqH8rKDOzGZGUE.jpeg", "distil-whisper": "https://cdn-avatars.huggingface.co/v1/production/uploads/61f91cf54a8e5a275b2b3e7c/cUNzV7MAYi8lo9LsCYixp.png", "lm-provers": "https://cdn-avatars.huggingface.co/v1/production/uploads/5f0c746619cb630495b814fd/Td4sH4W-LIdR89AqHCuw3.jpeg", "jinaai": "https://cdn-avatars.huggingface.co/v1/production/uploads/603763514de52ff951d89793/wD54VbAHHyHop3uYlJKl4.png", "miromind-ai": "https://cdn-avatars.huggingface.co/v1/production/uploads/682c41fb2f8a52030ec93ce0/Cna52_IapEXuNBsyI3lvR.png", "soundsgoodai": "https://www.gravatar.com/avatar/2e7e38eb40d5e4429d62544c73640cc6?d=retro&size=100", "LiquidAI": "https://cdn-avatars.huggingface.co/v1/production/uploads/61b8e2ba285851687028d395/EsTgVtnM2IqVRKgPdfqcB.png", "Qwen": "https://cdn-avatars.huggingface.co/v1/production/uploads/620760a26e3b7210c2ff1943/-s1gyJfvbE1RgO5iBeNOi.png", "nomic-ai": "https://cdn-avatars.huggingface.co/v1/production/uploads/641f01fe6d51620635e118e9/wy0ax27ok1-uHWoUAHSEs.png" }, "colors": { "AIDC-AI": "#6366f1", "BAAI": "#0d9488", "ByteDance-Seed": "#d97706", "CohereLabs": "#e11d48", "FINAL-Bench": "#7c3aed", "FireRedTeam": "#16a34a", "GAIR": "#2563eb", "GritLM": "#ea580c", "HIT-TMG": "#8b5cf6", "Hcompany": "#0891b2", "HelpingAI": "#c026d3", "KDEGroup": "#65a30d", "LGAI-EXAONE": "#dc2626", "LiquidAI": "#0284c7", "MiniMaxAI": "#a21caf", "Multilingual-Multimodal-NLP": "#059669", "Nanbeige": "#9333ea", "OS-Copilot": "#ca8a04", "OpenGVLab": "#be185d", "OrionLLM": "#0369a1", "PaddlePaddle": "#6366f1", "PolarSeeker": "#0d9488", "Qwen": "#d97706", "RedHatAI": "#e11d48", "SWE-Lego": "#7c3aed", "Salesforce": "#16a34a", "Snowflake": "#2563eb", "UsefulSensors": "#ea580c", "Xerv-AI": "#8b5cf6", "XiaomiMiMo": "#0891b2", "Zyphra": "#c026d3", "abr-ai": "#65a30d", "allenai": "#dc2626", "arcee-ai": "#0284c7", "baidu": "#a21caf", "datalab-to": "#059669", "deepseek-ai": "#9333ea", "distil-whisper": "#ca8a04", "docling-project": "#be185d", "efficient-speech": "#0369a1", "espnet": "#6366f1", "facebook": "#0d9488", "google": "#d97706", "ibm-granite": "#e11d48", "inclusionAI": "#7c3aed", "infly": "#16a34a", "internlm": "#2563eb", "jdopensource": "#ea580c", "jinaai": "#8b5cf6", "kyutai": "#0891b2", "lerobot": "#c026d3", "lightonai": "#65a30d", "lm-provers": "#dc2626", "meituan-longcat": "#0284c7", "meta-llama": "#a21caf", "microsoft": "#059669", "miromind-ai": "#9333ea", "mistralai": "#ca8a04", "moonshotai": "#be185d", "mteb": "#0369a1", "nanonets": "#6366f1", "nomic-ai": "#0d9488", "nvidia": "#d97706", "nyrahealth": "#e11d48", "okestro-ai-lab": "#7c3aed", "open-agent-leaderboard": "#16a34a", "openai": "#2563eb", "openbmb": "#ea580c", "opendatalab": "#8b5cf6", "osunlp": "#0891b2", "poolside": "#c026d3", "prism-ml": "#65a30d", "rednote-hilab": "#dc2626", "sentence-transformers": "#0284c7", "showlab": "#a21caf", "soundsgoodai": "#059669", "speechbrain": "#9333ea", "stepfun-ai": "#ca8a04", "tencent": "#be185d", "tiiuae": "#0369a1", "xlangai": "#6366f1", "zai-org": "#0d9488" }, "generated_at": "2026-05-20T20:00:36.647268+00:00" }