[2026-03-21 00:18:14,920] [DEBUG] [axolotl.utils.config.resolve_dtype:69] [PID:3143] bf16 support detected, enabling for this configuration.
[2026-03-21 00:18:15,065] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:3143] baseline 0.000GB ()
[2026-03-21 00:18:15,066] [INFO] [axolotl.cli.config.load_cfg:341] [PID:3143] config:
{
  "activation_offloading": true,
  "adapter": "lora",
  "axolotl_config_path": "train.yml",
  "base_model": "Qwen/Qwen3.5-27B",
  "base_model_config": "Qwen/Qwen3.5-27B",
  "batch_size": 32,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_90",
    "fp8": true,
    "n_gpu": 1,
    "n_node": 1,
    "tf32": true
  },
  "chat_template": "tokenizer_default",
  "context_parallel_size": 1,
  "cosine_min_lr_ratio": 0.1,
  "cut_cross_entropy": true,
  "dataloader_num_workers": 1,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_num_proc": 24,
  "dataset_prepared_path": "last_run_prepared",
  "datasets": [
    {
      "ds_type": "parquet",
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "output.parquet",
      "trust_remote_code": false
    }
  ],
  "ddp": false,
  "device": "cuda:0",
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "eaft_alpha": 1.0,
  "eaft_k": 20,
  "env_capabilities": {
    "torch_version": "2.8.0"
  },
  "eval_batch_size": 8,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_sample_packing": true,
  "eval_table_size": 0,
  "experimental_skip_move_to_device": true,
  "flash_attention": true,
  "fp16": false,
  "generate_samples": false,
  "generation_do_sample": true,
  "generation_max_new_tokens": 50,
  "generation_prompt_ratio": 0.5,
  "generation_temperature": 0.7,
  "gradient_accumulation_steps": 4,
  "gradient_checkpointing": true,
  "gradient_checkpointing_kwargs": {
    "use_reentrant": true
  },
  "group_by_length": false,
  "include_tkps": true,
  "is_multimodal": true,
  "learning_rate": 1e-05,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 1,
  "lora_alpha": 512,
  "lora_dropout": 0.0,
  "lora_r": 64,
  "lora_target_modules": [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "down_proj",
    "up_proj",
    "linear_attn.in_proj_qkv",
    "linear_attn.in_proj_z",
    "linear_attn.out_proj"
  ],
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "constant",
  "max_grad_norm": 0.1,
  "mean_resizing_embeddings": false,
  "micro_batch_size": 8,
  "model_config_type": "qwen3_5",
  "model_config_type_text": "qwen3_5_text",
  "num_epochs": 2.0,
  "num_generation_samples": 3,
  "optimizer": "adamw_torch_8bit",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "./model-output",
  "pad_to_sequence_len": true,
  "plugins": [
    "axolotl.integrations.liger.LigerPlugin",
    "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin"
  ],
  "pretrain_multipack_attn": true,
  "processor_config": "Qwen/Qwen3.5-27B",
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "quantize_moe_experts": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": true,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_steps": 0.5,
  "saves_per_epoch": 1,
  "sequence_len": 8192,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tf32": true,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "Qwen/Qwen3.5-27B",
  "tokenizer_save_jinja_files": true,
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "async_prefetch": false,
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "replay_buffer_size": 0,
    "replay_recompute_logps": true,
    "reroll_max_groups": 1,
    "reroll_start_fraction": 1.0,
    "reward_num_workers": 1,
    "scale_rewards": true,
    "skip_zero_advantage_batches": true,
    "sync_ref_model": false,
    "use_data_producer": false,
    "use_vllm": false,
    "vllm_lora_sync": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "trust_remote_code": false,
  "use_otel_metrics": false,
  "use_ray": false,
  "use_wandb": true,
  "val_set_size": 0.0,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "wandb_project": "qwen-27b-seemo",
  "warmup_ratio": 0.05,
  "weight_decay": 0.001,
  "world_size": 1
}
[2026-03-21 00:18:15,069] [WARNING] [axolotl.cli.checks.check_user_token:46] [PID:3143] Error verifying HuggingFace token. Remember to log in using `hf auth login` and get your access token from https://huggingface.co/settings/tokens if you want to use gated models or datasets.
[2026-03-21 00:18:15,157] [WARNING] [huggingface_hub.utils._http._warn_on_warning_headers:916] [PID:3143] Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
[2026-03-21 00:18:16,548] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:299] [PID:3143] EOS: 248046 / <|im_end|>
[2026-03-21 00:18:16,548] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:3143] BOS: None / None
[2026-03-21 00:18:16,548] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:301] [PID:3143] PAD: 248044 / <|endoftext|>
[2026-03-21 00:18:16,548] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:302] [PID:3143] UNK: None / None
[2026-03-21 00:18:16,550] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:3143] Loading prepared dataset from disk at last_run_prepared/e72d0f3749caf5df6eb0a14757b8955e...
Loading dataset from disk:   0%|                                                                                    | 0/24 [00:00<?, ?it/s]Loading dataset from disk: 100%|███████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 107776.55it/s]
[2026-03-21 00:18:16,641] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:420] [PID:3143] total_num_tokens: 11_947_893
[2026-03-21 00:18:16,772] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:438] [PID:3143] `total_supervised_tokens: 9_954_548`
[2026-03-21 00:18:18,359] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:3143] generate_batches time: 0.73325514793396
[2026-03-21 00:18:19,103] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:3143] generate_batches time: 0.7439286708831787
[2026-03-21 00:18:19,822] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:3143] generate_batches time: 0.7183380126953125
[2026-03-21 00:18:20,614] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:3143] generate_batches time: 0.7919862270355225
[2026-03-21 00:18:20,637] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:3143] gather_len_batches: [183]
[2026-03-21 00:18:20,637] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:495] [PID:3143] data_loader_len: 45
[2026-03-21 00:18:20,637] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:504] [PID:3143] sample_packing_eff_est across ranks: [0.9962315793897285]
[2026-03-21 00:18:20,638] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:516] [PID:3143] sample_packing_eff_est: 1.0
[2026-03-21 00:18:20,638] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:521] [PID:3143] total_num_steps: 90
[2026-03-21 00:18:20,638] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:3143] Maximum number of steps set at 90
[2026-03-21 00:18:20,731] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:3143] loading tokenizer... Qwen/Qwen3.5-27B
[2026-03-21 00:18:22,269] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:299] [PID:3143] EOS: 248046 / <|im_end|>
[2026-03-21 00:18:22,269] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:300] [PID:3143] BOS: None / None
[2026-03-21 00:18:22,269] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:301] [PID:3143] PAD: 248044 / <|endoftext|>
[2026-03-21 00:18:22,269] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:302] [PID:3143] UNK: None / None
preprocessor_config.json:   0%|                                                                                  | 0.00/390 [00:00<?, ?B/s]preprocessor_config.json: 100%|███████████████████████████████████████████████████████████████████████████| 390/390 [00:00<00:00, 1.91MB/s]
video_preprocessor_config.json:   0%|                                                                            | 0.00/385 [00:00<?, ?B/s]video_preprocessor_config.json: 100%|█████████████████████████████████████████████████████████████████████| 385/385 [00:00<00:00, 1.67MB/s]
[2026-03-21 00:18:25,140] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:3143] Loading model
[2026-03-21 00:18:25,181] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:91] [PID:3143] Patched Trainer.evaluation_loop with nanmean loss calculation
[2026-03-21 00:18:25,182] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:142] [PID:3143] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
[2026-03-21 00:18:25,224] [INFO] [axolotl.monkeypatch.attention.flash_attn_4.patch_flash_attn_4:52] [PID:3143] Flash Attention 4 is available for your GPU and offers faster training speeds. To enable: pip install flash-attn-4
[2026-03-21 00:18:26,416] [INFO] [axolotl.monkeypatch.models.qwen3_5.modeling._apply_packing_patches:254] [PID:3143] Applied Qwen3_5 packing patch (fla_causal_conv1d=available)
[2026-03-21 00:18:26,416] [INFO] [axolotl.monkeypatch.models.qwen3_5.modeling.patch_qwen3_5_vlm_flash_attention:289] [PID:3143] Applied Qwen3.5 VLM flash-attention patch (3-D MRoPE position_ids)
[2026-03-21 00:18:26,417] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:402] [PID:3143] Applying multipack dataloader patch for sample packing...
[2026-03-21 00:18:26,640] [WARNING] [axolotl.integrations.liger.plugin.pre_model_load:215] [PID:3143] Unsupported model config type: qwen3_5. Liger not applied.
[2026-03-21 00:18:26,669] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:3143] Applying Cut Cross Entropy to model type: qwen3_5
model.safetensors.index.json: 0.00B [00:00, ?B/s]model.safetensors.index.json: 127kB [00:00, 210MB/s]
Downloading (incomplete total...): 0.00B [00:00, ?B/s]
Fetching 11 files:   0%|                                                                                            | 0/11 [00:00<?, ?it/s][ADownloading (incomplete total...):   0%|                                                                       | 0.00/5.26G [00:00<?, ?B/s]Downloading (incomplete total...):   0%|                                                                       | 0.00/10.6G [00:00<?, ?B/s]Downloading (incomplete total...):   0%|                                                                       | 0.00/16.0G [00:00<?, ?B/s]Downloading (incomplete total...):   0%|                                                                       | 0.00/32.0G [00:00<?, ?B/s]Downloading (incomplete total...):   0%|                                                                       | 0.00/37.4G [00:00<?, ?B/s]Downloading (incomplete total...):   0%|                                                                       | 0.00/37.4G [00:00<?, ?B/s]Downloading (incomplete total...):   0%|                                                                       | 0.00/42.7G [00:00<?, ?B/s]Downloading (incomplete total...):   0%|                                                                       | 0.00/42.7G [00:00<?, ?B/s]Downloading (incomplete total...):   0%|                                                                       | 0.00/42.7G [00:00<?, ?B/s]Downloading (incomplete total...):   0%|                                                            | 65.5k/42.7G [00:00<36:22:48, 326kB/s]Downloading (incomplete total...):   0%|                                                             | 324k/42.7G [00:00<12:07:40, 978kB/s]Downloading (incomplete total...):   0%|                                                              | 12.0M/42.7G [00:01<38:59, 18.3MB/s]Downloading (incomplete total...):   0%|▎                                                               | 179M/42.7G [00:01<05:05, 139MB/s]Downloading (incomplete total...):   1%|▍                                                               | 300M/42.7G [00:02<02:54, 243MB/s]Downloading (incomplete total...):   1%|▊                                                               | 502M/42.7G [00:02<01:31, 463MB/s]Downloading (incomplete total...):   1%|▉                                                               | 636M/42.7G [00:02<01:34, 447MB/s]Downloading (incomplete total...):   2%|█▎                                                              | 837M/42.7G [00:02<01:04, 650MB/s]Downloading (incomplete total...):   3%|█▉                                                            | 1.31G/42.7G [00:02<00:36, 1.15GB/s]Downloading (incomplete total...):   3%|██▏                                                            | 1.47G/42.7G [00:03<01:08, 606MB/s]Downloading (incomplete total...):   5%|███▍                                                          | 2.34G/42.7G [00:03<00:31, 1.28GB/s]Downloading (incomplete total...):   7%|████▏                                                         | 2.88G/42.7G [00:04<00:35, 1.12GB/s]Downloading (incomplete total...):   8%|████▊                                                         | 3.28G/42.7G [00:04<00:32, 1.21GB/s]Downloading (incomplete total...):  10%|██████                                                         | 4.09G/42.7G [00:06<01:07, 576MB/s]Downloading (incomplete total...):  10%|██████▌                                                        | 4.45G/42.7G [00:07<00:57, 670MB/s]Downloading (incomplete total...):  15%|█████████                                                     | 6.23G/42.7G [00:07<00:24, 1.48GB/s]Downloading (incomplete total...):  19%|███████████▍                                                  | 7.92G/42.7G [00:07<00:14, 2.42GB/s]Downloading (incomplete total...):  24%|██████████████▊                                               | 10.2G/42.7G [00:07<00:07, 4.07GB/s]Downloading (incomplete total...):  41%|█████████████████████████▏                                    | 17.3G/42.7G [00:07<00:02, 10.5GB/s]Downloading (incomplete total...):  50%|███████████████████████████████▏                              | 21.5G/42.7G [00:08<00:01, 11.6GB/s]Downloading (incomplete total...):  50%|███████████████████████████████▏                              | 21.5G/42.7G [00:08<00:01, 12.9GB/s]Downloading (incomplete total...):  50%|███████████████████████████████▏                              | 21.5G/42.7G [00:08<00:01, 12.9GB/s]Downloading (incomplete total...):  65%|████████████████████████████████████████▏                     | 27.7G/42.7G [00:08<00:00, 17.6GB/s]Downloading (incomplete total...):  72%|████████████████████████████████████████████▌                 | 30.7G/42.7G [00:08<00:01, 11.6GB/s]Downloading (incomplete total...):  85%|████████████████████████████████████████████████████▊         | 36.4G/42.7G [00:09<00:00, 8.86GB/s]Downloading (incomplete total...):  91%|████████████████████████████████████████████████████████▎     | 38.8G/42.7G [00:09<00:00, 9.25GB/s]Downloading (incomplete total...):  94%|██████████████████████████████████████████████████████████    | 40.0G/42.7G [00:09<00:00, 8.74GB/s]Downloading (incomplete total...):  98%|████████████████████████████████████████████████████████████▌ | 41.8G/42.7G [00:09<00:00, 7.44GB/s]Downloading (incomplete total...): 100%|██████████████████████████████████████████████████████████████| 42.7G/42.7G [00:09<00:00, 7.44GB/s]Downloading (incomplete total...): 100%|██████████████████████████████████████████████████████████████| 42.7G/42.7G [00:09<00:00, 7.44GB/s]Downloading (incomplete total...): 100%|██████████████████████████████████████████████████████████████| 42.7G/42.7G [00:09<00:00, 7.44GB/s]Downloading (incomplete total...): 100%|██████████████████████████████████████████████████████████████| 42.7G/42.7G [00:09<00:00, 7.44GB/s]
Fetching 11 files:   9%|███████▋                                                                            | 1/11 [00:09<01:32,  9.28s/it][ADownloading (incomplete total...):  89%|███████████████████████████████████████████████████████       | 42.7G/48.1G [00:09<00:00, 7.44GB/s]Downloading (incomplete total...):  85%|████████████████████████████████████████████████████▋         | 42.7G/50.2G [00:09<00:01, 7.44GB/s]Downloading (incomplete total...):  77%|███████████████████████████████████████████████▋              | 42.7G/55.6G [00:09<00:01, 7.44GB/s]Downloading (incomplete total...):  79%|█████████████████████████████████████████████████             | 43.9G/55.6G [00:10<00:03, 3.28GB/s]Downloading (incomplete total...):  81%|██████████████████████████████████████████████████▍           | 45.2G/55.6G [00:11<00:03, 3.28GB/s]Downloading (incomplete total...):  83%|███████████████████████████████████████████████████▌          | 46.2G/55.6G [00:11<00:02, 3.65GB/s]Downloading (incomplete total...):  86%|█████████████████████████████████████████████████████▎        | 47.8G/55.6G [00:11<00:01, 4.32GB/s]Downloading (incomplete total...):  88%|██████████████████████████████████████████████████████▋       | 49.0G/55.6G [00:11<00:01, 5.09GB/s]Downloading (incomplete total...):  90%|████████████████████████████████████████████████████████      | 50.2G/55.6G [00:11<00:00, 5.48GB/s]Downloading (incomplete total...):  93%|█████████████████████████████████████████████████████████▌    | 51.6G/55.6G [00:11<00:00, 5.92GB/s]Downloading (incomplete total...):  97%|███████████████████████████████████████████████████████████▉  | 53.7G/55.6G [00:12<00:00, 4.64GB/s]Downloading (incomplete total...):  99%|█████████████████████████████████████████████████████████████▎| 55.0G/55.6G [00:12<00:00, 4.99GB/s]
Fetching 11 files:  82%|████████████████████████████████████████████████████████████████████▋               | 9/11 [00:13<00:02,  1.19s/it][AFetching 11 files: 100%|███████████████████████████████████████████████████████████████████████████████████| 11/11 [00:13<00:00,  1.20s/it]
Download complete: 100%|██████████████████████████████████████████████████████████████████████████████| 55.6G/55.6G [00:13<00:00, 4.99GB/s]Download complete: 100%|██████████████████████████████████████████████████████████████████████████████| 55.6G/55.6G [00:13<00:00, 4.22GB/s]
Loading weights:   0%|                                                                                            | 0/1184 [00:00<?, ?it/s]Loading weights:  91%|███████████████████████████████████████████████████████████████████████▎      | 1082/1184 [00:00<00:00, 10818.77it/s]Loading weights: 100%|██████████████████████████████████████████████████████████████████████████████| 1184/1184 [00:00<00:00, 11513.01it/s]
generation_config.json:   0%|                                                                                    | 0.00/244 [00:00<?, ?B/s]generation_config.json: 100%|█████████████████████████████████████████████████████████████████████████████| 244/244 [00:00<00:00, 1.44MB/s]
[2026-03-21 00:18:45,825] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:361] [PID:3143] Converting modules to torch.bfloat16
[2026-03-21 00:18:49,854] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:3143] Memory usage after model load 0.000GB (+0.000GB allocated, +0.002GB reserved)
trainable params: 342,884,352 || all params: 27,699,612,912 || trainable%: 1.2379
[2026-03-21 00:18:53,043] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:3143] after adapters 0.000GB ()
[2026-03-21 00:19:04,656] [INFO] [axolotl.train.save_initial_configs:413] [PID:3143] Pre-saving adapter config to ./model-output...
[2026-03-21 00:19:04,656] [INFO] [axolotl.train.save_initial_configs:417] [PID:3143] Pre-saving tokenizer to ./model-output...
[2026-03-21 00:19:04,824] [INFO] [axolotl.train.save_initial_configs:422] [PID:3143] Pre-saving model config to ./model-output...
[2026-03-21 00:19:04,829] [INFO] [axolotl.train.save_initial_configs:426] [PID:3143] Pre-saving processor to ./model-output...
[2026-03-21 00:19:05,620] [INFO] [axolotl.train.execute_training:218] [PID:3143] Starting trainer...
[2026-03-21 00:19:07,814] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:3143] generate_batches time: 0.9281222820281982
[2026-03-21 00:19:08,801] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:3143] generate_batches time: 0.986567497253418
[2026-03-21 00:19:09,833] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:3143] generate_batches time: 1.0313453674316406
[2026-03-21 00:19:10,854] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:3143] generate_batches time: 1.0205955505371094
[2026-03-21 00:19:10,854] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:3143] gather_len_batches: [183]
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Create a new API key at: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Store your API key securely and do not share it.
[34m[1mwandb[0m: Paste your API key and hit enter:[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfizzz[0m ([33mfizzzz[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: [38;5;178m⢿[0m Waiting for wandb.init()...
[Am[2K[34m[1mwandb[0m: [38;5;178m⣻[0m Waiting for wandb.init()...
[Am[2K[34m[1mwandb[0m: [38;5;178m⣽[0m setting up run wme52dg5 (0.3s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣾[0m setting up run wme52dg5 (0.3s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣷[0m setting up run wme52dg5 (0.3s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣯[0m setting up run wme52dg5 (0.3s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣟[0m setting up run wme52dg5 (0.3s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⡿[0m setting up run wme52dg5 (0.8s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⢿[0m setting up run wme52dg5 (0.8s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣻[0m setting up run wme52dg5 (0.8s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣽[0m setting up run wme52dg5 (0.8s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣾[0m setting up run wme52dg5 (0.8s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣷[0m setting up run wme52dg5 (1.3s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣯[0m setting up run wme52dg5 (1.3s)
[Am[2K[34m[1mwandb[0m: [38;5;178m⣟[0m setting up run wme52dg5 (1.3s)
[Am[2K[34m[1mwandb[0m: Tracking run with wandb version 0.25.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/root/axolotl/wandb/run-20260321_002002-wme52dg5[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mabsurd-elevator-1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/fizzzz/qwen-27b-seemo[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/fizzzz/qwen-27b-seemo/runs/wme52dg5[0m
[34m[1mwandb[0m: [33mWARNING[0m Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
[34m[1mwandb[0m: [33mWARNING[0m Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files.
[2026-03-21 00:20:06,105] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:3143] The Axolotl config has been saved to the WandB run under files.
  0%|                                                                                                               | 0/90 [00:00<?, ?it/s]  1%|█                                                                                                   | 1/90 [06:06<9:03:29, 366.40s/it]                                                                                                                                           {'loss': '0.9887', 'grad_norm': '0.4026', 'learning_rate': '1e-05', 'ppl': '2.688', 'memory/max_active (GiB)': '105.4', 'memory/max_allocated (GiB)': '105.4', 'memory/device_reserved (GiB)': '108.3', 'tokens/train_per_sec_per_gpu': '144.2', 'tokens/total': 262144, 'tokens/trainable': 213567, 'epoch': '0.02186'}
  1%|█                                                                                                   | 1/90 [06:06<9:03:29, 366.40s/it]  2%|██▏                                                                                                 | 2/90 [09:20<6:28:45, 265.06s/it]                                                                                                                                           {'loss': '0.9882', 'grad_norm': '0.3587', 'learning_rate': '1e-05', 'ppl': '2.686', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '291.9', 'tokens/total': 524288, 'tokens/trainable': 439223, 'epoch': '0.04372'}
  2%|██▏                                                                                                 | 2/90 [09:20<6:28:45, 265.06s/it]  3%|███▎                                                                                                | 3/90 [12:34<5:37:16, 232.61s/it]                                                                                                                                           {'loss': '0.977', 'grad_norm': '0.3049', 'learning_rate': '1e-05', 'ppl': '2.656', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '286.3', 'tokens/total': 786432, 'tokens/trainable': 652743, 'epoch': '0.06557'}
  3%|███▎                                                                                                | 3/90 [12:34<5:37:16, 232.61s/it]  4%|████▍                                                                                               | 4/90 [15:48<5:11:38, 217.43s/it]                                                                                                                                           {'loss': '0.9013', 'grad_norm': '0.2071', 'learning_rate': '1e-05', 'ppl': '2.463', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '278.1', 'tokens/total': 1048576, 'tokens/trainable': 873364, 'epoch': '0.08743'}
  4%|████▍                                                                                               | 4/90 [15:48<5:11:38, 217.43s/it]  6%|█████▌                                                                                              | 5/90 [19:02<4:55:56, 208.90s/it]                                                                                                                                           {'loss': '0.8648', 'grad_norm': '0.1772', 'learning_rate': '1e-05', 'ppl': '2.375', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '279.2', 'tokens/total': 1310720, 'tokens/trainable': 1071939, 'epoch': '0.1093'}
  6%|█████▌                                                                                              | 5/90 [19:02<4:55:56, 208.90s/it]  7%|██████▋                                                                                             | 6/90 [22:16<4:45:30, 203.94s/it]                                                                                                                                           {'loss': '0.9415', 'grad_norm': '0.1793', 'learning_rate': '1e-05', 'ppl': '2.564', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '300', 'tokens/total': 1572864, 'tokens/trainable': 1296771, 'epoch': '0.1311'}
  7%|██████▋                                                                                             | 6/90 [22:16<4:45:30, 203.94s/it]  8%|███████▊                                                                                            | 7/90 [25:30<4:37:39, 200.72s/it]                                                                                                                                           {'loss': '0.7985', 'grad_norm': '0.137', 'learning_rate': '1e-05', 'ppl': '2.222', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '291.8', 'tokens/total': 1835008, 'tokens/trainable': 1513259, 'epoch': '0.153'}
  8%|███████▊                                                                                            | 7/90 [25:30<4:37:39, 200.72s/it]  9%|████████▉                                                                                           | 8/90 [28:45<4:31:28, 198.64s/it]                                                                                                                                           {'loss': '0.8464', 'grad_norm': '0.1442', 'learning_rate': '1e-05', 'ppl': '2.331', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '278.7', 'tokens/total': 2097152, 'tokens/trainable': 1733807, 'epoch': '0.1749'}
  9%|████████▉                                                                                           | 8/90 [28:45<4:31:28, 198.64s/it] 10%|██████████                                                                                          | 9/90 [31:59<4:26:19, 197.28s/it]                                                                                                                                           {'loss': '0.8014', 'grad_norm': '0.2301', 'learning_rate': '1e-05', 'ppl': '2.229', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '255.6', 'tokens/total': 2359296, 'tokens/trainable': 1955565, 'epoch': '0.1967'}
 10%|██████████                                                                                          | 9/90 [31:59<4:26:19, 197.28s/it] 11%|███████████                                                                                        | 10/90 [35:15<4:22:30, 196.88s/it]                                                                                                                                           {'loss': '0.8031', 'grad_norm': '0.1278', 'learning_rate': '1e-05', 'ppl': '2.232', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '288.1', 'tokens/total': 2621440, 'tokens/trainable': 2169647, 'epoch': '0.2186'}
 11%|███████████                                                                                        | 10/90 [35:15<4:22:30, 196.88s/it] 12%|████████████                                                                                       | 11/90 [38:29<4:18:04, 196.00s/it]                                                                                                                                           {'loss': '0.8434', 'grad_norm': '0.1123', 'learning_rate': '1e-05', 'ppl': '2.324', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '278.6', 'tokens/total': 2883584, 'tokens/trainable': 2380690, 'epoch': '0.2404'}
 12%|████████████                                                                                       | 11/90 [38:29<4:18:04, 196.00s/it] 13%|█████████████▏                                                                                     | 12/90 [41:43<4:14:08, 195.50s/it]                                                                                                                                           {'loss': '0.8152', 'grad_norm': '0.1011', 'learning_rate': '1e-05', 'ppl': '2.26', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '287.8', 'tokens/total': 3145728, 'tokens/trainable': 2609166, 'epoch': '0.2623'}
 13%|█████████████▏                                                                                     | 12/90 [41:43<4:14:08, 195.50s/it] 14%|██████████████▎                                                                                    | 13/90 [44:58<4:10:38, 195.31s/it]                                                                                                                                           {'loss': '0.8308', 'grad_norm': '0.09826', 'learning_rate': '1e-05', 'ppl': '2.295', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '296.8', 'tokens/total': 3407872, 'tokens/trainable': 2830433, 'epoch': '0.2842'}
 14%|██████████████▎                                                                                    | 13/90 [44:58<4:10:38, 195.31s/it] 16%|███████████████▍                                                                                   | 14/90 [48:12<4:06:57, 194.97s/it]                                                                                                                                           {'loss': '0.7664', 'grad_norm': '0.08073', 'learning_rate': '1e-05', 'ppl': '2.152', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '278.6', 'tokens/total': 3670016, 'tokens/trainable': 3053179, 'epoch': '0.306'}
 16%|███████████████▍                                                                                   | 14/90 [48:12<4:06:57, 194.97s/it] 17%|████████████████▌                                                                                  | 15/90 [51:26<4:03:26, 194.75s/it]                                                                                                                                           {'loss': '0.7886', 'grad_norm': '0.08914', 'learning_rate': '1e-05', 'ppl': '2.2', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '289.8', 'tokens/total': 3932160, 'tokens/trainable': 3275304, 'epoch': '0.3279'}
 17%|████████████████▌                                                                                  | 15/90 [51:26<4:03:26, 194.75s/it] 18%|█████████████████▌                                                                                 | 16/90 [54:41<3:59:59, 194.59s/it]                                                                                                                                           {'loss': '0.8138', 'grad_norm': '0.08036', 'learning_rate': '1e-05', 'ppl': '2.257', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '260.4', 'tokens/total': 4194304, 'tokens/trainable': 3487582, 'epoch': '0.3497'}
 18%|█████████████████▌                                                                                 | 16/90 [54:41<3:59:59, 194.59s/it] 19%|██████████████████▋                                                                                | 17/90 [57:55<3:56:39, 194.51s/it]                                                                                                                                           {'loss': '0.7818', 'grad_norm': '0.07846', 'learning_rate': '1e-05', 'ppl': '2.185', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '281.6', 'tokens/total': 4456448, 'tokens/trainable': 3713050, 'epoch': '0.3716'}
 19%|██████████████████▋                                                                                | 17/90 [57:55<3:56:39, 194.51s/it] 20%|███████████████████▍                                                                             | 18/90 [1:01:11<3:53:51, 194.88s/it]                                                                                                                                           {'loss': '0.7663', 'grad_norm': '0.07943', 'learning_rate': '1e-05', 'ppl': '2.152', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '261.6', 'tokens/total': 4718592, 'tokens/trainable': 3927420, 'epoch': '0.3934'}
 20%|███████████████████▍                                                                             | 18/90 [1:01:11<3:53:51, 194.88s/it] 21%|████████████████████▍                                                                            | 19/90 [1:04:25<3:50:21, 194.67s/it]                                                                                                                                           {'loss': '0.7355', 'grad_norm': '0.07259', 'learning_rate': '1e-05', 'ppl': '2.086', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '277', 'tokens/total': 4980736, 'tokens/trainable': 4147429, 'epoch': '0.4153'}
 21%|████████████████████▍                                                                            | 19/90 [1:04:25<3:50:21, 194.67s/it] 22%|█████████████████████▌                                                                           | 20/90 [1:07:39<3:46:55, 194.51s/it]                                                                                                                                           {'loss': '0.7543', 'grad_norm': '0.07516', 'learning_rate': '1e-05', 'ppl': '2.126', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '288.5', 'tokens/total': 5242880, 'tokens/trainable': 4369327, 'epoch': '0.4372'}
 22%|█████████████████████▌                                                                           | 20/90 [1:07:39<3:46:55, 194.51s/it] 23%|██████████████████████▋                                                                          | 21/90 [1:10:53<3:43:37, 194.46s/it]                                                                                                                                           {'loss': '0.7667', 'grad_norm': '0.07336', 'learning_rate': '1e-05', 'ppl': '2.153', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '280.7', 'tokens/total': 5505024, 'tokens/trainable': 4596554, 'epoch': '0.459'}
 23%|██████████████████████▋                                                                          | 21/90 [1:10:53<3:43:37, 194.46s/it] 24%|███████████████████████▋                                                                         | 22/90 [1:14:07<3:40:11, 194.29s/it]                                                                                                                                           {'loss': '0.7566', 'grad_norm': '0.07749', 'learning_rate': '1e-05', 'ppl': '2.131', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '247.3', 'tokens/total': 5767168, 'tokens/trainable': 4802721, 'epoch': '0.4809'}
 24%|███████████████████████▋                                                                         | 22/90 [1:14:07<3:40:11, 194.29s/it] 26%|████████████████████████▊                                                                        | 23/90 [1:17:21<3:36:54, 194.25s/it]                                                                                                                                           {'loss': '0.6946', 'grad_norm': '0.07036', 'learning_rate': '1e-05', 'ppl': '2.003', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '298.2', 'tokens/total': 6029312, 'tokens/trainable': 5027507, 'epoch': '0.5027'}
 26%|████████████████████████▊                                                                        | 23/90 [1:17:21<3:36:54, 194.25s/it] 27%|█████████████████████████▊                                                                       | 24/90 [1:20:35<3:33:35, 194.17s/it]                                                                                                                                           {'loss': '0.7064', 'grad_norm': '0.0833', 'learning_rate': '1e-05', 'ppl': '2.027', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '267.6', 'tokens/total': 6291456, 'tokens/trainable': 5242022, 'epoch': '0.5246'}
 27%|█████████████████████████▊                                                                       | 24/90 [1:20:35<3:33:35, 194.17s/it] 28%|██████████████████████████▉                                                                      | 25/90 [1:23:50<3:30:22, 194.19s/it]                                                                                                                                           {'loss': '0.7493', 'grad_norm': '0.06973', 'learning_rate': '1e-05', 'ppl': '2.116', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '253', 'tokens/total': 6553600, 'tokens/trainable': 5459249, 'epoch': '0.5464'}
 28%|██████████████████████████▉                                                                      | 25/90 [1:23:50<3:30:22, 194.19s/it] 29%|████████████████████████████                                                                     | 26/90 [1:27:04<3:27:07, 194.18s/it]                                                                                                                                           {'loss': '0.719', 'grad_norm': '0.07261', 'learning_rate': '1e-05', 'ppl': '2.052', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '307.6', 'tokens/total': 6815744, 'tokens/trainable': 5681948, 'epoch': '0.5683'}
 29%|████████████████████████████                                                                     | 26/90 [1:27:04<3:27:07, 194.18s/it] 30%|█████████████████████████████                                                                    | 27/90 [1:30:18<3:23:56, 194.23s/it]                                                                                                                                           {'loss': '0.7095', 'grad_norm': '0.07657', 'learning_rate': '1e-05', 'ppl': '2.033', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '290.4', 'tokens/total': 7077888, 'tokens/trainable': 5902846, 'epoch': '0.5902'}
 30%|█████████████████████████████                                                                    | 27/90 [1:30:18<3:23:56, 194.23s/it] 31%|██████████████████████████████▏                                                                  | 28/90 [1:33:32<3:20:40, 194.20s/it]                                                                                                                                           {'loss': '0.7248', 'grad_norm': '0.07364', 'learning_rate': '1e-05', 'ppl': '2.064', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '273.8', 'tokens/total': 7340032, 'tokens/trainable': 6127588, 'epoch': '0.612'}
 31%|██████████████████████████████▏                                                                  | 28/90 [1:33:32<3:20:40, 194.20s/it] 32%|███████████████████████████████▎                                                                 | 29/90 [1:36:46<3:17:25, 194.18s/it]                                                                                                                                           {'loss': '0.7286', 'grad_norm': '0.06617', 'learning_rate': '1e-05', 'ppl': '2.072', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '281.6', 'tokens/total': 7602176, 'tokens/trainable': 6345817, 'epoch': '0.6339'}
 32%|███████████████████████████████▎                                                                 | 29/90 [1:36:46<3:17:25, 194.18s/it] 33%|████████████████████████████████▎                                                                | 30/90 [1:40:00<3:14:05, 194.09s/it]                                                                                                                                           {'loss': '0.6989', 'grad_norm': '0.1305', 'learning_rate': '1e-05', 'ppl': '2.012', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '239.3', 'tokens/total': 7864320, 'tokens/trainable': 6554083, 'epoch': '0.6557'}
 33%|████████████████████████████████▎                                                                | 30/90 [1:40:00<3:14:05, 194.09s/it] 34%|█████████████████████████████████▍                                                               | 31/90 [1:43:14<3:10:50, 194.07s/it]                                                                                                                                           {'loss': '0.6912', 'grad_norm': '0.06376', 'learning_rate': '1e-05', 'ppl': '1.996', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '275.5', 'tokens/total': 8126464, 'tokens/trainable': 6778407, 'epoch': '0.6776'}
 34%|█████████████████████████████████▍                                                               | 31/90 [1:43:14<3:10:50, 194.07s/it] 36%|██████████████████████████████████▍                                                              | 32/90 [1:46:28<3:07:37, 194.09s/it]                                                                                                                                           {'loss': '0.7114', 'grad_norm': '0.06336', 'learning_rate': '1e-05', 'ppl': '2.037', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '284', 'tokens/total': 8388608, 'tokens/trainable': 7005342, 'epoch': '0.6995'}
 36%|██████████████████████████████████▍                                                              | 32/90 [1:46:28<3:07:37, 194.09s/it] 37%|███████████████████████████████████▌                                                             | 33/90 [1:49:43<3:04:26, 194.14s/it]                                                                                                                                           {'loss': '0.6939', 'grad_norm': '0.06481', 'learning_rate': '1e-05', 'ppl': '2.001', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '282.2', 'tokens/total': 8650752, 'tokens/trainable': 7229431, 'epoch': '0.7213'}
 37%|███████████████████████████████████▌                                                             | 33/90 [1:49:43<3:04:26, 194.14s/it] 38%|████████████████████████████████████▋                                                            | 34/90 [1:52:57<3:01:09, 194.10s/it]                                                                                                                                           {'loss': '0.7668', 'grad_norm': '0.06838', 'learning_rate': '1e-05', 'ppl': '2.153', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '283', 'tokens/total': 8912896, 'tokens/trainable': 7441804, 'epoch': '0.7432'}
 38%|████████████████████████████████████▋                                                            | 34/90 [1:52:57<3:01:09, 194.10s/it] 39%|█████████████████████████████████████▋                                                           | 35/90 [1:56:11<2:57:55, 194.10s/it]                                                                                                                                           {'loss': '0.7602', 'grad_norm': '0.06979', 'learning_rate': '1e-05', 'ppl': '2.139', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '253.1', 'tokens/total': 9175040, 'tokens/trainable': 7652895, 'epoch': '0.765'}
 39%|█████████████████████████████████████▋                                                           | 35/90 [1:56:11<2:57:55, 194.10s/it] 40%|██████████████████████████████████████▊                                                          | 36/90 [1:59:25<2:54:40, 194.08s/it]                                                                                                                                           {'loss': '0.7418', 'grad_norm': '0.06433', 'learning_rate': '1e-05', 'ppl': '2.1', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '277', 'tokens/total': 9437184, 'tokens/trainable': 7871632, 'epoch': '0.7869'}
 40%|██████████████████████████████████████▊                                                          | 36/90 [1:59:25<2:54:40, 194.08s/it] 41%|███████████████████████████████████████▉                                                         | 37/90 [2:02:39<2:51:24, 194.05s/it]                                                                                                                                           {'loss': '0.7375', 'grad_norm': '0.1387', 'learning_rate': '1e-05', 'ppl': '2.091', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '271.4', 'tokens/total': 9699328, 'tokens/trainable': 8083506, 'epoch': '0.8087'}
 41%|███████████████████████████████████████▉                                                         | 37/90 [2:02:39<2:51:24, 194.05s/it] 42%|████████████████████████████████████████▉                                                        | 38/90 [2:05:53<2:48:10, 194.05s/it]                                                                                                                                           {'loss': '0.6796', 'grad_norm': '0.06568', 'learning_rate': '1e-05', 'ppl': '1.973', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '244.2', 'tokens/total': 9961472, 'tokens/trainable': 8297266, 'epoch': '0.8306'}
 42%|████████████████████████████████████████▉                                                        | 38/90 [2:05:53<2:48:10, 194.05s/it] 43%|██████████████████████████████████████████                                                       | 39/90 [2:09:07<2:44:54, 194.01s/it]                                                                                                                                           {'loss': '0.7272', 'grad_norm': '0.06804', 'learning_rate': '1e-05', 'ppl': '2.069', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '259.1', 'tokens/total': 10223616, 'tokens/trainable': 8511873, 'epoch': '0.8525'}
 43%|██████████████████████████████████████████                                                       | 39/90 [2:09:07<2:44:54, 194.01s/it] 44%|███████████████████████████████████████████                                                      | 40/90 [2:12:21<2:41:41, 194.03s/it]                                                                                                                                           {'loss': '0.6948', 'grad_norm': '0.06503', 'learning_rate': '1e-05', 'ppl': '2.003', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '271.5', 'tokens/total': 10485760, 'tokens/trainable': 8725559, 'epoch': '0.8743'}
 44%|███████████████████████████████████████████                                                      | 40/90 [2:12:21<2:41:41, 194.03s/it] 46%|████████████████████████████████████████████▏                                                    | 41/90 [2:15:35<2:38:25, 194.00s/it]                                                                                                                                           {'loss': '0.7427', 'grad_norm': '0.07094', 'learning_rate': '1e-05', 'ppl': '2.102', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '297.7', 'tokens/total': 10747904, 'tokens/trainable': 8927999, 'epoch': '0.8962'}
 46%|████████████████████████████████████████████▏                                                    | 41/90 [2:15:35<2:38:25, 194.00s/it] 47%|█████████████████████████████████████████████▎                                                   | 42/90 [2:18:49<2:35:15, 194.07s/it]                                                                                                                                           {'loss': '0.6863', 'grad_norm': '0.08371', 'learning_rate': '1e-05', 'ppl': '1.986', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '298.9', 'tokens/total': 11010048, 'tokens/trainable': 9156383, 'epoch': '0.918'}
 47%|█████████████████████████████████████████████▎                                                   | 42/90 [2:18:49<2:35:15, 194.07s/it] 48%|██████████████████████████████████████████████▎                                                  | 43/90 [2:22:03<2:32:04, 194.13s/it]                                                                                                                                           {'loss': '0.6972', 'grad_norm': '0.07311', 'learning_rate': '1e-05', 'ppl': '2.008', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '288.9', 'tokens/total': 11272192, 'tokens/trainable': 9382136, 'epoch': '0.9399'}
 48%|██████████████████████████████████████████████▎                                                  | 43/90 [2:22:03<2:32:04, 194.13s/it] 49%|███████████████████████████████████████████████▍                                                 | 44/90 [2:25:18<2:28:53, 194.20s/it]                                                                                                                                           {'loss': '0.7249', 'grad_norm': '0.07009', 'learning_rate': '1e-05', 'ppl': '2.065', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '297.6', 'tokens/total': 11534336, 'tokens/trainable': 9600401, 'epoch': '0.9617'}
 49%|███████████████████████████████████████████████▍                                                 | 44/90 [2:25:18<2:28:53, 194.20s/it] 50%|████████████████████████████████████████████████▌                                                | 45/90 [2:28:32<2:25:40, 194.24s/it]                                                                                                                                           {'loss': '0.694', 'grad_norm': '0.07533', 'learning_rate': '1e-05', 'ppl': '2.002', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '276.3', 'tokens/total': 11796480, 'tokens/trainable': 9828463, 'epoch': '0.9836'}
 50%|████████████████████████████████████████████████▌                                                | 45/90 [2:28:32<2:25:40, 194.24s/it][2026-03-21 02:48:38,650] [INFO] [axolotl.core.trainers.base._save:721] [PID:3143] Saving model checkpoint to ./model-output/checkpoint-45
 51%|█████████████████████████████████████████████████▌                                               | 46/90 [2:31:24<2:17:32, 187.57s/it]                                                                                                                                           {'loss': '0.7151', 'grad_norm': '0.1007', 'learning_rate': '1e-05', 'ppl': '2.044', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '123.8', 'tokens/total': 11960320, 'tokens/trainable': 9954548, 'epoch': '1'}
 51%|█████████████████████████████████████████████████▌                                               | 46/90 [2:31:24<2:17:32, 187.57s/it] 52%|██████████████████████████████████████████████████▋                                              | 47/90 [2:34:41<2:16:22, 190.30s/it]                                                                                                                                           {'loss': '0.7023', 'grad_norm': '0.08667', 'learning_rate': '1e-05', 'ppl': '2.018', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '284.8', 'tokens/total': 12222464, 'tokens/trainable': 10173413, 'epoch': '1.022'}
 52%|██████████████████████████████████████████████████▋                                              | 47/90 [2:34:41<2:16:22, 190.30s/it] 53%|███████████████████████████████████████████████████▋                                             | 48/90 [2:37:55<2:14:01, 191.48s/it]                                                                                                                                           {'loss': '0.7292', 'grad_norm': '0.08012', 'learning_rate': '1e-05', 'ppl': '2.073', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '301.4', 'tokens/total': 12484608, 'tokens/trainable': 10391800, 'epoch': '1.044'}
 53%|███████████████████████████████████████████████████▋                                             | 48/90 [2:37:55<2:14:01, 191.48s/it] 54%|████████████████████████████████████████████████████▊                                            | 49/90 [2:41:09<2:11:25, 192.34s/it]                                                                                                                                           {'loss': '0.7073', 'grad_norm': '0.07503', 'learning_rate': '1e-05', 'ppl': '2.029', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '273.1', 'tokens/total': 12746752, 'tokens/trainable': 10604221, 'epoch': '1.066'}
 54%|████████████████████████████████████████████████████▊                                            | 49/90 [2:41:09<2:11:25, 192.34s/it] 56%|█████████████████████████████████████████████████████▉                                           | 50/90 [2:44:24<2:08:36, 192.92s/it]                                                                                                                                           {'loss': '0.7207', 'grad_norm': '0.07128', 'learning_rate': '1e-05', 'ppl': '2.056', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '269.4', 'tokens/total': 13008896, 'tokens/trainable': 10818158, 'epoch': '1.087'}
 56%|█████████████████████████████████████████████████████▉                                           | 50/90 [2:44:24<2:08:36, 192.92s/it] 57%|██████████████████████████████████████████████████████▉                                          | 51/90 [2:47:38<2:05:38, 193.29s/it]                                                                                                                                           {'loss': '0.7003', 'grad_norm': '0.07703', 'learning_rate': '1e-05', 'ppl': '2.014', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '301.6', 'tokens/total': 13271040, 'tokens/trainable': 11037824, 'epoch': '1.109'}
 57%|██████████████████████████████████████████████████████▉                                          | 51/90 [2:47:38<2:05:38, 193.29s/it] 58%|████████████████████████████████████████████████████████                                         | 52/90 [2:50:52<2:02:34, 193.53s/it]                                                                                                                                           {'loss': '0.643', 'grad_norm': '0.06556', 'learning_rate': '1e-05', 'ppl': '1.902', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '306.3', 'tokens/total': 13533184, 'tokens/trainable': 11253286, 'epoch': '1.131'}
 58%|████████████████████████████████████████████████████████                                         | 52/90 [2:50:52<2:02:34, 193.53s/it] 59%|█████████████████████████████████████████████████████████                                        | 53/90 [2:54:06<1:59:28, 193.74s/it]                                                                                                                                           {'loss': '0.6834', 'grad_norm': '0.06421', 'learning_rate': '1e-05', 'ppl': '1.981', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '294.3', 'tokens/total': 13795328, 'tokens/trainable': 11480965, 'epoch': '1.153'}
 59%|█████████████████████████████████████████████████████████                                        | 53/90 [2:54:06<1:59:28, 193.74s/it] 60%|██████████████████████████████████████████████████████████▏                                      | 54/90 [2:57:20<1:56:20, 193.90s/it]                                                                                                                                           {'loss': '0.678', 'grad_norm': '0.1167', 'learning_rate': '1e-05', 'ppl': '1.97', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '268.9', 'tokens/total': 14057472, 'tokens/trainable': 11704522, 'epoch': '1.175'}
 60%|██████████████████████████████████████████████████████████▏                                      | 54/90 [2:57:20<1:56:20, 193.90s/it] 61%|███████████████████████████████████████████████████████████▎                                     | 55/90 [3:00:35<1:53:15, 194.16s/it]                                                                                                                                           {'loss': '0.6854', 'grad_norm': '0.06601', 'learning_rate': '1e-05', 'ppl': '1.985', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '287', 'tokens/total': 14319616, 'tokens/trainable': 11930214, 'epoch': '1.197'}
 61%|███████████████████████████████████████████████████████████▎                                     | 55/90 [3:00:35<1:53:15, 194.16s/it] 62%|████████████████████████████████████████████████████████████▎                                    | 56/90 [3:03:49<1:50:00, 194.15s/it]                                                                                                                                           {'loss': '0.6284', 'grad_norm': '0.0666', 'learning_rate': '1e-05', 'ppl': '1.875', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '305.3', 'tokens/total': 14581760, 'tokens/trainable': 12151710, 'epoch': '1.219'}
 62%|████████████████████████████████████████████████████████████▎                                    | 56/90 [3:03:49<1:50:00, 194.15s/it] 63%|█████████████████████████████████████████████████████████████▍                                   | 57/90 [3:07:04<1:46:48, 194.20s/it]                                                                                                                                           {'loss': '0.6214', 'grad_norm': '0.06623', 'learning_rate': '1e-05', 'ppl': '1.862', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '280.2', 'tokens/total': 14843904, 'tokens/trainable': 12368803, 'epoch': '1.24'}
 63%|█████████████████████████████████████████████████████████████▍                                   | 57/90 [3:07:04<1:46:48, 194.20s/it] 64%|██████████████████████████████████████████████████████████████▌                                  | 58/90 [3:10:18<1:43:36, 194.26s/it]                                                                                                                                           {'loss': '0.7014', 'grad_norm': '0.07006', 'learning_rate': '1e-05', 'ppl': '2.017', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '259.7', 'tokens/total': 15106048, 'tokens/trainable': 12584024, 'epoch': '1.262'}
 64%|██████████████████████████████████████████████████████████████▌                                  | 58/90 [3:10:18<1:43:36, 194.26s/it] 66%|███████████████████████████████████████████████████████████████▌                                 | 59/90 [3:13:32<1:40:23, 194.29s/it]                                                                                                                                           {'loss': '0.6885', 'grad_norm': '0.06688', 'learning_rate': '1e-05', 'ppl': '1.991', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '271.4', 'tokens/total': 15368192, 'tokens/trainable': 12803302, 'epoch': '1.284'}
 66%|███████████████████████████████████████████████████████████████▌                                 | 59/90 [3:13:32<1:40:23, 194.29s/it] 67%|████████████████████████████████████████████████████████████████▋                                | 60/90 [3:16:47<1:37:09, 194.31s/it]                                                                                                                                           {'loss': '0.6836', 'grad_norm': '0.06747', 'learning_rate': '1e-05', 'ppl': '1.981', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '279.8', 'tokens/total': 15630336, 'tokens/trainable': 13026503, 'epoch': '1.306'}
 67%|████████████████████████████████████████████████████████████████▋                                | 60/90 [3:16:47<1:37:09, 194.31s/it] 68%|█████████████████████████████████████████████████████████████████▋                               | 61/90 [3:20:01<1:33:53, 194.24s/it]                                                                                                                                           {'loss': '0.6746', 'grad_norm': '0.07845', 'learning_rate': '1e-05', 'ppl': '1.963', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '288.7', 'tokens/total': 15892480, 'tokens/trainable': 13236174, 'epoch': '1.328'}
 68%|█████████████████████████████████████████████████████████████████▋                               | 61/90 [3:20:01<1:33:53, 194.24s/it] 69%|██████████████████████████████████████████████████████████████████▊                              | 62/90 [3:23:15<1:30:36, 194.17s/it]                                                                                                                                           {'loss': '0.6288', 'grad_norm': '0.07305', 'learning_rate': '1e-05', 'ppl': '1.875', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '249.2', 'tokens/total': 16154624, 'tokens/trainable': 13446007, 'epoch': '1.35'}
 69%|██████████████████████████████████████████████████████████████████▊                              | 62/90 [3:23:15<1:30:36, 194.17s/it] 70%|███████████████████████████████████████████████████████████████████▉                             | 63/90 [3:26:29<1:27:21, 194.13s/it]                                                                                                                                           {'loss': '0.6904', 'grad_norm': '0.07515', 'learning_rate': '1e-05', 'ppl': '1.994', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '258.8', 'tokens/total': 16416768, 'tokens/trainable': 13655189, 'epoch': '1.372'}
 70%|███████████████████████████████████████████████████████████████████▉                             | 63/90 [3:26:29<1:27:21, 194.13s/it] 71%|████████████████████████████████████████████████████████████████████▉                            | 64/90 [3:29:43<1:24:07, 194.13s/it]                                                                                                                                           {'loss': '0.6879', 'grad_norm': '0.07398', 'learning_rate': '1e-05', 'ppl': '1.99', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '287.8', 'tokens/total': 16678912, 'tokens/trainable': 13875915, 'epoch': '1.393'}
 71%|████████████████████████████████████████████████████████████████████▉                            | 64/90 [3:29:43<1:24:07, 194.13s/it] 72%|██████████████████████████████████████████████████████████████████████                           | 65/90 [3:32:57<1:20:54, 194.20s/it]                                                                                                                                           {'loss': '0.6837', 'grad_norm': '0.06895', 'learning_rate': '1e-05', 'ppl': '1.981', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '296.8', 'tokens/total': 16941056, 'tokens/trainable': 14099856, 'epoch': '1.415'}
 72%|██████████████████████████████████████████████████████████████████████                           | 65/90 [3:32:57<1:20:54, 194.20s/it] 73%|███████████████████████████████████████████████████████████████████████▏                         | 66/90 [3:36:11<1:17:39, 194.15s/it]                                                                                                                                           {'loss': '0.661', 'grad_norm': '0.08005', 'learning_rate': '1e-05', 'ppl': '1.937', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '292.4', 'tokens/total': 17203200, 'tokens/trainable': 14318671, 'epoch': '1.437'}
 73%|███████████████████████████████████████████████████████████████████████▏                         | 66/90 [3:36:11<1:17:39, 194.15s/it] 74%|████████████████████████████████████████████████████████████████████████▏                        | 67/90 [3:39:26<1:14:26, 194.18s/it]                                                                                                                                           {'loss': '0.6365', 'grad_norm': '0.06262', 'learning_rate': '1e-05', 'ppl': '1.89', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '304.4', 'tokens/total': 17465344, 'tokens/trainable': 14545753, 'epoch': '1.459'}
 74%|████████████████████████████████████████████████████████████████████████▏                        | 67/90 [3:39:26<1:14:26, 194.18s/it] 76%|█████████████████████████████████████████████████████████████████████████▎                       | 68/90 [3:42:40<1:11:12, 194.21s/it]                                                                                                                                           {'loss': '0.6894', 'grad_norm': '0.07881', 'learning_rate': '1e-05', 'ppl': '1.993', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '296.4', 'tokens/total': 17727488, 'tokens/trainable': 14769902, 'epoch': '1.481'}
 76%|█████████████████████████████████████████████████████████████████████████▎                       | 68/90 [3:42:40<1:11:12, 194.21s/it] 77%|██████████████████████████████████████████████████████████████████████████▎                      | 69/90 [3:45:54<1:07:57, 194.16s/it]                                                                                                                                           {'loss': '0.6555', 'grad_norm': '0.08518', 'learning_rate': '1e-05', 'ppl': '1.926', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '264.9', 'tokens/total': 17989632, 'tokens/trainable': 14984228, 'epoch': '1.503'}
 77%|██████████████████████████████████████████████████████████████████████████▎                      | 69/90 [3:45:54<1:07:57, 194.16s/it] 78%|███████████████████████████████████████████████████████████████████████████▍                     | 70/90 [3:49:08<1:04:42, 194.14s/it]                                                                                                                                           {'loss': '0.6922', 'grad_norm': '0.07176', 'learning_rate': '1e-05', 'ppl': '1.998', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '233', 'tokens/total': 18251776, 'tokens/trainable': 15190075, 'epoch': '1.525'}
 78%|███████████████████████████████████████████████████████████████████████████▍                     | 70/90 [3:49:08<1:04:42, 194.14s/it] 79%|████████████████████████████████████████████████████████████████████████████▌                    | 71/90 [3:52:22<1:01:28, 194.14s/it]                                                                                                                                           {'loss': '0.6945', 'grad_norm': '0.07226', 'learning_rate': '1e-05', 'ppl': '2.003', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '290.4', 'tokens/total': 18513920, 'tokens/trainable': 15399802, 'epoch': '1.546'}
 79%|████████████████████████████████████████████████████████████████████████████▌                    | 71/90 [3:52:22<1:01:28, 194.14s/it] 80%|███████████████████████████████████████████████████████████████████████████████▏                   | 72/90 [3:55:36<58:15, 194.21s/it]                                                                                                                                           {'loss': '0.6815', 'grad_norm': '0.0664', 'learning_rate': '1e-05', 'ppl': '1.977', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '280.3', 'tokens/total': 18776064, 'tokens/trainable': 15622658, 'epoch': '1.568'}
 80%|███████████████████████████████████████████████████████████████████████████████▏                   | 72/90 [3:55:36<58:15, 194.21s/it] 81%|████████████████████████████████████████████████████████████████████████████████▎                  | 73/90 [3:58:51<55:01, 194.21s/it]                                                                                                                                           {'loss': '0.684', 'grad_norm': '0.08807', 'learning_rate': '1e-05', 'ppl': '1.982', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '247.6', 'tokens/total': 19038208, 'tokens/trainable': 15837607, 'epoch': '1.59'}
 81%|████████████████████████████████████████████████████████████████████████████████▎                  | 73/90 [3:58:51<55:01, 194.21s/it] 82%|█████████████████████████████████████████████████████████████████████████████████▍                 | 74/90 [4:02:05<51:47, 194.21s/it]                                                                                                                                           {'loss': '0.6805', 'grad_norm': '0.07422', 'learning_rate': '1e-05', 'ppl': '1.975', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '274.6', 'tokens/total': 19300352, 'tokens/trainable': 16053938, 'epoch': '1.612'}
 82%|█████████████████████████████████████████████████████████████████████████████████▍                 | 74/90 [4:02:05<51:47, 194.21s/it] 83%|██████████████████████████████████████████████████████████████████████████████████▌                | 75/90 [4:05:19<48:33, 194.24s/it]                                                                                                                                           {'loss': '0.7', 'grad_norm': '0.06819', 'learning_rate': '1e-05', 'ppl': '2.014', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '262.1', 'tokens/total': 19562496, 'tokens/trainable': 16270282, 'epoch': '1.634'}
 83%|██████████████████████████████████████████████████████████████████████████████████▌                | 75/90 [4:05:19<48:33, 194.24s/it] 84%|███████████████████████████████████████████████████████████████████████████████████▌               | 76/90 [4:08:33<45:18, 194.18s/it]                                                                                                                                           {'loss': '0.7011', 'grad_norm': '0.07372', 'learning_rate': '1e-05', 'ppl': '2.016', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '259.9', 'tokens/total': 19824640, 'tokens/trainable': 16486278, 'epoch': '1.656'}
 84%|███████████████████████████████████████████████████████████████████████████████████▌               | 76/90 [4:08:33<45:18, 194.18s/it] 86%|████████████████████████████████████████████████████████████████████████████████████▋              | 77/90 [4:11:48<42:05, 194.25s/it]                                                                                                                                           {'loss': '0.7008', 'grad_norm': '0.07398', 'learning_rate': '1e-05', 'ppl': '2.015', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '307.9', 'tokens/total': 20086784, 'tokens/trainable': 16712172, 'epoch': '1.678'}
 86%|████████████████████████████████████████████████████████████████████████████████████▋              | 77/90 [4:11:48<42:05, 194.25s/it] 87%|█████████████████████████████████████████████████████████████████████████████████████▊             | 78/90 [4:15:02<38:50, 194.24s/it]                                                                                                                                           {'loss': '0.6221', 'grad_norm': '0.07666', 'learning_rate': '1e-05', 'ppl': '1.863', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '305.6', 'tokens/total': 20348928, 'tokens/trainable': 16931092, 'epoch': '1.699'}
 87%|█████████████████████████████████████████████████████████████████████████████████████▊             | 78/90 [4:15:02<38:50, 194.24s/it] 88%|██████████████████████████████████████████████████████████████████████████████████████▉            | 79/90 [4:18:16<35:36, 194.20s/it]                                                                                                                                           {'loss': '0.6462', 'grad_norm': '0.08494', 'learning_rate': '1e-05', 'ppl': '1.908', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '274.3', 'tokens/total': 20611072, 'tokens/trainable': 17153198, 'epoch': '1.721'}
 88%|██████████████████████████████████████████████████████████████████████████████████████▉            | 79/90 [4:18:16<35:36, 194.20s/it] 89%|████████████████████████████████████████████████████████████████████████████████████████           | 80/90 [4:21:30<32:21, 194.13s/it]                                                                                                                                           {'loss': '0.6746', 'grad_norm': '0.2009', 'learning_rate': '1e-05', 'ppl': '1.963', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '286.6', 'tokens/total': 20873216, 'tokens/trainable': 17368220, 'epoch': '1.743'}
 89%|████████████████████████████████████████████████████████████████████████████████████████           | 80/90 [4:21:30<32:21, 194.13s/it] 90%|█████████████████████████████████████████████████████████████████████████████████████████          | 81/90 [4:24:44<29:07, 194.22s/it]                                                                                                                                           {'loss': '0.6731', 'grad_norm': '0.07809', 'learning_rate': '1e-05', 'ppl': '1.96', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '292.1', 'tokens/total': 21135360, 'tokens/trainable': 17596832, 'epoch': '1.765'}
 90%|█████████████████████████████████████████████████████████████████████████████████████████          | 81/90 [4:24:44<29:07, 194.22s/it] 91%|██████████████████████████████████████████████████████████████████████████████████████████▏        | 82/90 [4:27:59<25:53, 194.24s/it]                                                                                                                                           {'loss': '0.6511', 'grad_norm': '0.08194', 'learning_rate': '1e-05', 'ppl': '1.918', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '307', 'tokens/total': 21397504, 'tokens/trainable': 17825896, 'epoch': '1.787'}
 91%|██████████████████████████████████████████████████████████████████████████████████████████▏        | 82/90 [4:27:59<25:53, 194.24s/it] 92%|███████████████████████████████████████████████████████████████████████████████████████████▎       | 83/90 [4:31:13<22:39, 194.19s/it]                                                                                                                                           {'loss': '0.6461', 'grad_norm': '0.08777', 'learning_rate': '1e-05', 'ppl': '1.908', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '293.9', 'tokens/total': 21659648, 'tokens/trainable': 18046552, 'epoch': '1.809'}
 92%|███████████████████████████████████████████████████████████████████████████████████████████▎       | 83/90 [4:31:13<22:39, 194.19s/it] 93%|████████████████████████████████████████████████████████████████████████████████████████████▍      | 84/90 [4:34:27<19:24, 194.12s/it]                                                                                                                                           {'loss': '0.634', 'grad_norm': '0.1761', 'learning_rate': '1e-05', 'ppl': '1.885', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '280.7', 'tokens/total': 21921792, 'tokens/trainable': 18270408, 'epoch': '1.831'}
 93%|████████████████████████████████████████████████████████████████████████████████████████████▍      | 84/90 [4:34:27<19:24, 194.12s/it] 94%|█████████████████████████████████████████████████████████████████████████████████████████████▌     | 85/90 [4:37:41<16:10, 194.04s/it]                                                                                                                                           {'loss': '0.6599', 'grad_norm': '0.0759', 'learning_rate': '1e-05', 'ppl': '1.935', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '241.2', 'tokens/total': 22183936, 'tokens/trainable': 18473082, 'epoch': '1.852'}
 94%|█████████████████████████████████████████████████████████████████████████████████████████████▌     | 85/90 [4:37:41<16:10, 194.04s/it] 96%|██████████████████████████████████████████████████████████████████████████████████████████████▌    | 86/90 [4:40:55<12:56, 194.06s/it]                                                                                                                                           {'loss': '0.7237', 'grad_norm': '0.0772', 'learning_rate': '1e-05', 'ppl': '2.062', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '221.1', 'tokens/total': 22446080, 'tokens/trainable': 18677566, 'epoch': '1.874'}
 96%|██████████████████████████████████████████████████████████████████████████████████████████████▌    | 86/90 [4:40:55<12:56, 194.06s/it] 97%|███████████████████████████████████████████████████████████████████████████████████████████████▋   | 87/90 [4:44:09<09:42, 194.08s/it]                                                                                                                                           {'loss': '0.6671', 'grad_norm': '0.07135', 'learning_rate': '1e-05', 'ppl': '1.949', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '299.4', 'tokens/total': 22708224, 'tokens/trainable': 18891550, 'epoch': '1.896'}
 97%|███████████████████████████████████████████████████████████████████████████████████████████████▋   | 87/90 [4:44:09<09:42, 194.08s/it] 98%|████████████████████████████████████████████████████████████████████████████████████████████████▊  | 88/90 [4:47:23<06:28, 194.17s/it]                                                                                                                                           {'loss': '0.6626', 'grad_norm': '0.07485', 'learning_rate': '1e-05', 'ppl': '1.94', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '295.6', 'tokens/total': 22970368, 'tokens/trainable': 19112040, 'epoch': '1.918'}
 98%|████████████████████████████████████████████████████████████████████████████████████████████████▊  | 88/90 [4:47:23<06:28, 194.17s/it] 99%|█████████████████████████████████████████████████████████████████████████████████████████████████▉ | 89/90 [4:50:37<03:14, 194.20s/it]                                                                                                                                           {'loss': '0.6296', 'grad_norm': '0.08641', 'learning_rate': '1e-05', 'ppl': '1.877', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '300.4', 'tokens/total': 23232512, 'tokens/trainable': 19334320, 'epoch': '1.94'}
 99%|█████████████████████████████████████████████████████████████████████████████████████████████████▉ | 89/90 [4:50:37<03:14, 194.20s/it]100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [4:53:53<00:00, 194.70s/it]                                                                                                                                           {'loss': '0.6922', 'grad_norm': '0.1125', 'learning_rate': '1e-05', 'ppl': '1.998', 'memory/max_active (GiB)': '105.5', 'memory/max_allocated (GiB)': '105.5', 'memory/device_reserved (GiB)': '108.6', 'tokens/train_per_sec_per_gpu': '235.3', 'tokens/total': 23494656, 'tokens/trainable': 19549098, 'epoch': '1.962'}
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [4:53:53<00:00, 194.70s/it][2026-03-21 05:13:59,923] [INFO] [axolotl.core.trainers.base._save:721] [PID:3143] Saving model checkpoint to ./model-output/checkpoint-90
                                                                                                                                           {'train_runtime': '1.769e+04', 'train_samples_per_second': '0.163', 'train_steps_per_second': '0.005', 'train_loss': '0.7247', 'memory/max_active (GiB)': '54.12', 'memory/max_allocated (GiB)': '54.12', 'memory/device_reserved (GiB)': '108.6', 'epoch': '1.962', 'tokens/train_per_sec_per_gpu': '0'}
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [4:53:57<00:00, 194.70s/it]100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [4:53:57<00:00, 195.97s/it]
[2026-03-21 05:14:03,876] [INFO] [axolotl.train.save_trained_model:237] [PID:3143] Training completed! Saving trained model to ./model-output.
[2026-03-21 05:14:05,641] [INFO] [axolotl.train.save_trained_model:351] [PID:3143] Model successfully saved to ./model-output