| { |
| "hf_model_id": "hdlm-group/hdlm-base-epsilon-0.0", |
| "reset_step_for_finetuning": true, |
| "ngpus": 4, |
| "type": "aligned", |
| "gradient_accumulation_steps": 8, |
| "model_type": "epsilon_hybrid", |
| "tokenizer": { |
| "tokens": 50257, |
| "model": "gpt2" |
| }, |
| "training": { |
| "batch_size": 512, |
| "accum": 8, |
| "n_iters": 500000, |
| "snapshot_freq": 5000, |
| "log_freq": 500, |
| "eval_freq": 5000, |
| "snapshot_freq_for_preemption": 1000, |
| "snapshot_sampling": true, |
| "ema": 0.9999, |
| "warmup_iter": 50000, |
| "loss_type": "hybrid", |
| "epsilon": 0.05, |
| "lambda": 5.0, |
| "lr": 1e-05 |
| }, |
| "data": { |
| "train": "openwebtext-train", |
| "valid": "wikitext103", |
| "cache_dir": "/home/toolkit/research-diffcodegen/data", |
| "debug": false |
| }, |
| "annealing": { |
| "type": "none", |
| "efficient": false, |
| "width": 1024, |
| "tau": 1024, |
| "eval_tau": 1024, |
| "sampling_method": "sdlm", |
| "sampling_eps": 0.0001, |
| "attention": { |
| "context_type": "block_causal", |
| "block_type": "full" |
| }, |
| "match_inference": true |
| }, |
| "eval": { |
| "batch_size": 32, |
| "perplexity": true, |
| "perplexity_batch_size": 16 |
| }, |
| "optim": { |
| "weight_decay": 0.1, |
| "optimizer": "AdamW", |
| "lr": 5e-05, |
| "beta1": 0.9, |
| "beta2": 0.95, |
| "eps": 1e-08, |
| "warmup": 10000, |
| "grad_clip": 1.0, |
| "scheduler": "cosine" |
| }, |
| "experiment": { |
| "name": "ft_epsilon_0.05_lambda_5.0", |
| "wandb_project": "Hybrid-SDLM-ALIGNED" |
| }, |
| "model": { |
| "name": "epsilon_hdlm", |
| "type": "ddit", |
| "hidden_size": 768, |
| "cond_dim": 128, |
| "length": 1024, |
| "n_blocks": 12, |
| "n_heads": 12, |
| "dropout": 0.1, |
| "scale_by_sigma": false, |
| "transformer_sigma_conditioning": false, |
| "hybrid_sigma_embedding": false, |
| "post_process_logits": false, |
| "use_timestep_embedding": false |
| } |
| } |