"""Quantize Qwen3.5-9B to NVFP4 with llmcompressor.

Calibration data is an equal mix of four sources (general chat, math,
code, multilingual), each rendered through the model's chat template.
"""

from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, Qwen3_5ForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# NOTE: This example requires transformers >= v5
MODEL_ID = "Qwen/Qwen3.5-9B"

# Load the model and the tokenizer used for chat templating / tokenization.
model = Qwen3_5ForConditionalGeneration.from_pretrained(MODEL_ID, dtype="auto")
processor = AutoTokenizer.from_pretrained(MODEL_ID)

# Quantize every Linear layer to NVFP4 except the modules matched below.
recipe = QuantizationModifier(
    targets="Linear",
    scheme="NVFP4",
    ignore=[
        "lm_head",
        "re:.*visual.*",
        "re:.*linear_attn.*",
    ],
)

NUM_CALIBRATION_SAMPLES = 1024
MAX_SEQUENCE_LENGTH = 8192

# Four calibration domains share the budget equally (256 per domain).
samples_per_split = NUM_CALIBRATION_SAMPLES // 4


def _render_conversation(messages):
    """Return ``{"text": ...}`` with *messages* rendered by the chat template."""
    rendered = processor.apply_chat_template(messages, tokenize=False)
    return {"text": rendered}


def _render_single_turn(user_text, assistant_text):
    """Render one user/assistant exchange through the chat template."""
    return _render_conversation(
        [
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": assistant_text},
        ]
    )


# ------------------------------------------------------------
# 1. General conversation (English)
# ------------------------------------------------------------
def preprocess_chat(example):
    """Render the row's ``messages`` list into a single text field."""
    return _render_conversation(example["messages"])


ds_chat = load_dataset(
    "HuggingFaceH4/ultrachat_200k",
    split=f"train_sft[:{samples_per_split}]",
)
ds_chat = ds_chat.map(preprocess_chat).select_columns(["text"])


# ------------------------------------------------------------
# 2. Math / reasoning
# ------------------------------------------------------------
def preprocess_math(example):
    """Render a question/answer row as a one-turn conversation."""
    return _render_single_turn(example["question"], example["answer"])


ds_math = load_dataset(
    "openai/gsm8k",
    "main",
    split=f"train[:{samples_per_split}]",
)
ds_math = ds_math.map(preprocess_math).select_columns(["text"])


# ------------------------------------------------------------
# 3. Code
# ------------------------------------------------------------
def preprocess_code(example):
    """Render an instruction row, appending its optional ``input`` field."""
    prompt = example["instruction"]
    extra_input = example.get("input")
    if extra_input:
        # A non-empty input is appended to the instruction after a blank line.
        prompt = prompt + "\n\n" + extra_input
    return _render_single_turn(prompt, example["output"])


ds_code = load_dataset(
    "sahil2801/CodeAlpaca-20k",
    split=f"train[:{samples_per_split}]",
)
ds_code = ds_code.map(preprocess_code).select_columns(["text"])


# ------------------------------------------------------------
# 4. Multilingual
# ------------------------------------------------------------
def preprocess_multi(example):
    """Render an ``inputs``/``targets`` row as a one-turn conversation."""
    return _render_single_turn(example["inputs"], example["targets"])


ds_multi = load_dataset(
    "CohereForAI/aya_dataset",
    split=f"train[:{samples_per_split}]",
)
ds_multi = ds_multi.map(preprocess_multi).select_columns(["text"])


# ------------------------------------------------------------
# Combine all datasets and shuffle
# ------------------------------------------------------------
def _has_text(row):
    """Return True when the row's text is non-empty after stripping."""
    return len(row["text"].strip()) > 0


ds = concatenate_datasets([ds_chat, ds_math, ds_code, ds_multi])
ds = ds.shuffle(seed=42)

# Filter out any empty entries just in case.
ds = ds.filter(_has_text)


def tokenize(sample):
    """Tokenize ``sample["text"]``, truncating to MAX_SEQUENCE_LENGTH.

    No special tokens are added and no padding is applied.
    """
    encoded = processor(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )
    return encoded


# Tokenize inputs, dropping the raw text column.
ds = ds.map(tokenize, remove_columns=ds.column_names)

# ------------------------------------------------------------
# Patch: llmcompressor reads attention config from top-level,
# but for this multimodal model it lives in text_config
# ------------------------------------------------------------
text_cfg = model.config.text_config
for attr_name in (
    "num_attention_heads",
    "num_key_value_heads",
    "hidden_size",
    "head_dim",
):
    # Only fill in attributes missing at the top level; never overwrite.
    if hasattr(model.config, attr_name) or not hasattr(text_cfg, attr_name):
        continue
    setattr(model.config, attr_name, getattr(text_cfg, attr_name))

# Apply quantization.
oneshot(
    model=model,
    recipe=recipe,
    dataset=ds,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    moe_calibrate_all_experts=True,
)

# Save to disk in compressed-tensors format.
model_basename = MODEL_ID.rstrip("/").split("/")[-1]
SAVE_DIR = model_basename + "-NVFP4"
model.save_pretrained(SAVE_DIR, safe_serialization=True)
processor.save_pretrained(SAVE_DIR)