"""Gradio front end for the AsyncAPI assistant language model.

Loads the merged causal LM and its tokenizer on CPU at import time, then
serves a single-textbox question/answer interface.
"""

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# --- Load model & tokenizer ---
MODEL_PATH = "rohith-yarramala/asyncapi-assistant-model-merged"

# Upper bound on the number of tokens generated per answer. This is applied
# as `max_new_tokens`, so the budget is independent of the question length.
MAX_NEW_TOKENS = 300

# Force CPU mode (no bitsandbytes, no quantization).
device = "cpu"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float32,  # CPU-friendly dtype
    device_map=device,  # load the weights on CPU
    # Runs the custom model code shipped with the checkpoint.
    # NOTE(review): this executes Python from the model repository; consider
    # pinning a revision if the repository is not fully trusted.
    trust_remote_code=True,
    low_cpu_mem_usage=True,  # reduce the CPU memory footprint while loading
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
# Reuse EOS as the padding token to avoid generation warnings.
model.config.pad_token_id = tokenizer.eos_token_id

print("✅ Model and tokenizer loaded successfully!")


# --- Chatbot function ---
def asyncapi_chatbot(question: str) -> str:
    """Generate a reply to ``question`` with the loaded model.

    Args:
        question: Free-text question taken from the Gradio textbox.

    Returns:
        The decoded first generated sequence with special tokens removed, or
        a short prompt asking for input when ``question`` is empty or only
        whitespace.
    """
    # Nothing to answer: skip tokenization and generation entirely.
    if not question or not question.strip():
        return "Please enter a question."

    inputs = tokenizer(question, return_tensors="pt").to(device)
    output = model.generate(
        **inputs,
        # `max_length` would also count the prompt tokens, leaving no room to
        # generate once the question itself approaches the limit.
        max_new_tokens=MAX_NEW_TOKENS,
        # Passed explicitly so the pad-token choice applies to this call.
        pad_token_id=model.config.pad_token_id,
        # NOTE(review): the KV cache is disabled as in the original code;
        # presumably a workaround for the custom model code - confirm, since
        # it makes each decoding step reprocess the whole sequence.
        use_cache=False,
    )
    # NOTE(review): the whole first sequence is decoded; for a decoder-only
    # model this presumably includes the question text - confirm whether the
    # echo is wanted in the UI.
    return tokenizer.decode(output[0], skip_special_tokens=True)


# --- Gradio UI ---
css = """ h1 { text-align: center; font-size: 28px; color: #4CAF50; } textarea { font-size: 16px; } """

iface = gr.Interface(
    fn=asyncapi_chatbot,
    inputs=gr.Textbox(
        label="Ask an AsyncAPI Question",
        placeholder="What is an AsyncAPI schema?",
    ),
    outputs=gr.Textbox(label="AI Response"),
    title="AsyncAPI Assistant 🤖",
    description="Ask any question about AsyncAPI, event-driven architecture, or message brokers.",
    theme="compact",
    allow_flagging="never",
    css=css,
)

# --- Launch ---
# Starts the Gradio server with default settings. No `share=True` is passed,
# so this call does not itself request a public share link.
iface.launch()