# ==================================================
# viF5-TTS Pro – Sửa lỗi FFmpeg/Torchcodec
# Đã thay thế cách tải audio tham chiếu để tránh lỗi torchaudio
# ==================================================

import os
import json
import uuid
import unicodedata
import tempfile
import shutil
import time
from datetime import datetime
import socket

import gradio as gr
import numpy as np
import librosa
import whisper
import torch
import soundfile as sf

from huggingface_hub import login, hf_hub_download
from vinorm import TTSnorm
from f5_tts.model import DiT
from f5_tts.infer.utils_infer import (
    load_vocoder,
    load_model,
    infer_process,
    save_spectrogram,
    # KHÔNG DÙNG: preprocess_ref_audio_text, 
    # vì nó gọi torchaudio.load và gây lỗi FFmpeg
)

# ================= PERFORMANCE OPT =================
# Set the Gradio analytics flag to "False" for this process.
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
# Disable autograd globally; nothing in this file trains a model.
torch.set_grad_enabled(False)
torch.backends.cudnn.benchmark = True
torch.set_num_threads(4)

# ================= CONFIG =================
# Root folder for saved voices; can be overridden with the VOICE_ROOT env var.
VOICE_ROOT = os.getenv("VOICE_ROOT", "voices")
# Exported audio and parameter files are written here.
OUTPUT_ROOT = os.path.join(VOICE_ROOT, "outputs")
os.makedirs(VOICE_ROOT, exist_ok=True)
os.makedirs(OUTPUT_ROOT, exist_ok=True)

# JSON file mapping voice id -> display name / model / creation time.
VOICE_INDEX_FILE = os.path.join(VOICE_ROOT, "voice_index.json")

# Mean F0 (Hz) at or above which a reference voice is classified as female.
FEMALE_F0_THRESHOLD = 180.0
# Only the first MAX_ANALYZE_SEC seconds of audio are used for pitch analysis.
MAX_ANALYZE_SEC = 6.0
# Maximum number of characters per text segment produced by split_text.
MAX_SEG_LEN = 220
# Sample rate used when loading reference audio for the F5 model
MODEL_SR = 24000 

# ================= HF LOGIN =================
# Log in to the Hugging Face Hub only when a token is provided via the environment.
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if hf_token:
    login(token=hf_token)

# ================= DEVICE =================
# Prefer CUDA when available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Use the Whisper "base" model for transcribing reference audio
whisper_model = whisper.load_model("base", device=device)
# Vocoder instance shared by every infer_process call in this file.
vocoder = load_vocoder()

# ================= MODELS =================
def hf(repo, file):
    """Fetch ``file`` from the Hugging Face repo ``repo`` via hf_hub_download.

    Returns whatever ``hf_hub_download`` returns (used below as a checkpoint
    or vocab file path).
    """
    downloaded = hf_hub_download(filename=file, repo_id=repo)
    return downloaded

# One row per selectable model: (UI label, HF repo, checkpoint file, vocab file).
_MODEL_SOURCES = (
    (
        "UnixSex - Stable",
        "erax-ai/EraX-Smile-UnixSex-F5",
        "models/model_48000.safetensors",
        "models/vocab.txt",
    ),
    (
        "UnixSex - Overfit",
        "erax-ai/EraX-Smile-UnixSex-F5",
        "models/overfit.safetensors",
        "models/vocab.txt",
    ),
    (
        "Female F5 (EraX)",
        "erax-ai/EraX-Smile-Female-F5-V1.0",
        "model/model_612000.safetensors",
        "model/vocab.txt",
    ),
)

# Maps each UI label to the downloaded checkpoint and vocab files.
# Files are fetched at import time, checkpoint first, in table order.
MODELS_PATH = {
    label: {"ckpt": hf(repo, ckpt_file), "vocab": hf(repo, vocab_file)}
    for label, repo, ckpt_file, vocab_file in _MODEL_SOURCES
}

# Single-slot cache: the most recently loaded model and the label it was loaded for.
_model = None
_model_name = ""


def get_model(name):
    """Return the F5 model registered under ``name``, loading it on first use.

    Only one model is kept at a time; asking for a different label replaces
    the cached one. Raises KeyError when ``name`` is not in MODELS_PATH.
    """
    global _model, _model_name

    # Cache hit: the requested model is already loaded.
    if _model_name == name:
        return _model

    paths = MODELS_PATH[name]
    architecture = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
    _model = load_model(
        DiT,
        architecture,
        ckpt_path=paths["ckpt"],
        vocab_file=paths["vocab"],
    )
    # Record the label only after a successful load.
    _model_name = name
    return _model

# ================= VOICE INDEX =================
def load_voice_index():
    """Load the saved-voice index from VOICE_INDEX_FILE.

    Returns:
        dict: voice id -> metadata. An empty dict is returned when the file
        does not exist, cannot be read, is not valid JSON, or does not
        contain a JSON object.
    """
    try:
        with open(VOICE_INDEX_FILE, 'r', encoding='utf-8') as f:
            index = json.load(f)
    except FileNotFoundError:
        # No voices saved yet: not an error.
        return {}
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        # Interpreter-level exceptions (KeyboardInterrupt, SystemExit) are
        # deliberately no longer swallowed.
        print(f"Error loading voice index: {e}")
        return {}

    # Callers iterate with .items() and assign by key, so anything other than
    # a JSON object is treated as a corrupt index.
    if not isinstance(index, dict):
        print("Error loading voice index: expected a JSON object")
        return {}
    return index

def save_voice_index(index):
    """Persist the voice index to VOICE_INDEX_FILE (best effort).

    The data is first written to a sibling temporary file and then moved over
    the real index with os.replace, so an interrupted write cannot leave a
    truncated index behind. Errors are printed, not raised.

    Args:
        index: JSON-serialisable dict mapping voice id -> metadata.
    """
    tmp_path = VOICE_INDEX_FILE + ".tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(index, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, VOICE_INDEX_FILE)
    except Exception as e:
        print(f"Error saving voice index: {e}")
        # Best-effort cleanup of the partial temporary file; the previous
        # index file (if any) is left untouched.
        try:
            os.remove(tmp_path)
        except OSError:
            pass

def get_voice_choices():
    """Build the dropdown choices for saved voices.

    Returns a list of ``(display_name, voice_id)`` tuples, or an empty list
    when the index is empty.
    """
    saved = load_voice_index()
    if saved:
        return [(entry['display_name'], voice_id) for voice_id, entry in saved.items()]
    return []

# ================= AUDIO UTILS (FIX FFmpeg) =================

def load_audio_for_tts(audio_path):
    """Load an audio file as a mono tensor resampled to MODEL_SR.

    librosa is used for decoding so that torchaudio/FFmpeg is not involved.

    Returns:
        tuple: ``(tensor, MODEL_SR)`` where the tensor is float, has a leading
        dimension of size 1 added, and lives on ``device``.

    Raises:
        ValueError: if ``audio_path`` is empty or None.
    """
    if not audio_path:
        raise ValueError("Audio path cannot be None")

    # librosa.load resamples to the requested rate and returns a numpy array.
    samples, _ = librosa.load(audio_path, sr=MODEL_SR, mono=True)

    waveform = torch.from_numpy(samples).float()
    batched = waveform.unsqueeze(0).to(device)
    return batched, MODEL_SR

# ================= ADVANCED VOICE ANALYSIS =================
def analyze_voice_parameters(audio_path):
    """Analyse a reference recording and return human-readable voice statistics.

    The audio is loaded with librosa at 22050 Hz mono and peak-normalised.
    Pitch, energy and spectral features are computed on the first
    MAX_ANALYZE_SEC seconds; tempo and duration use the whole recording.

    Args:
        audio_path: path to the audio file.

    Returns:
        None when ``audio_path`` is empty; otherwise a dict of label ->
        formatted string. If anything fails, the dict has a single
        ``"Lỗi"`` key holding the error message.
    """
    if not audio_path:
        return None

    try:
        start_time = time.time()

        # A lower sample rate than the TTS model's keeps the analysis fast.
        y, sr = librosa.load(audio_path, sr=22050, mono=True)

        # Peak-normalise; the epsilon avoids division by zero on silence.
        y = y / (np.max(np.abs(y)) + 1e-6)

        # Restrict the frame-level features to the leading part of the clip.
        analyze_samples = min(len(y), int(sr * MAX_ANALYZE_SEC))
        y_analyze = y[:analyze_samples]

        # F0 (pitch) track, with any NaN frames removed.
        f0 = librosa.yin(y_analyze, fmin=70, fmax=400, sr=sr)
        f0_clean = f0[~np.isnan(f0)]
        has_f0 = len(f0_clean) > 0

        # Energy / loudness.
        rms = librosa.feature.rms(y=y_analyze)[0]

        # Spectral shape and noisiness.
        spectral_centroid = librosa.feature.spectral_centroid(y=y_analyze, sr=sr)[0]
        spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y_analyze, sr=sr)[0]
        spectral_rolloff = librosa.feature.spectral_rolloff(y=y_analyze, sr=sr)[0]
        zcr = librosa.feature.zero_crossing_rate(y_analyze)[0]

        tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
        # Depending on the librosa version, tempo is a scalar or a 1-element
        # array; take the first element explicitly instead of relying on the
        # deprecated implicit array-to-float conversion.
        tempo_bpm = float(np.atleast_1d(tempo)[0])
        duration = len(y) / sr

        # Gender guess from the mean pitch (0.0 when no pitch was found).
        mean_f0 = float(np.mean(f0_clean)) if has_f0 else 0.0
        gender = "Nữ" if mean_f0 >= FEMALE_F0_THRESHOLD else "Nam"

        analysis_time = time.time() - start_time

        # Everything is converted to native Python types and formatted so the
        # result can be dumped to JSON as-is.
        params = {
            "Thời lượng": f"{float(duration):.2f}s",
            "Giới tính": gender,
            "F0 trung bình": f"{float(mean_f0):.1f} Hz",
            "F0 min": f"{float(np.min(f0_clean)):.1f} Hz" if has_f0 else "N/A",
            "F0 max": f"{float(np.max(f0_clean)):.1f} Hz" if has_f0 else "N/A",
            "F0 độ lệch chuẩn": f"{float(np.std(f0_clean)):.1f} Hz" if has_f0 else "N/A",
            "Âm lượng trung bình": f"{float(np.mean(rms)):.4f}",
            "Âm lượng max": f"{float(np.max(rms)):.4f}",
            "Spectral Centroid": f"{float(np.mean(spectral_centroid)):.1f} Hz",
            "Spectral Bandwidth": f"{float(np.mean(spectral_bandwidth)):.1f} Hz",
            "Spectral Rolloff": f"{float(np.mean(spectral_rolloff)):.1f} Hz",
            "Zero Crossing Rate": f"{float(np.mean(zcr)):.4f}",
            "Tempo ước tính": f"{tempo_bpm:.1f} BPM",
            "Sample Rate": f"{int(sr)} Hz",
            "Thời gian phân tích": f"{float(analysis_time):.2f}s"
        }

        return params

    except Exception as e:
        # Analysis is best-effort: report the failure to the caller as data.
        print(f"Error analyzing voice: {e}")
        import traceback
        traceback.print_exc()
        return {"Lỗi": str(e)}

def format_voice_params(params):
    """Render a voice-parameter dict as a bulleted, framed text report.

    Returns a placeholder message when ``params`` is empty or None.
    """
    if not params:
        return "Chưa có dữ liệu phân tích"

    rule = "=" * 50
    # Header, a blank line, one bullet per parameter, a blank line, footer.
    lines = [rule, "📊 THÔNG SỐ GIỌNG NÓI CHI TIẾT", rule, ""]
    lines.extend(f"• {label}: {reading}" for label, reading in params.items())
    lines.extend(["", rule])
    return "\n".join(lines)

def export_voice_params(audio_path, voice_name=""):
    """Analyse ``audio_path`` and write the result to JSON and TXT files.

    Files are created in OUTPUT_ROOT and named
    ``<voice name>_<timestamp>_params.{json,txt}``.

    Args:
        audio_path: path of the audio file to analyse.
        voice_name: optional label used as the file-name prefix. Any character
            that is not a letter, digit, combining mark, ``-`` or ``_`` is
            replaced by ``_`` so the name cannot contain path separators.

    Returns:
        tuple: ``(json_path, txt_path, status_message)``; both paths are None
        on failure.
    """
    if not audio_path:
        return None, None, "⚠️ Vui lòng chọn file audio"

    try:
        params = analyze_voice_parameters(audio_path)

        if not params or "Lỗi" in params:
            reason = params.get('Lỗi', 'Unknown error') if params else 'Unknown error'
            return None, None, f"❌ Không thể phân tích giọng: {reason}"

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # The name comes straight from a text box, so neutralise anything that
        # could change the target directory (e.g. "/", "\\", "..").
        cleaned = "".join(
            ch if (ch.isalnum() or ch in "-_" or unicodedata.combining(ch)) else "_"
            for ch in (voice_name or "")
        )
        safe_name = cleaned or "voice"
        base_filename = f"{safe_name}_{timestamp}"

        json_path = os.path.join(OUTPUT_ROOT, f"{base_filename}_params.json")
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(params, f, ensure_ascii=False, indent=2)

        txt_path = os.path.join(OUTPUT_ROOT, f"{base_filename}_params.txt")
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(format_voice_params(params))

        return json_path, txt_path, f"✅ Đã xuất thông số:\n📄 {os.path.basename(json_path)}\n📄 {os.path.basename(txt_path)}"

    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, None, f"❌ Lỗi: {str(e)}"

# ================= GENDER DETECT AND TRANSCRIBE =================
def detect_gender(audio_path):
    """Pick a model label from the mean pitch of the reference audio.

    Returns "Female F5 (EraX)" when the mean F0 of the first MAX_ANALYZE_SEC
    seconds is at least FEMALE_F0_THRESHOLD. In every other case (no path,
    under one second of audio, fewer than 10 pitch frames, any error) the
    default "UnixSex - Stable" is returned.
    """
    fallback = "UnixSex - Stable"
    if not audio_path:
        return fallback

    try:
        samples, rate = librosa.load(audio_path, sr=16000, mono=True)
        # Less than one second of audio is too short to judge.
        if len(samples) < rate:
            return fallback

        peak = np.max(np.abs(samples))
        samples = samples / (peak + 1e-6)
        samples = samples[: int(rate * MAX_ANALYZE_SEC)]

        pitch = librosa.yin(samples, fmin=70, fmax=400, sr=rate)
        voiced = pitch[~np.isnan(pitch)]

        if len(voiced) < 10:
            return fallback

        if np.mean(voiced) >= FEMALE_F0_THRESHOLD:
            return "Female F5 (EraX)"
        return fallback
    except Exception as e:
        print(f"Error detecting gender: {e}")
        return fallback

def transcribe_audio(audio_path):
    """Transcribe Vietnamese speech with the module-level Whisper model.

    Returns the stripped transcript, or an empty string when no path is given
    or transcription fails (the error is printed).
    """
    if not audio_path:
        return ""

    try:
        transcript = whisper_model.transcribe(audio_path, language="vi")["text"]
        return transcript.strip()
    except Exception as e:
        print(f"Error transcribing audio: {e}")
        return ""

def auto_detect_and_transcribe(audio_path):
    """Run gender detection, transcription and voice analysis on one file.

    Returns:
        tuple: ``(model_label, reference_text, formatted_parameters)``. When
        the transcript is empty, the reference text is replaced by a prompt
        asking the user to type it manually.
    """
    if not audio_path:
        return "UnixSex - Stable", "⚠️ Vui lòng tải lên file audio trước", ""

    suggested_model = detect_gender(audio_path)
    transcript = transcribe_audio(audio_path)
    report = format_voice_params(analyze_voice_parameters(audio_path))

    unrecognised = (not transcript) or (
        "không thể nhận diện nội dung" in transcript.lower()
    )
    if unrecognised:
        transcript = "⚠️ Không thể nhận diện nội dung. Vui lòng nhập thủ công."

    return suggested_model, transcript, report

# ================= TEXT SPLIT =================
def split_text(text, max_len=None):
    """Split text into sentence-like segments no longer than ``max_len``.

    The text is cut after ``.``, ``!``, ``?`` or ``;``. A fragment shorter
    than 5 characters is not emitted on its own; it is carried over and joined
    with the text that follows, so no input text is discarded. Segments
    longer than ``max_len`` are broken at the last whitespace that fits, and
    only cut mid-word when there is no whitespace to break on.

    Args:
        text: the text to split.
        max_len: maximum segment length in characters; defaults to the
            module-level MAX_SEG_LEN.

    Returns:
        list[str]: non-empty, stripped segments in their original order.

    Raises:
        ValueError: if ``max_len`` is smaller than 1.
    """
    limit = MAX_SEG_LEN if max_len is None else max_len
    if limit < 1:
        raise ValueError("max_len must be at least 1")

    # Pass 1: cut at sentence terminators, merging very short fragments forward.
    sentences = []
    buf = ""
    for ch in text:
        buf += ch
        if ch in ".!?;" and len(buf.strip()) >= 5:
            sentences.append(buf.strip())
            buf = ""
    tail = buf.strip()
    if tail:
        sentences.append(tail)

    # Pass 2: break over-long sentences, preferring whitespace boundaries.
    segments = []
    for sentence in sentences:
        while len(sentence) > limit:
            cut = limit
            for i in range(limit, 0, -1):
                if sentence[i].isspace():
                    cut = i
                    break
            segments.append(sentence[:cut].strip())
            sentence = sentence[cut:].strip()
        if sentence:
            segments.append(sentence)
    return segments

# ================= VOICE MEMORY =================
def save_voice(name, audio, ref_text, model):
    """Store a reference voice (audio, transcript, model, analysis) on disk.

    A folder ``<VOICE_ROOT>/<voice id>`` is created containing ``ref.wav`` and
    ``meta.json``, and the voice is registered in the voice index.

    Args:
        name: display name typed by the user.
        audio: path of the reference audio file.
        ref_text: transcript of the reference audio.
        model: model label to use with this voice.

    Returns:
        tuple: ``(dropdown_update, status_message)``.
    """
    display_name = (name or "").strip()
    if not audio or not display_name:
        return gr.update(choices=get_voice_choices()), "⚠️ Vui lòng nhập tên và chọn audio"

    # The id becomes a directory name, so keep only letters, digits,
    # combining marks, "-" and "_" (spaces and path separators become "_").
    slug = "".join(
        ch if (ch.isalnum() or ch in "-_" or unicodedata.combining(ch)) else "_"
        for ch in display_name
    )
    vid = f"{slug}_{uuid.uuid4().hex[:6]}"
    folder = os.path.join(VOICE_ROOT, vid)
    registered = False

    try:
        os.makedirs(folder, exist_ok=True)

        # Keep a private copy of the reference audio.
        shutil.copy(audio, os.path.join(folder, "ref.wav"))

        params = analyze_voice_parameters(audio)

        # Metadata is stored next to the audio, including the analysis result.
        with open(os.path.join(folder, "meta.json"), "w", encoding="utf-8") as f:
            json.dump(
                {
                    "ref_text": ref_text,
                    "model": model,
                    "voice_params": params,
                    "created_at": datetime.now().isoformat()
                },
                f,
                ensure_ascii=False,
                indent=2,
            )

        # Register the voice so it shows up in the dropdowns.
        index = load_voice_index()
        index[vid] = {
            "display_name": display_name,
            "model": model,
            "created_at": datetime.now().isoformat()
        }
        save_voice_index(index)
        registered = True

        return gr.update(choices=get_voice_choices(), value=vid), f"✅ Đã lưu giọng '{display_name}' với thông số phân tích"

    except Exception as e:
        print("SAVE VOICE FAIL:", e)
        import traceback
        traceback.print_exc()
        # Do not leave a half-written folder behind for a voice that never
        # made it into the index.
        if not registered:
            shutil.rmtree(folder, ignore_errors=True)
        return gr.update(choices=get_voice_choices()), f"❌ Lỗi: {str(e)}"

def load_voice(vid):
    """Read a saved voice back from disk.

    Returns:
        tuple: ``(ref_audio_path, ref_text, model_label, info_text)``. On
        success ``info_text`` is the formatted voice parameters (or ""). On
        failure the path is None, the model is the default label and
        ``info_text`` carries the error message ("" when no id was given).
    """
    default_model = "UnixSex - Stable"
    if not vid:
        return None, "", default_model, ""

    folder = os.path.join(VOICE_ROOT, vid)
    ref_audio_path = os.path.join(folder, "ref.wav")

    if not os.path.exists(ref_audio_path):
        return None, "", default_model, f"❌ Lỗi: Không tìm thấy file audio mẫu trong {folder}"

    try:
        meta_path = os.path.join(folder, "meta.json")
        with open(meta_path, encoding="utf-8") as f:
            meta = json.load(f)

        # Older or partial metadata may lack the analysis section.
        if "voice_params" in meta:
            params_text = format_voice_params(meta["voice_params"])
        else:
            params_text = ""

        stored_text = meta.get("ref_text", "")
        stored_model = meta.get("model", default_model)
        return ref_audio_path, stored_text, stored_model, params_text
    except Exception as e:
        print(f"Error loading voice: {e}")
        return None, "", default_model, f"❌ Lỗi: {str(e)}"

def delete_voice(vid):
    """Delete a saved voice: its folder under VOICE_ROOT and its index entry.

    The id is validated before anything is removed: it must resolve to a
    direct child of VOICE_ROOT and must not be the shared outputs folder, so
    values such as "..", "." or "a/b" can never reach shutil.rmtree.

    Returns:
        tuple: ``(dropdown_update, status_message)``.
    """
    if not vid:
        return gr.update(choices=get_voice_choices()), "⚠️ Vui lòng chọn giọng cần xóa"

    try:
        root = os.path.realpath(VOICE_ROOT)
        folder = os.path.realpath(os.path.join(VOICE_ROOT, vid))
        is_direct_child = os.path.dirname(folder) == root
        if not is_direct_child or folder == os.path.realpath(OUTPUT_ROOT):
            return gr.update(choices=get_voice_choices()), "❌ Lỗi: ID giọng không hợp lệ"

        if os.path.isdir(folder):
            shutil.rmtree(folder)

        index = load_voice_index()
        if vid in index:
            del index[vid]
            save_voice_index(index)

        return gr.update(choices=get_voice_choices(), value=None), "✅ Đã xóa giọng"

    except Exception as e:
        print(f"Error deleting voice: {e}")
        return gr.update(choices=get_voice_choices()), f"❌ Lỗi: {str(e)}"

# ================= FAST AUDIO EXPORT =================
def fast_export_audio(audio_data, sample_rate, filename_prefix="tts_output"):
    """Write generated audio to OUTPUT_ROOT as WAV and, if possible, MP3.

    The WAV file is written with soundfile. The MP3 is produced by calling the
    ``ffmpeg`` executable when one is found on PATH; if ffmpeg is missing,
    fails or times out, only the WAV is exported and any partial MP3 file is
    removed.

    Args:
        audio_data: audio samples accepted by ``soundfile.write``.
        sample_rate: sample rate of ``audio_data`` in Hz.
        filename_prefix: prefix for the timestamped output file names.

    Returns:
        tuple: ``(wav_path, mp3_path, status_message)``; ``mp3_path`` is None
        when no MP3 was produced, and both paths are None on failure.
    """
    if audio_data is None:
        return None, None, "⚠️ Chưa có audio để xuất"

    import subprocess

    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        base_name = f"{filename_prefix}_{timestamp}"

        # WAV (lossless) is written directly, without ffmpeg.
        wav_path = os.path.join(OUTPUT_ROOT, f"{base_name}.wav")
        sf.write(wav_path, audio_data, sample_rate)

        # MP3 (compressed) is optional and depends on an ffmpeg executable.
        mp3_path = None
        ffmpeg = shutil.which('ffmpeg')
        if ffmpeg:
            candidate = os.path.join(OUTPUT_ROOT, f"{base_name}.mp3")
            try:
                subprocess.run(
                    [
                        ffmpeg, '-y', '-loglevel', 'error',
                        '-i', wav_path,
                        '-codec:a', 'libmp3lame', '-qscale:a', '2',
                        candidate,
                    ],
                    check=True,
                    capture_output=True,
                    timeout=30,
                    # Never let ffmpeg wait for interactive input.
                    stdin=subprocess.DEVNULL,
                )
                mp3_path = candidate
            except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError):
                # A failed or killed run can leave a truncated file behind.
                try:
                    os.remove(candidate)
                except OSError:
                    pass

        msg = f"✅ Đã xuất:\n📁 {os.path.basename(wav_path)}"
        if mp3_path and os.path.exists(mp3_path):
            msg += f"\n📁 {os.path.basename(mp3_path)}"
        else:
            mp3_path = None
            msg += "\n(❌ Không thể tạo MP3: Thiếu ffmpeg hoặc lỗi gọi lệnh)"

        return wav_path, mp3_path, msg

    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, None, f"❌ Lỗi xuất file: {str(e)}"

# ================= INFER (FIXED INPUT) =================
def infer_tts(model_choice, ref_audio_path, ref_text, gen_text, speed):
    """Synthesize ``gen_text`` in the voice of the reference recording.

    The text is NFC-normalised, passed through TTSnorm, split into segments
    and each segment is synthesized with ``infer_process``; the resulting
    waveforms are concatenated. The spectrogram of the first segment is saved
    to a temporary PNG.

    Note: ``infer_process`` is given the reference audio *path* and decodes
    the file itself, so the audio backend used by f5_tts must still be able
    to read it.

    Args:
        model_choice: key of MODELS_PATH selecting the model.
        ref_audio_path: path of the reference audio file.
        ref_text: transcript of the reference audio.
        gen_text: text to synthesize.
        speed: speed factor forwarded to ``infer_process``.

    Returns:
        tuple: ``((sample_rate, waveform), spectrogram_path, status)`` on
        success, or ``(None, None, status)`` with a warning/error message.
    """
    if not ref_audio_path or not ref_text or not gen_text:
        return None, None, "⚠️ Vui lòng điền đầy đủ thông tin"

    try:
        start_time = time.time()

        # Normalise and split the text first: it is cheap and lets us bail out
        # before loading a model when nothing readable remains.
        gen_text = " ".join(TTSnorm(unicodedata.normalize("NFC", gen_text)).split())
        segments = split_text(gen_text)
        if not segments:
            return None, None, "⚠️ Vui lòng nhập nội dung cần đọc"

        model = get_model(model_choice)

        waves = []
        sr_final, spec_img = None, None
        for seg in segments:
            wav, sr, spec = infer_process(
                ref_audio_path, ref_text, seg, model, vocoder, speed=speed
            )
            # Sample rate and spectrogram are taken from the first segment.
            if sr_final is None:
                sr_final, spec_img = sr, spec
            waves.append(wav)

        final_wave = np.concatenate(waves)

        processing_time = time.time() - start_time

        # delete=False: the PNG must outlive this function so the UI can show it.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
            spec_path = f.name
        save_spectrogram(spec_img, spec_path)

        return (sr_final, final_wave), spec_path, f"✅ Tạo thành công! ({processing_time:.2f}s)"

    except Exception as e:
        print(f"Error in TTS: {e}")
        import traceback
        traceback.print_exc()
        return None, None, f"❌ Lỗi: {str(e)}"

def quick_tts_with_saved_voice(voice_id, gen_text, speed):
    """Synthesize ``gen_text`` using a previously saved voice.

    Args:
        voice_id: id of the saved voice (key of the voice index).
        gen_text: text to synthesize.
        speed: speed factor forwarded to ``infer_tts``.

    Returns:
        tuple: the ``infer_tts`` result, or ``(None, None, message)`` when the
        input is incomplete or the saved voice cannot be loaded.
    """
    if not voice_id:
        return None, None, "⚠️ Vui lòng chọn giọng đã lưu"

    if not gen_text or not gen_text.strip():
        return None, None, "⚠️ Vui lòng nhập nội dung cần đọc"

    audio_path, ref_text, model_choice, load_info = load_voice(voice_id)

    # load_voice signals failure with a None audio path and puts the reason in
    # its fourth element (ref_text is always "" in that case).
    if not audio_path:
        message = "❌ Không thể load giọng đã lưu"
        if load_info:
            message += f"\n{load_info}"
        return None, None, message

    return infer_tts(model_choice, audio_path, ref_text, gen_text, speed)

# ================= UI =================
# Build the Gradio UI: two tabs (voice management + quick read) followed by the
# event wiring that connects buttons to the handler functions defined above.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎤 viF5-TTS Pro – Advanced Edition (Fixed for Host)")
    gr.Markdown("✨ Đã sửa lỗi Torchcodec/FFmpeg. Vẫn cần FFmpeg được cài đặt cho các chức năng TTS chính.")

    # NOTE: pitch and gender-shift controls are not wired to any handler;
    # disabled placeholder sliders for them are created in Tab 1 below.

    with gr.Tabs():
        # Tab 1: voice management (reference voice, analysis, save/load, TTS)
        with gr.TabItem("🎙️ Quản lý giọng"):
            with gr.Row():
                # Left column: reference voice input, analysis and saving.
                with gr.Column():
                    model_choice = gr.Dropdown(
                        choices=list(MODELS_PATH.keys()),
                        value="UnixSex - Stable",
                        label="🤖 Model",
                    )
                    audio_input = gr.Audio(type="filepath", label="🎵 Giọng mẫu")
                    ref_text = gr.Textbox(label="📝 Lời giọng mẫu", lines=2)
                    
                    with gr.Row():
                        auto_detect_btn = gr.Button("🧠 Phân tích tự động", variant="secondary")
                        export_params_btn = gr.Button("📊 Xuất thông số", variant="secondary")
                    
                    voice_params_display = gr.Textbox(
                        label="📊 Thông số giọng",
                        lines=10,
                        interactive=False
                    )

                    gr.Markdown("---")
                    voice_name = gr.Textbox(label="💾 Tên giọng", placeholder="VD: Giọng nữ miền Nam")
                    save_status = gr.Textbox(label="📊 Trạng thái", interactive=False)
                    save_btn = gr.Button("💾 Lưu giọng + Thông số", variant="primary")

                # Right column: saved voices, text to read, generation and export.
                with gr.Column():
                    voice_list = gr.Dropdown(
                        choices=get_voice_choices(), 
                        label="📚 Giọng đã lưu",
                        type="value"
                    )
                    with gr.Row():
                        load_btn = gr.Button("📥 Load giọng", variant="secondary")
                        delete_btn = gr.Button("🗑️ Xóa giọng", variant="stop")
                    
                    gr.Markdown("---")
                    gen_text = gr.Textbox(label="📖 Nội dung cần đọc", lines=6)
                    
                    speed = gr.Slider(0.5, 2, 1, 0.1, label="⚡ Tốc độ")
                    
                    # Pitch and gender-shift sliders are displayed but disabled
                    pitch_dummy = gr.Slider(-5, 5, 0, 1, label="🎵 Cao độ (Không khả dụng)", interactive=False)
                    gender_shift_dummy = gr.Slider(-3, 3, 0, 0.5, label="👤 Chuyển giới tính (Không khả dụng)", interactive=False)
                    
                    tts_status = gr.Textbox(label="📊 Trạng thái", interactive=False)
                    btn = gr.Button("🔥 Tạo giọng nói", variant="primary", size="lg")
                    
                    out_audio = gr.Audio(label="🔊 Kết quả")
                    out_img = gr.Image(label="📊 Spectrogram")
                    
                    gr.Markdown("---")
                    export_audio_btn = gr.Button("💾 Xuất file audio", variant="secondary")
                    export_status = gr.Textbox(label="📊 Trạng thái xuất", interactive=False)

        # Tab 2: quick read with a saved voice
        with gr.TabItem("⚡ Đọc nhanh"):
            gr.Markdown("### Tạo giọng nói nhanh chóng với giọng đã lưu")
            
            with gr.Row():
                with gr.Column():
                    quick_voice = gr.Dropdown(
                        choices=get_voice_choices(),
                        label="🎤 Chọn giọng đã lưu",
                        type="value"
                    )
                    quick_text = gr.Textbox(
                        label="📖 Nội dung cần đọc",
                        lines=8,
                        placeholder="Nhập nội dung bạn muốn chuyển thành giọng nói..."
                    )
                    quick_speed = gr.Slider(0.5, 2, 1, 0.1, label="⚡ Tốc độ đọc")
                    
                with gr.Column():
                    quick_status = gr.Textbox(label="📊 Trạng thái", interactive=False)
                    quick_btn = gr.Button("🚀 Tạo giọng nhanh", variant="primary", size="lg")
                    quick_audio = gr.Audio(label="🔊 Kết quả")
                    quick_img = gr.Image(label="📊 Spectrogram")
                    
                    gr.Markdown("---")
                    quick_export_btn = gr.Button("💾 Xuất file audio", variant="secondary")
                    quick_export_status = gr.Textbox(label="📊 Trạng thái xuất", interactive=False)

    # Hidden file components that receive the paths returned by the export handlers
    json_output = gr.File(visible=False)
    txt_output = gr.File(visible=False)
    wav_output = gr.File(visible=False)
    mp3_output = gr.File(visible=False)

    # Event wiring - Tab 1
    auto_detect_btn.click(
        auto_detect_and_transcribe,
        inputs=[audio_input],
        outputs=[model_choice, ref_text, voice_params_display]
    )
    
    export_params_btn.click(
        export_voice_params,
        inputs=[audio_input, voice_name],
        outputs=[json_output, txt_output, save_status]
    )
    
    # After saving, also refresh the voice dropdown on the quick-read tab.
    save_btn.click(
        save_voice,
        [voice_name, audio_input, ref_text, model_choice],
        [voice_list, save_status],
    ).then(
        lambda: gr.update(choices=get_voice_choices()),
        None,
        quick_voice
    )
    
    load_btn.click(
        load_voice, 
        voice_list, 
        [audio_input, ref_text, model_choice, voice_params_display]
    )
    
    # After deleting, also refresh the voice dropdown on the quick-read tab.
    delete_btn.click(
        delete_voice,
        voice_list,
        [voice_list, save_status]
    ).then(
        lambda: gr.update(choices=get_voice_choices()),
        None,
        quick_voice
    )
    
    # Pass only the inputs infer_tts takes (model_choice, audio_input, ref_text, gen_text, speed)
    btn.click(
        infer_tts,
        [model_choice, audio_input, ref_text, gen_text, speed],
        [out_audio, out_img, tts_status],
    )
    
    def export_current_audio(audio_tuple):
        """Export the audio held by an output component via fast_export_audio.

        ``audio_tuple`` is unpacked as ``(sample_rate, data)``; None means
        nothing has been generated yet.
        """
        if audio_tuple is None:
            return None, None, "⚠️ Chưa có audio"
        sr, data = audio_tuple
        # Delegate to the shared export helper
        return fast_export_audio(data, sr, "tts_output")
    
    export_audio_btn.click(
        export_current_audio,
        inputs=[out_audio],
        outputs=[wav_output, mp3_output, export_status]
    )

    # Event wiring - Tab 2
    quick_btn.click(
        quick_tts_with_saved_voice,
        [quick_voice, quick_text, quick_speed],
        [quick_audio, quick_img, quick_status]
    )
    
    quick_export_btn.click(
        export_current_audio,
        inputs=[quick_audio],
        outputs=[wav_output, mp3_output, quick_export_status]
    )

def get_ip():
    """Return the local IPv4 address used for outbound traffic.

    A datagram socket is connected to 8.8.8.8:80 and the socket's own address
    is read back. Falls back to "127.0.0.1" when that fails.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    # The context manager closes the socket on every exit path.
    with probe:
        try:
            probe.connect(("8.8.8.8", 80))
            return probe.getsockname()[0]
        except Exception:
            return "127.0.0.1"

if __name__ == "__main__":
    # Show the LAN address first so other devices know where to connect.
    ip = get_ip()
    print(f"🌐 LAN URL: http://{ip}:7860")

    # Listen on all interfaces on port 7860, without a public share link.
    launch_options = dict(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
    )
    demo.launch(**launch_options)