nmixx-fin/NMIXX_train
Viewer • Updated • 18.8k • 17 • 2
This repository contains a MiniLM-based SentenceTransformer model fine-tuned with a triplet-loss setup on the nmixx-fin/NMIXX_train dataset. It produces high-quality sentence embeddings for Korean financial text and is optimized for semantic similarity tasks in the finance domain.
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
# Usage example: encode sentences into L2-normalized embeddings with the
# fine-tuned MiniLM checkpoint, using plain `transformers` + `torch`.

# 1. Load tokenizer & model from Hugging Face Hub
repo_name = "nmixx-fin/minilm_nmixx"  # replace with your repository path
tokenizer = AutoTokenizer.from_pretrained(repo_name)
model = AutoModel.from_pretrained(repo_name)

# 2. Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # switch to evaluation mode for inference

# 3. Prepare input sentences
sentences = [
    "이 모델은 학습된 MiniLM 임베딩을 제공합니다.",
    "Hugging Face Hub에서 불러와서 inference를 수행하고 있습니다.",
]

# 4. Tokenize (pad to the longest sentence in the batch, cut at 512 tokens)
encoded_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
# Inputs must live on the same device as the model.
input_ids = encoded_input["input_ids"].to(device)
attention_mask = encoded_input["attention_mask"].to(device)

# 5. Forward pass (token embeddings); no gradients are needed for inference
with torch.no_grad():
    model_output = model(input_ids=input_ids, attention_mask=attention_mask)

# 6. Mean Pooling (account for attention mask)
token_embeddings = model_output[0]  # (batch_size, seq_len, hidden_dim)
# Broadcast the mask over the hidden dimension so padded positions
# contribute zero to the sum.
mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
sum_embeddings = torch.sum(token_embeddings * mask_expanded, dim=1)
# Clamp the token count so an all-padding row cannot divide by zero.
sum_mask = torch.clamp(mask_expanded.sum(dim=1), min=1e-9)
sentence_embeddings = sum_embeddings / sum_mask  # (batch_size, hidden_dim)

# 7. L2 Normalization (unit-length rows)
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

print("Sentence embeddings shape:", sentence_embeddings.shape)
print(sentence_embeddings.cpu())
Base model
nreimers/MiniLM-L6-H384-uncased