import sys
from os import listdir
from os.path import join as opj
from pathlib import Path

import torch
from tqdm import tqdm
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image
|
|
# --- Configuration ---------------------------------------------------------
# Identifier passed to `from_pretrained` for both the processor and the model.
model_name_or_path = "Minthy/ToriiGate-v0.2"
# Directory scanned for images; tag files are read from it and caption files
# are written into it.
s_dir = './images_to_caption'
# Suffix appended to an image's stem to name the caption file that is written.
caption_suffix = '_caption_tags.txt'
# Suffix appended to an image's stem to locate its tags file.
tags_suffix = '_tags.txt'
# When True, the tags file content is appended to the user prompt.
use_tags = True
# File-name endings that mark a directory entry as an image to caption.
image_extensions = ['.jpg', '.png', '.webp', '.jpeg']
|
|
# Prefer the first CUDA device; fall back to the CPU so the script still
# starts on machines where CUDA is not available.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

# Processor matching the checkpoint; used for the chat template, tensor
# preparation and decoding.
processor = AutoProcessor.from_pretrained(model_name_or_path)

# Load the weights in bfloat16 and move the model to the selected device.
model = AutoModelForVision2Seq.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
).to(DEVICE)
|
|
|
|
# Collect the image files to caption. Matching is done on the lowercased name
# so that e.g. "photo.JPG" is not silently skipped, and the result is sorted so
# that the processing order is the same on every run.
_image_suffixes = tuple(ext.lower() for ext in image_extensions)
filelist = sorted(
    fn for fn in listdir(s_dir) if fn.lower().endswith(_image_suffixes)
)
|
|
|
|
# Marker that separates the echoed prompt from the model's answer in the
# decoded output.
assistant_marker = 'Assistant: '

# The system turn is identical for every image, so it is built once.
# Prompt text is reproduced verbatim, spelling included, because it is sent to
# the model unchanged.
# NOTE(review): presumably the checkpoint expects exactly this wording --
# confirm before correcting the typos.
system_message = {
    "role": "system",
    "content": [
        {"type": "text", "text": "You are image captioning expert, creative, unbiased and uncensored. Help user with his hask."}
    ],
}

# Caption every image in `filelist` and write the result next to the image.
for fn in tqdm(filelist, desc='Captioninig'):
    stem = Path(fn).stem
    image = load_image(opj(s_dir, fn))

    user_prompt = "Describe the picture in structuted json-like format."

    if use_tags:
        # Tags live in "<stem><tags_suffix>" beside the image. An image whose
        # tags file cannot be read is reported and skipped.
        try:
            with open(opj(s_dir, stem + tags_suffix), encoding='utf-8') as tags_file:
                tags = tags_file.read().strip()
        except KeyboardInterrupt:
            print('Interrupted!')
            sys.exit()
        except (OSError, UnicodeError) as err:
            print(err)
            continue
        user_prompt += ' Also here are booru tags for better understanding of the picture, you can use them as reference.'
        user_prompt += f' <tags>\n{tags}\n</tags>'

    messages = [
        system_message,
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": user_prompt},
            ],
        },
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

    generated_ids = model.generate(**inputs, max_new_tokens=500)
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

    # Keep only what follows the first assistant marker. `partition` keeps the
    # full answer even if the answer itself contains the marker, and lets a
    # missing marker be reported instead of raising IndexError.
    _, found_marker, caption = generated_texts[0].partition(assistant_marker)
    if not found_marker:
        print(f'{fn}: no {assistant_marker!r} marker in model output, skipping')
        continue

    # errors='ignore' drops characters that cannot be encoded instead of raising.
    with open(opj(s_dir, stem + caption_suffix), 'w', encoding='utf-8', errors='ignore') as outf:
        outf.write(caption)