Transfer Learning ViT with PEFT
Dataset used:
FastJobs/Visual_Emotional_Analysis · Datasets at Hugging Face
Model used:
google/vit-base-patch16-224-in21k · Hugging Face
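Before training, it is worth a quick check of what the dataset actually contains. A minimal sketch (assuming the dataset downloads as in the training script below); it should show a train split of 800 images and the eight emotion class names:

from datasets import load_dataset

# Load the emotion dataset and inspect its splits and label names
dataset = load_dataset("FastJobs/Visual_Emotional_Analysis")
print(dataset)                                   # split sizes (train: 800 images)
print(dataset["train"].features["label"].names)  # the eight emotion class names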
from datasets import load_dataset
from transformers import AutoImageProcessor, AutoModelForImageClassification, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import torch
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    Resize,
    ToTensor,
)
from PIL import Image
import logging

# Initialize logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load dataset and build the label maps from the dataset's own class names
dataset = load_dataset("FastJobs/Visual_Emotional_Analysis")
labels = dataset["train"].features["label"].names
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}

# Image processor and training transformations (augment, then normalize with the
# mean/std the ViT checkpoint was pretrained with)
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
train_transforms = Compose(
    [
        RandomResizedCrop(image_processor.size["height"]),
        RandomHorizontalFlip(),
        ToTensor(),
        normalize,
    ]
)

def preprocess_train(example_batch):
    example_batch["pixel_values"] = [train_transforms(image.convert("RGB")) for image in example_batch["image"]]
    return {"pixel_values": example_batch["pixel_values"], "labels": example_batch["label"]}

# Preprocess train dataset
train_dataset = dataset["train"].map(preprocess_train, batched=True)

# LoRA configuration and model application
model = AutoModelForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    num_labels=len(labels),  # set the correct number of classes
    label2id=label2id,
    id2label=id2label,
)
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    # apply LoRA to the attention projections and the MLP intermediate layer
    target_modules=["attention.query", "attention.key", "attention.value", "intermediate.dense"],
    lora_dropout=0.1,
    bias="none",
    modules_to_save=["classifier"],  # the new classifier head is trained in full
)
model = get_peft_model(model, lora_config)

# Training arguments without validation
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=4,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=False,
    logging_dir="./logs",
)

# Data collator definition
def collate_fn(examples):
    pixel_values = torch.stack([torch.tensor(example["pixel_values"]) for example in examples])
    labels = torch.tensor([example["labels"] for example in examples], dtype=torch.long)
    return {"pixel_values": pixel_values, "labels": labels}

# Trainer instance without validation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=collate_fn,
)

# Training
trainer.train()

# Save the model and image processor locally
model.save_pretrained("./trained_model")
image_processor.save_pretrained("./trained_model")
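The point of LoRA here is that only a small fraction of the weights are trained: the rank-16 adapters on the attention/MLP projections plus the new classifier head, while the pretrained ViT backbone stays frozen. To confirm this, PEFT's print_trainable_parameters() can be called right after get_peft_model; a small check that was not part of the original run:

# After model = get_peft_model(model, lora_config):
model.print_trainable_parameters()
# Prints something like "trainable params: ... || all params: ... || trainable%: ...",
# confirming that only the LoRA adapters and the classifier head receive gradients.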
Resolving data files: 100% 800/800 [00:00<00:00, 163.25it/s]
Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[400/400 03:46, Epoch 2/2]
Step | Training Loss |
10 | 2.080800 |
20 | 2.068200 |
30 | 2.092600 |
40 | 2.059000 |
50 | 2.028400 |
60 | 1.947200 |
70 | 1.854000 |
80 | 1.811000 |
90 | 1.730300 |
100 | 1.693300 |
110 | 1.604800 |
120 | 1.507600 |
130 | 1.536400 |
140 | 1.460100 |
150 | 1.719700 |
160 | 1.417700 |
170 | 1.613800 |
180 | 1.611200 |
190 | 1.522200 |
200 | 1.502700 |
210 | 1.392300 |
220 | 1.323600 |
230 | 1.257000 |
240 | 1.249800 |
250 | 1.394300 |
260 | 1.299300 |
270 | 1.270400 |
280 | 1.374700 |
290 | 1.579900 |
300 | 1.415400 |
310 | 1.183400 |
320 | 1.261300 |
330 | 1.248900 |
340 | 1.101700 |
350 | 1.174200 |
360 | 1.209000 |
370 | 1.104300 |
380 | 1.265800 |
390 | 1.269100 |
400 | 1.173700 |
['./trained_model/preprocessor_config.json']
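Note that save_pretrained on a PEFT model writes only the adapter weights (plus the modules_to_save classifier), so reloading "./trained_model" with AutoModelForImageClassification in the test below falls back to the base checkpoint's config and loses the id2label mapping from training. One way to keep the real label names is to merge the adapter into the base model and save a full standalone checkpoint; a minimal sketch, assuming the trained PEFT model is still in memory (the output path is hypothetical):

# Merge the LoRA weights into the base ViT and save a standalone checkpoint.
# merged.config still carries the id2label mapping set at training time,
# so predictions decode to real emotion names instead of LABEL_0..LABEL_7.
merged = model.merge_and_unload()
merged.save_pretrained("./trained_model_merged")           # hypothetical output path
image_processor.save_pretrained("./trained_model_merged")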
Test

from transformers import AutoImageProcessor, AutoModelForImageClassification
from PIL import Image
import torch
import os

# Log test - the logging library did not work here, so print statements are used instead!
print("Log test - if this message is printed, the logging setup is working correctly.")

# Load the model and image processor
try:
    print("Loading model...")
    model_path = "./trained_model"
    if not os.path.exists(model_path):
        print("Model path does not exist:", model_path)
        raise FileNotFoundError(f"Model path does not exist: {model_path}")
    model = AutoModelForImageClassification.from_pretrained(model_path, num_labels=8)  # an error occurs if num_labels is not set!!!
    image_processor = AutoImageProcessor.from_pretrained(model_path)
    print("Model and image processor loaded")
except Exception as e:
    print("Model loading failed:", e)
    raise e

# Image file path
uploaded_image_path = "./content/amusement.jpg"

# Load and preprocess the image
try:
    print("Loading image...")
    if not os.path.exists(uploaded_image_path):
        print("Image file does not exist:", uploaded_image_path)
        raise FileNotFoundError(f"Image file does not exist: {uploaded_image_path}")
    image = Image.open(uploaded_image_path)
    encoding = image_processor(images=image.convert("RGB"), return_tensors="pt")
    print("Image preprocessing complete")
except Exception as e:
    print("Image loading and preprocessing failed:", e)
    raise e

# Move the model and inputs to the same device
try:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")
    model.to(device)
    encoding = {k: v.to(device) for k, v in encoding.items()}
    print("Encoding moved to device:", encoding)
except Exception as e:
    print("Device transfer failed:", e)
    raise e

# Run model prediction
try:
    print("Running model prediction...")
    model.eval()
    with torch.no_grad():
        outputs = model(**encoding)
        logits = outputs.logits
    print("Prediction complete")
except Exception as e:
    print("Model prediction failed:", e)
    raise e

# Print the predicted class
try:
    predicted_class_idx = logits.argmax(-1).item()
    predicted_class = model.config.id2label[predicted_class_idx]
    print("Predicted class:", predicted_class)
except Exception as e:
    print("Failed to print the predicted class:", e)
    raise e

Log test - if this message is printed, the logging setup is working correctly.
Loading model...
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Model and image processor loaded
Loading image...
Image preprocessing complete
Device: cuda
Encoding moved to device: {'pixel_values': tensor([[[[0.7882, 0.8275, 0.8510, ..., 0.7333, 0.7176, 0.7176]]]], device='cuda:0')} (tensor output truncated)
Running model prediction...
Prediction complete
Predicted class: LABEL_7
LABEL_7 : surprise
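Because the reloaded config only carries generic LABEL_i names, the predicted index has to be mapped back to the dataset's own label list by hand. A small sketch of that lookup, assuming predicted_class_idx from the test script above:

from datasets import load_dataset

# Recover the human-readable emotion names from the dataset itself
label_names = load_dataset("FastJobs/Visual_Emotional_Analysis")["train"].features["label"].names
print(label_names[predicted_class_idx])  # e.g. index 7 -> "surprise", matching the mapping noted above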

Log test - if this message is printed, the logging setup is working correctly.
Loading model...
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Model and image processor loaded
Loading image...
Image preprocessing complete
Device: cuda
Encoding moved to device: {'pixel_values': tensor([[[[0.1373, 0.0980, 0.0510, ..., 0.3020, 0.2863, 0.2627]]]], device='cuda:0')} (tensor output truncated)
Running model prediction...
Prediction complete
Predicted class: LABEL_1
LABEL_1 : contempt
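For reference, the same adapter can also be reloaded explicitly through PEFT, which avoids guessing num_labels and keeps the real label names. A sketch, assuming labels, label2id, and id2label as defined in the training script:

from peft import PeftModel
from transformers import AutoModelForImageClassification

# Rebuild the base ViT with the original 8-class head and label maps
base = AutoModelForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    num_labels=len(labels),
    label2id=label2id,
    id2label=id2label,
)
# Attach the trained LoRA adapter saved at ./trained_model
model = PeftModel.from_pretrained(base, "./trained_model")

With this, model.config.id2label[predicted_class_idx] would return the emotion name directly instead of LABEL_1.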
Final file upload!!!
Google Colab