- Code revision (CLIP ViT encoder)
Using the ViT image encoder together with the text encoder (provided by the CLIP library) plus a prompt raises performance further.
Code - no training / UGRP test (zero-shot, without prompting): 18%
from transformers import CLIPProcessor, CLIPModel
from datasets import load_dataset
import torch

# 1. Load the model and processor
model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)

# 2. Load the dataset
dataset = load_dataset("JANGJIWON/UGRP_sketchset_textbook")

# 3. Map label indices to emotion texts
possible_labels = ["Happiness", "Sadness", "Disgust", "Fear", "Anger", "Surprise"]

# 4. Check the label distribution of the dataset
print("Checking label distribution and mapping...\n")
print(f"Dataset label information: {dataset['train'].features['label']}")

# Print a few samples to verify the mapping
print("\nSample label checks:")
for i in range(5):  # check the first 5 samples
    sample = dataset["train"][i]
    label = sample['label']
    print(f"Sample {i} - Label Index: {label}, Mapped Text: {possible_labels[label]}")

# 5. Evaluate on the full dataset
correct = 0
total = len(dataset["train"])

print("\nEvaluating dataset...")
for sample in dataset["train"]:
    image = sample['image']
    label = sample['label']
    text_options = possible_labels  # compare against every candidate text

    # Compare the image with all candidate texts
    inputs = processor(text=text_options, images=image, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = model(**inputs)

    logits_per_image = outputs.logits_per_image  # similarity between the image and every text
    predicted_idx = logits_per_image.argmax().item()  # index of the most similar text

    if predicted_idx == label:
        correct += 1

accuracy = correct / total * 100
print(f"\nAccuracy: {accuracy:.2f}% ({correct}/{total})")
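Since the note above says adding a prompt raises performance, here is a minimal sketch of the same zero-shot evaluation with a prompt template wrapped around each emotion word. The template string is an assumption for illustration; the fine-tuning runs further down use the "This image represents X." phrasing.

```python
# Zero-shot evaluation with a prompt template (sketch; template text is an assumption)
from transformers import CLIPProcessor, CLIPModel
from datasets import load_dataset
import torch

model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)

dataset = load_dataset("JANGJIWON/UGRP_sketchset_textbook")
emotions = ["Happiness", "Sadness", "Disgust", "Fear", "Anger", "Surprise"]
prompted_labels = [f"This image represents {e}." for e in emotions]  # prompt template

correct = 0
for sample in dataset["train"]:
    inputs = processor(text=prompted_labels, images=sample["image"],
                       return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits_per_image  # similarity of the image to each prompt
    if logits.argmax().item() == sample["label"]:
        correct += 1

print(f"Prompted zero-shot accuracy: {100 * correct / len(dataset['train']):.2f}%")
```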
Code - UGRP train (presumably with all parameters trainable) / UGRP test: 60%
from transformers import CLIPProcessor, CLIPModel
from datasets import load_dataset
from torch.utils.data import DataLoader
import torch
from transformers import AdamW
from tqdm import tqdm

# 1. Load the model and processor
model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)

# 2. Load and split the dataset
dataset = load_dataset("JANGJIWON/UGRP_sketchset_textbook")
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)  # 80% train, 20% test
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

# 3. Map label indices to emotion texts
possible_labels = ["Happiness", "Sadness", "Disgust", "Fear", "Anger", "Surprise"]

# 4. Collate function for batching
def collate_fn(samples):
    images = [s['image'] for s in samples]
    labels = [s['label'] for s in samples]
    inputs = processor(images=images, text=possible_labels, return_tensors="pt", padding=True)
    inputs['labels'] = torch.tensor(labels)
    return inputs

# DataLoader setup
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

# 5. Training setup
optimizer = AdamW(model.parameters(), lr=5e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
num_epochs = 5

# 6. Evaluation function
def evaluate(model, loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
            labels = batch['labels'].to(device)
            outputs = model(**inputs)
            logits = outputs.logits_per_image  # image-text similarity
            preds = logits.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return 100 * correct / total

# 7. Training loop
for epoch in range(1, num_epochs + 1):
    model.train()
    epoch_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch}/{num_epochs}"):
        inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
        labels = batch['labels'].to(device)

        outputs = model(**inputs)
        loss = torch.nn.functional.cross_entropy(outputs.logits_per_image, labels)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        epoch_loss += loss.item()

    print(f"Epoch {epoch} Loss: {epoch_loss / len(train_loader):.4f}")

    # Train accuracy
    train_acc = evaluate(model, train_loader)
    print(f"Train Accuracy: {train_acc:.2f}%")

    # Test accuracy
    test_acc = evaluate(model, test_loader)
    print(f"Test Accuracy: {test_acc:.2f}%")
| Epoch | Loss | Train Accuracy | Test Accuracy |
| --- | --- | --- | --- |
| 1 | 1.9884 | 60.53% | 60.00% |
| 2 | 1.9887 | 60.53% | 60.00% |
| 3 | 1.4269 | 60.53% | 60.00% |
| 4 | 1.2799 | 60.53% | 60.00% |
| 5 | 1.3203 | 60.53% | 60.00% |
Code - UGRP train (LoRA) / UGRP test: 50%
from transformers import CLIPProcessor, CLIPModel from datasets import load_dataset from torch.utils.data import DataLoader import torch from transformers import AdamW from tqdm import tqdm from peft import get_peft_model, LoraConfig from peft import PeftModel # 1. 모델 및 프로세서 로드 model_name = "openai/clip-vit-base-patch32" model = CLIPModel.from_pretrained(model_name) processor = CLIPProcessor.from_pretrained(model_name) # 2. 데이터셋 로드 및 분리 dataset = load_dataset("JANGJIWON/UGRP_sketchset_textbook") split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42) # 80% train, 20% test train_dataset = split_dataset["train"] test_dataset = split_dataset["test"] # 3. 레이블을 감정 텍스트로 매핑 possible_labels = ["Happiness", "Sadness", "Disgust", "Fear", "Anger", "Surprise"] # 4. 데이터셋 처리 함수 정의 def collate_fn(samples): images = [s['image'] for s in samples] labels = [s['label'] for s in samples] inputs = processor(images=images, text=possible_labels, return_tensors="pt", padding=True) inputs['labels'] = torch.tensor(labels) return inputs # DataLoader 설정 train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn) test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn) # 5. LoRA 설정 lora_config = LoraConfig( r=8, # rank of low-rank matrices lora_alpha=16, # scaling factor for LoRA target_modules=["q_proj", "v_proj"], # LoRA will be applied to attention projections lora_dropout=0.1, # dropout rate for LoRA layers bias="none", # no bias terms in LoRA layers ) # 6. LoRA 적용 모델 준비 model = get_peft_model(model, lora_config) # 7. 학습 설정 optimizer = AdamW(model.parameters(), lr=5e-5) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) num_epochs = 5 # 8. 학습 및 평가 함수 def evaluate(model, loader): model.eval() correct = 0 total = 0 with torch.no_grad(): for batch in loader: inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"} labels = batch['labels'].to(device) outputs = model(**inputs) logits = outputs.logits_per_image # 이미지-텍스트 유사도 preds = logits.argmax(dim=1) correct += (preds == labels).sum().item() total += labels.size(0) return 100 * correct / total # 9. 학습 루프 for epoch in range(1, num_epochs + 1): model.train() epoch_loss = 0 for batch in tqdm(train_loader, desc=f"Epoch {epoch}/{num_epochs}"): inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"} labels = batch['labels'].to(device) outputs = model(**inputs) loss = torch.nn.functional.cross_entropy(outputs.logits_per_image, labels) loss.backward() optimizer.step() optimizer.zero_grad() epoch_loss += loss.item() print(f"Epoch {epoch} Loss: {epoch_loss / len(train_loader):.4f}") # Train 정확도 train_acc = evaluate(model, train_loader) print(f"Train Accuracy: {train_acc:.2f}%") # Test 정확도 test_acc = evaluate(model, test_loader) print(f"Test Accuracy: {test_acc:.2f}%")
| Epoch | Loss | Train Accuracy | Test Accuracy |
| --- | --- | --- | --- |
| 1 | 2.0560 | 28.95% | 30.00% |
| 2 | 1.8668 | 31.58% | 40.00% |
| 3 | 1.6599 | 42.11% | 40.00% |
| 4 | 1.4819 | 47.37% | 50.00% |
| 5 | 1.3964 | 55.26% | 50.00% |
Code - UGRP train (LoRA) with prompting / UGRP test: 50%
from transformers import CLIPProcessor, CLIPModel from datasets import load_dataset from torch.utils.data import DataLoader import torch from transformers import AdamW from tqdm import tqdm from peft import get_peft_model, LoraConfig from peft import PeftModel # 1. 모델 및 프로세서 로드 model_name = "openai/clip-vit-base-patch32" model = CLIPModel.from_pretrained(model_name) processor = CLIPProcessor.from_pretrained(model_name) # 2. 데이터셋 로드 및 분리 dataset = load_dataset("JANGJIWON/UGRP_sketchset_textbook") split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42) # 80% train, 20% test train_dataset = split_dataset["train"] test_dataset = split_dataset["test"] # 3. 레이블을 감정 텍스트로 매핑 possible_labels = ["This image represents Happiness.", "This image represents Sadness.", "This image represents Disgust.", "This image represents Fear.", "This image represents Anger.", "This image represents Surprise."] # 4. 데이터셋 처리 함수 정의 def collate_fn(samples): images = [s['image'] for s in samples] labels = [s['label'] for s in samples] inputs = processor(images=images, text=possible_labels, return_tensors="pt", padding=True) inputs['labels'] = torch.tensor(labels) return inputs # DataLoader 설정 train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn) test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn) # 5. LoRA 설정 lora_config = LoraConfig( r=8, # rank of low-rank matrices lora_alpha=16, # scaling factor for LoRA target_modules=["q_proj", "v_proj"], # LoRA will be applied to attention projections lora_dropout=0.1, # dropout rate for LoRA layers bias="none", # no bias terms in LoRA layers ) # 6. LoRA 적용 모델 준비 model = get_peft_model(model, lora_config) # 7. 학습 설정 optimizer = AdamW(model.parameters(), lr=5e-5) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) num_epochs = 5 # 8. 학습 및 평가 함수 def evaluate(model, loader): model.eval() correct = 0 total = 0 with torch.no_grad(): for batch in loader: inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"} labels = batch['labels'].to(device) outputs = model(**inputs) logits = outputs.logits_per_image # 이미지-텍스트 유사도 preds = logits.argmax(dim=1) correct += (preds == labels).sum().item() total += labels.size(0) return 100 * correct / total # 9. 학습 루프 for epoch in range(1, num_epochs + 1): model.train() epoch_loss = 0 for batch in tqdm(train_loader, desc=f"Epoch {epoch}/{num_epochs}"): inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"} labels = batch['labels'].to(device) outputs = model(**inputs) loss = torch.nn.functional.cross_entropy(outputs.logits_per_image, labels) loss.backward() optimizer.step() optimizer.zero_grad() epoch_loss += loss.item() print(f"Epoch {epoch} Loss: {epoch_loss / len(train_loader):.4f}") # Train 정확도 train_acc = evaluate(model, train_loader) print(f"Train Accuracy: {train_acc:.2f}%") # Test 정확도 test_acc = evaluate(model, test_loader) print(f"Test Accuracy: {test_acc:.2f}%")
| Epoch | Loss | Train Accuracy | Test Accuracy |
| --- | --- | --- | --- |
| 1 | 2.0221 | 21.05% | 10.00% |
| 2 | 1.7777 | 42.11% | 30.00% |
| 3 | 1.6095 | 47.37% | 40.00% |
| 4 | 1.4401 | 57.89% | 50.00% |
| 5 | 1.3153 | 55.26% | 50.00% |
Why is the train accuracy low? → because the model has not overfit → to see why it has not, compare things like the number of trainable parameters between the setups (a parameter-count sketch follows below).
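As a starting point for that comparison, here is a minimal sketch that counts trainable vs. total parameters for the full fine-tuning setup and for the LoRA setup; the model name and LoRA config are copied from the scripts on this page.

```python
# Compare trainable parameter counts: full fine-tuning vs. LoRA (sketch)
from transformers import CLIPModel
from peft import LoraConfig, get_peft_model

def count_params(m):
    trainable = sum(p.numel() for p in m.parameters() if p.requires_grad)
    total = sum(p.numel() for p in m.parameters())
    return trainable, total

# Full fine-tuning: every parameter stays trainable
full_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
t, n = count_params(full_model)
print(f"Full fine-tuning: {t:,} / {n:,} trainable ({100 * t / n:.1f}%)")

# LoRA: only the low-rank adapters on q_proj / v_proj are trainable
lora_model = get_peft_model(
    CLIPModel.from_pretrained("openai/clip-vit-base-patch32"),
    LoraConfig(r=8, lora_alpha=16, target_modules=["q_proj", "v_proj"],
               lora_dropout=0.1, bias="none"),
)
t, n = count_params(lora_model)
print(f"LoRA: {t:,} / {n:,} trainable ({100 * t / n:.1f}%)")
# peft also offers lora_model.print_trainable_parameters() for the same summary.
```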
- Code test (Monet style)
Code - EmoSet (Monet style) train / Children_Sketch test
import torch import os from transformers import CLIPProcessor, CLIPModel from datasets import load_dataset from torch.utils.data import DataLoader from torch.optim import AdamW import torch.nn as nn from peft import get_peft_model, LoraConfig, TaskType from tqdm import tqdm # tqdm를 사용해 진행 상태를 시각화 # GPU 설정 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print("Device:", device) # 1. 모델 및 프로세서 로드 model_name = "openai/clip-vit-base-patch32" model = CLIPModel.from_pretrained(model_name) processor = CLIPProcessor.from_pretrained(model_name) # 2. 데이터셋 로드 및 분리 train_dataset = load_dataset("xodhks/EmoSet118K_MonetStyle", split="train") test_dataset = load_dataset("xodhks/Children_Sketch", split="train") # 3. 레이블을 감정 텍스트로 매핑 possible_labels = ["This image represents Happiness.", "This image represents Anger.", "This image represents Surprise.", "This image represents Disgust.", "This image represents Fear.", "This image represents Sadness."] # 4. 데이터셋 처리 함수 정의 def collate_fn(samples): images = [s['image'] for s in samples] labels = [s['label'] for s in samples] inputs = processor(images=images, text=possible_labels, return_tensors="pt", padding=True) inputs['labels'] = torch.tensor(labels) return inputs # DataLoader 설정 train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn) test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn) # 5. LoRA 설정 lora_config = LoraConfig( r=8, # rank of low-rank matrices lora_alpha=16, # scaling factor for LoRA target_modules=["q_proj", "v_proj"], # LoRA will be applied to attention projections lora_dropout=0.1, # dropout rate for LoRA layers bias="none", # no bias terms in LoRA layers ) # 6. LoRA 적용 모델 준비 model = get_peft_model(model, lora_config) # 7. 학습 설정 optimizer = AdamW(model.parameters(), lr=5e-5) model.to(device) num_epochs = 100 # 8. 평가 함수 def evaluate(model, loader): model.eval() correct = 0 total = 0 with torch.no_grad(): for batch in loader: inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"} labels = batch['labels'].to(device) outputs = model(**inputs) logits = outputs.logits_per_image # 이미지-텍스트 유사도 preds = logits.argmax(dim=1) correct += (preds == labels).sum().item() total += labels.size(0) return 100 * correct / total # 9. top 모델 저장 함수 정의 def save_top_models(epoch, accuracy, model, top_model=None): if top_model is None or accuracy > top_model['accuracy']: model_save_path = f"best_model_epoch_{epoch}.pth" torch.save(model.state_dict(), model_save_path) print(f"Model saved at epoch {epoch} with accuracy {accuracy:.2f}%") return {"epoch": epoch, "accuracy": accuracy, "model_path": model_save_path} return top_model # 10. 학습 루프 top_model = None # 최상의 모델을 저장할 변수 for epoch in range(1, num_epochs + 1): model.train() epoch_loss = 0 for batch in tqdm(train_loader, desc=f"Epoch {epoch}/{num_epochs}"): inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"} labels = batch['labels'].to(device) outputs = model(**inputs) loss = torch.nn.functional.cross_entropy(outputs.logits_per_image, labels) loss.backward() optimizer.step() optimizer.zero_grad() epoch_loss += loss.item() print(f"Epoch {epoch} Loss: {epoch_loss / len(train_loader):.4f}") # Train 정확도 train_acc = evaluate(model, train_loader) print(f"Train Accuracy: {train_acc:.2f}%") # Test 정확도 test_acc = evaluate(model, test_loader) print(f"Test Accuracy: {test_acc:.2f}%") # top 모델 저장 top_model = save_top_models(epoch, test_acc, model, top_model) print("Finished Training")
Results for the first 20 epochs (of 100):

| Epoch | Loss | Train Accuracy | Test Accuracy | Note |
| --- | --- | --- | --- | --- |
| 1 | 0.5468 | 86.07% | 51.10% | model saved |
| 2 | 0.3868 | 88.65% | 49.63% |  |
| 3 | 0.3123 | 91.66% | 50.92% |  |
| 4 | 0.2436 | 94.27% | 50.55% |  |
| 5 | 0.1820 | 95.78% | 52.58% | model saved |
| 6 | 0.1316 | 97.32% | 52.03% |  |
| 7 | 0.0888 | 98.69% | 51.75% |  |
| 8 | 0.0606 | 99.01% | 51.75% |  |
| 9 | 0.0399 | 99.49% | 52.30% |  |
| 10 | 0.0300 | 99.66% | 52.30% |  |
| 11 | 0.0186 | 99.81% | 53.22% | model saved |
| 12 | 0.0171 | 99.89% | 51.01% |  |
| 13 | 0.0164 | 99.60% | 51.10% |  |
| 14 | 0.0148 | 99.87% | 50.46% |  |
| 15 | 0.0120 | 99.84% | 52.30% |  |
| 16 | 0.0096 | 99.70% | 51.75% |  |
| 17 | 0.0120 | 99.91% | 53.59% | model saved |
| 18 | 0.0103 | 99.71% | 52.03% |  |
| 19 | 0.0096 | 99.81% | 52.76% |  |
| 20 | 0.0064 | 99.92% | 51.57% |  |
Code - EmoSet (Monet style) train / UGRP test (zero-shot, without prompting): 18%
from transformers import CLIPProcessor, CLIPModel from datasets import load_dataset from torch.utils.data import DataLoader import torch from transformers import AdamW from tqdm import tqdm from peft import get_peft_model, LoraConfig from peft import PeftModel from huggingface_hub import hf_hub_download # 1. 모델 및 프로세서 로드 model_name = "openai/clip-vit-base-patch32" model = CLIPModel.from_pretrained(model_name) processor = CLIPProcessor.from_pretrained(model_name) model_weights_path = hf_hub_download(repo_id="JANGJIWON/EmoSet118K_MonetStyle_CLIP_student", filename="best_model_epoch_17.pth") model.load_state_dict(torch.load(model_weights_path, map_location='cpu'), strict=False) # 2. 데이터셋 로드 dataset = load_dataset("JANGJIWON/UGRP_sketchset_textbook") # 3. 레이블을 감정 텍스트로 매핑 possible_labels = ["Happiness", "Sadness", "Disgust", "Fear", "Anger", "Surprise"] # 4. 데이터셋의 레이블 분포 확인 print("Checking label distribution and mapping...\n") print(f"Dataset label information: {dataset['train'].features['label']}") # 몇 개의 샘플을 출력하여 매핑 확인 print("\nSample label checks:") for i in range(5): # 첫 5개 샘플 확인 sample = dataset["train"][i] label = sample['label'] print(f"Sample {i} - Label Index: {label}, Mapped Text: {possible_labels[label]}") # 5. 전체 데이터셋 평가 correct = 0 total = len(dataset["train"]) print("\nEvaluating dataset...") for sample in dataset["train"]: image = sample['image'] label = sample['label'] text_options = possible_labels # 모든 가능 텍스트를 비교 # 여러 텍스트와 이미지를 비교 inputs = processor(text=text_options, images=image, return_tensors="pt", padding=True) with torch.no_grad(): outputs = model(**inputs) logits_per_image = outputs.logits_per_image # 이미지와 모든 텍스트 간 유사도 predicted_idx = logits_per_image.argmax().item() # 가장 높은 유사도를 가진 텍스트 인덱스 if predicted_idx == label: correct += 1 accuracy = correct / total * 100 print(f"\nAccuracy: {accuracy:.2f}% ({correct}/{total})")
Accuracy: 18.75% (9/48)
Code - EmoSet (Monet style) train / UGRP train (presumably with all parameters trainable) / UGRP test: 60%
from transformers import CLIPProcessor, CLIPModel from datasets import load_dataset from torch.utils.data import DataLoader import torch from transformers import AdamW from tqdm import tqdm from peft import get_peft_model, LoraConfig from peft import PeftModel from huggingface_hub import hf_hub_download # 1. 모델 및 프로세서 로드 model_name = "openai/clip-vit-base-patch32" model = CLIPModel.from_pretrained(model_name) processor = CLIPProcessor.from_pretrained(model_name) model_weights_path = hf_hub_download(repo_id="JANGJIWON/EmoSet118K_MonetStyle_CLIP_student", filename="best_model_epoch_17.pth") model.load_state_dict(torch.load(model_weights_path, map_location='cpu'), strict=False) # 2. 데이터셋 로드 및 분리 dataset = load_dataset("JANGJIWON/UGRP_sketchset_textbook") split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42) # 80% train, 20% test train_dataset = split_dataset["train"] test_dataset = split_dataset["test"] # 3. 레이블을 감정 텍스트로 매핑 possible_labels = ["Happiness", "Sadness", "Disgust", "Fear", "Anger", "Surprise"] # 4. 데이터셋 처리 함수 정의 def collate_fn(samples): images = [s['image'] for s in samples] labels = [s['label'] for s in samples] inputs = processor(images=images, text=possible_labels, return_tensors="pt", padding=True) inputs['labels'] = torch.tensor(labels) return inputs # DataLoader 설정 train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn) test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn) # 5. 학습 설정 optimizer = AdamW(model.parameters(), lr=5e-5) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) num_epochs = 5 # 6. 학습 및 평가 함수 def evaluate(model, loader): model.eval() correct = 0 total = 0 with torch.no_grad(): for batch in loader: inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"} labels = batch['labels'].to(device) outputs = model(**inputs) logits = outputs.logits_per_image # 이미지-텍스트 유사도 preds = logits.argmax(dim=1) correct += (preds == labels).sum().item() total += labels.size(0) return 100 * correct / total # 7. 학습 루프 for epoch in range(1, num_epochs + 1): model.train() epoch_loss = 0 for batch in tqdm(train_loader, desc=f"Epoch {epoch}/{num_epochs}"): inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"} labels = batch['labels'].to(device) outputs = model(**inputs) loss = torch.nn.functional.cross_entropy(outputs.logits_per_image, labels) loss.backward() optimizer.step() optimizer.zero_grad() epoch_loss += loss.item() print(f"Epoch {epoch} Loss: {epoch_loss / len(train_loader):.4f}") # Train 정확도 train_acc = evaluate(model, train_loader) print(f"Train Accuracy: {train_acc:.2f}%") # Test 정확도 test_acc = evaluate(model, test_loader) print(f"Test Accuracy: {test_acc:.2f}%")
| Epoch | Loss | Train Accuracy | Test Accuracy |
| --- | --- | --- | --- |
| 1 | 2.8952 | 60.53% | 60.00% |
| 2 | 1.5556 | 60.53% | 60.00% |
| 3 | 1.4175 | 60.53% | 60.00% |
| 4 | 1.3321 | 60.53% | 60.00% |
| 5 | 1.2781 | 60.53% | 60.00% |
Code - EmoSet (Monet style) train / UGRP train (LoRA) with prompting / UGRP test (zero-shot): 50%
from transformers import CLIPProcessor, CLIPModel from datasets import load_dataset from torch.utils.data import DataLoader import torch from transformers import AdamW from tqdm import tqdm from peft import get_peft_model, LoraConfig from peft import PeftModel from huggingface_hub import hf_hub_download # 1. 모델 및 프로세서 로드 model_name = "openai/clip-vit-base-patch32" model = CLIPModel.from_pretrained(model_name) processor = CLIPProcessor.from_pretrained(model_name) model_weights_path = hf_hub_download(repo_id="JANGJIWON/EmoSet118K_MonetStyle_CLIP_student", filename="best_model_epoch_17.pth") model.load_state_dict(torch.load(model_weights_path, map_location='cpu'), strict=False) # 2. 데이터셋 로드 및 분리 dataset = load_dataset("JANGJIWON/UGRP_sketchset_textbook") split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42) # 80% train, 20% test train_dataset = split_dataset["train"] test_dataset = split_dataset["test"] # 3. 레이블을 감정 텍스트로 매핑 possible_labels = ["This image represents Happiness.", "This image represents Sadness.", "This image represents Disgust.", "This image represents Fear.", "This image represents Anger.", "This image represents Surprise."] # 4. 데이터셋 처리 함수 정의 def collate_fn(samples): images = [s['image'] for s in samples] labels = [s['label'] for s in samples] inputs = processor(images=images, text=possible_labels, return_tensors="pt", padding=True) inputs['labels'] = torch.tensor(labels) return inputs # DataLoader 설정 train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn) test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn) # 5. LoRA 설정 lora_config = LoraConfig( r=8, # rank of low-rank matrices lora_alpha=16, # scaling factor for LoRA target_modules=["q_proj", "v_proj"], # LoRA will be applied to attention projections lora_dropout=0.1, # dropout rate for LoRA layers bias="none", # no bias terms in LoRA layers ) # 6. LoRA 적용 모델 준비 model = get_peft_model(model, lora_config) # 7. 학습 설정 optimizer = AdamW(model.parameters(), lr=5e-5) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) num_epochs = 5 # 8. 학습 및 평가 함수 def evaluate(model, loader): model.eval() correct = 0 total = 0 with torch.no_grad(): for batch in loader: inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"} labels = batch['labels'].to(device) outputs = model(**inputs) logits = outputs.logits_per_image # 이미지-텍스트 유사도 preds = logits.argmax(dim=1) correct += (preds == labels).sum().item() total += labels.size(0) return 100 * correct / total # 9. 학습 루프 for epoch in range(1, num_epochs + 1): model.train() epoch_loss = 0 for batch in tqdm(train_loader, desc=f"Epoch {epoch}/{num_epochs}"): inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"} labels = batch['labels'].to(device) outputs = model(**inputs) loss = torch.nn.functional.cross_entropy(outputs.logits_per_image, labels) loss.backward() optimizer.step() optimizer.zero_grad() epoch_loss += loss.item() print(f"Epoch {epoch} Loss: {epoch_loss / len(train_loader):.4f}") # Train 정확도 train_acc = evaluate(model, train_loader) print(f"Train Accuracy: {train_acc:.2f}%") # Test 정확도 test_acc = evaluate(model, test_loader) print(f"Test Accuracy: {test_acc:.2f}%")
| Epoch | Loss | Train Accuracy | Test Accuracy |
| --- | --- | --- | --- |
| 1 | 2.0248 | 21.05% | 10.00% |
| 2 | 1.7882 | 42.11% | 30.00% |
| 3 | 1.5779 | 47.37% | 40.00% |
| 4 | 1.4334 | 57.89% | 50.00% |
| 5 | 1.3096 | 55.26% | 50.00% |
- Code test (standard EmoSet)
Code - EmoSet train / Children_Sketch test
import torch import os from transformers import CLIPProcessor, CLIPModel from datasets import load_dataset from torch.utils.data import DataLoader from torch.optim import AdamW import torch.nn as nn from peft import get_peft_model, LoraConfig, TaskType from tqdm import tqdm # tqdm를 사용해 진행 상태를 시각화 # GPU 설정 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print("Device:", device) # 1. 모델 및 프로세서 로드 model_name = "openai/clip-vit-base-patch32" model = CLIPModel.from_pretrained(model_name) processor = CLIPProcessor.from_pretrained(model_name) # 2. 데이터셋 로드 및 분리 train_dataset = load_dataset("xodhks/EmoSet118K", split="train") test_dataset = load_dataset("xodhks/Children_Sketch", split="train") # 3. 레이블을 감정 텍스트로 매핑 possible_labels = ["This image represents Happiness.", "This image represents Anger.", "This image represents Surprise.", "This image represents Disgust.", "This image represents Fear.", "This image represents Sadness."] # 4. 데이터셋 처리 함수 정의 def collate_fn(samples): images = [s['image'] for s in samples] labels = [s['label'] for s in samples] inputs = processor(images=images, text=possible_labels, return_tensors="pt", padding=True) inputs['labels'] = torch.tensor(labels) return inputs # DataLoader 설정 train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn) test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn) # 5. LoRA 설정 lora_config = LoraConfig( r=8, # rank of low-rank matrices lora_alpha=16, # scaling factor for LoRA target_modules=["q_proj", "v_proj"], # LoRA will be applied to attention projections lora_dropout=0.1, # dropout rate for LoRA layers bias="none", # no bias terms in LoRA layers ) # 6. LoRA 적용 모델 준비 model = get_peft_model(model, lora_config) # 7. 학습 설정 optimizer = AdamW(model.parameters(), lr=5e-5) model.to(device) num_epochs = 100 # 8. 평가 함수 def evaluate(model, loader): model.eval() correct = 0 total = 0 with torch.no_grad(): for batch in loader: inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"} labels = batch['labels'].to(device) outputs = model(**inputs) logits = outputs.logits_per_image # 이미지-텍스트 유사도 preds = logits.argmax(dim=1) correct += (preds == labels).sum().item() total += labels.size(0) return 100 * correct / total # 9. top 모델 저장 함수 정의 def save_top_models(epoch, accuracy, model, top_model=None): if top_model is None or accuracy > top_model['accuracy']: model_save_path = f"best_model_epoch_{epoch}.pth" torch.save(model.state_dict(), model_save_path) print(f"Model saved at epoch {epoch} with accuracy {accuracy:.2f}%") return {"epoch": epoch, "accuracy": accuracy, "model_path": model_save_path} return top_model # 10. 학습 루프 top_model = None # 최상의 모델을 저장할 변수 for epoch in range(1, num_epochs + 1): model.train() epoch_loss = 0 for batch in tqdm(train_loader, desc=f"Epoch {epoch}/{num_epochs}"): inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"} labels = batch['labels'].to(device) outputs = model(**inputs) loss = torch.nn.functional.cross_entropy(outputs.logits_per_image, labels) loss.backward() optimizer.step() optimizer.zero_grad() epoch_loss += loss.item() print(f"Epoch {epoch} Loss: {epoch_loss / len(train_loader):.4f}") # Train 정확도 train_acc = evaluate(model, train_loader) print(f"Train Accuracy: {train_acc:.2f}%") # Test 정확도 test_acc = evaluate(model, test_loader) print(f"Test Accuracy: {test_acc:.2f}%") # top 모델 저장 top_model = save_top_models(epoch, test_acc, model, top_model) print("Finished Training")
Results for the first 20 epochs (of 100):

| Epoch | Loss | Train Accuracy | Test Accuracy | Note |
| --- | --- | --- | --- | --- |
| 1 | 0.4847 | 87.77% | 53.96% | model saved |
| 2 | 0.3358 | 90.54% | 53.31% |  |
| 3 | 0.2681 | 92.64% | 52.95% |  |
| 4 | 0.2045 | 95.47% | 54.14% | model saved |
| 5 | 0.1506 | 96.12% | 51.57% |  |
| 6 | 0.1057 | 98.09% | 53.04% |  |
| 7 | 0.0697 | 98.98% | 53.68% |  |
| 8 | 0.0460 | 99.47% | 52.39% |  |
| 9 | 0.0282 | 99.62% | 52.49% |  |
| 10 | 0.0201 | 99.38% | 53.41% |  |
| 11 | 0.0182 | 99.48% | 52.03% |  |
| 12 | 0.0137 | 99.87% | 53.04% |  |
| 13 | 0.0135 | 99.86% | 49.08% |  |
| 14 | 0.0076 | 99.93% | 53.41% |  |
| 15 | 0.0111 | 99.87% | 51.93% |  |
| 16 | 0.0059 | 99.87% | 53.50% |  |
| 17 | 0.0089 | 99.89% | 50.37% |  |
| 18 | 0.0076 | 99.93% | 54.05% |  |
| 19 | 0.0058 | 99.98% | 53.04% |  |
| 20 | 0.0077 | 99.75% | 53.78% |  |
- OpenAI revision / Gemini test
Code (revised OpenAI version)
import json
import requests
import openai
import base64
from PIL import Image  # Pillow for image loading
import os

# OpenAI API key
openai.api_key = 'sk-proj-...'

# Dataset and label configuration
dataset_path = r'G:\dataset\dataset_info.json'  # path to the JSON metadata file
images_base_path = r'G:\dataset'                # base path for the images

# Load the dataset from the JSON file
with open(dataset_path, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

correct_predictions = 0

# Iterate over the dataset
for data in dataset:
    # Build the full image path
    image_filename = data['image']
    image_path = os.path.join(images_base_path, image_filename)
    label = data['label']

    # Load the image and encode it as base64
    try:
        with Image.open(image_path) as img:
            img = img.convert("RGB")          # convert to RGB
            img = img.resize((256, 256))      # resize to 256x256
            img.save("temp_image.jpg", format="JPEG")  # save as a temporary JPEG

        with open("temp_image.jpg", "rb") as image_file:
            image_data = image_file.read()
            image_base64 = base64.b64encode(image_data).decode('utf-8')  # base64-encode the image
    except FileNotFoundError:
        print(f"File not found: {image_path}")
        continue  # skip missing images

    # Build the API request
    # NOTE: the chat vision endpoint normally expects the image as an image_url
    # content part; embedding base64 inside the text prompt (as below) may not be
    # interpreted as an image by the model.
    possible_labels = ["Happiness", "Disgust", "Fear", "Sadness", "Anger", "Surprise"]
    question = (
        f"Analyze the following image data and predict the emotion it represents. "
        f"Choose one of these labels only: {', '.join(possible_labels)}.\n\n"
        # f"Data: {data}\n\n"  # include the JSON data
        f"[IMAGE] {image_base64}\n\n"  # base64-encoded image appended to the prompt
        "Provide only the label without any explanation: "
    )

    # OpenAI API request
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {openai.api_key}",
            "Content-Type": "application/json"
        },
        json={
            "model": "gpt-4o-mini",  # GPT vision-capable model
            "messages": [
                {"role": "user", "content": question}
            ],
            "max_tokens": 50,
        }
    )

    # print(f"Response from API: {response.text}")  # debug: print the raw response

    # Extract the predicted label
    if response.status_code == 200:
        answer = response.json()
        predicted_label = answer['choices'][0]['message']['content'].strip()
    else:
        print(f"API request failed: {response.status_code} - {response.text}")
        continue

    # Check whether the prediction is correct
    if predicted_label == label:
        correct_predictions += 1

    # Remove the temporary file
    if os.path.exists("temp_image.jpg"):
        os.remove("temp_image.jpg")

# Print accuracy
accuracy = correct_predictions / len(dataset)
print(f"Model accuracy: {accuracy * 100:.2f}%")
Code (proper OpenAI test, run with random seed 42)
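A minimal sketch of pinning random seed 42 for that test; the `set_seed` helper name is illustrative and not part of the original scripts.

```python
# Pin all RNGs to seed 42 for reproducibility (sketch; set_seed is an illustrative helper)
import random
import numpy as np
import torch

def set_seed(seed: int = 42):
    random.seed(seed)                  # Python RNG
    np.random.seed(seed)               # NumPy RNG
    torch.manual_seed(seed)            # PyTorch CPU RNG
    torch.cuda.manual_seed_all(seed)   # PyTorch CUDA RNGs (no-op without a GPU)

set_seed(42)
# Note: the train_test_split calls elsewhere on this page already pass seed=42 explicitly.
```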
Next week: build the mixed dataset
Decide exactly which data goes in → set all parameters and prompting in advance
mini meeting
Parameters to vary (a sweep sketch follows the lists below)
- epoch
- prompt
- random seed
Comparison criteria
- accuracy
- positive/negative
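Here is a sketch of the sweep implied by the two lists above: iterate over epochs, prompt template, and random seed, and record accuracy for each combination. `train_eval` is a hypothetical placeholder for the fine-tune-and-evaluate routine from the scripts on this page; it is not defined here.

```python
# Sweep over epochs / prompt template / random seed (sketch; train_eval is hypothetical)
from itertools import product

epochs_options = [5, 10]
prompt_options = [
    "{}",                         # bare emotion word
    "This image represents {}.",  # prompted variant used elsewhere on this page
]
seed_options = [42, 0, 1]

results = []
for n_epochs, template, seed in product(epochs_options, prompt_options, seed_options):
    # train_eval: hypothetical helper wrapping the training loop + evaluate() above
    acc = train_eval(num_epochs=n_epochs, prompt_template=template, seed=seed)
    results.append({"epochs": n_epochs, "prompt": template, "seed": seed, "accuracy": acc})

for r in sorted(results, key=lambda r: -r["accuracy"]):
    print(r)
```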
table
- CLIP vs. ViT comparison
|  |  | CLIP | ViT | ResNet |
| --- | --- | --- | --- | --- |
| Pretrain (PEFT) | loss | ≤ 0.1 | ≤ 0.1 |  |
|  | LoRA config |  |  |  |
|  | loss function | CrossEntropyLoss | CrossEntropyLoss |  |
|  | optimizer | AdamW | AdamW |  |
|  | prompt |  | - |  |
| Train (full fine-tuning) | epoch | 10 | 10 |  |
|  | loss function | CrossEntropyLoss | CrossEntropyLoss |  |
|  | optimizer | AdamW | AdamW |  |
|  | prompt |  | - |  |
| Test | random seed |  |  |  |
| Output | accuracy |  |  |  |
|  | top-N accuracy |  |  |  |
|  | t-SNE |  |  |  |
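For the "top-N accuracy" output row in the tables, here is a minimal sketch of computing it from CLIP's image-text logits; the function name and the N=2 choice are assumptions, not from the original scripts.

```python
# Top-N accuracy from logits_per_image of shape (batch, num_labels) (sketch)
import torch

def top_n_accuracy(logits_per_image: torch.Tensor, labels: torch.Tensor, n: int = 2) -> float:
    top_n = logits_per_image.topk(k=n, dim=1).indices   # indices of the n most similar texts
    hits = (top_n == labels.unsqueeze(1)).any(dim=1)     # True if the true label is among them
    return 100.0 * hits.float().mean().item()

# tiny usage example with fake logits for 3 images and 6 emotion labels
logits = torch.randn(3, 6)
labels = torch.tensor([0, 3, 5])
print(f"Top-2 accuracy: {top_n_accuracy(logits, labels, n=2):.2f}%")
```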
- Comparison of pretraining strategies
|  |  | CLIP (with EmoSet) | CLIP (with Monet) | CLIP (with crawling) | CLIP (with EmoSet + crawling) | CLIP (with Monet + crawling) | CLIP (with X) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| Pretrain (PEFT) | loss | ≤ 0.1 | ≤ 0.1 | ≤ 0.1 |  |  | - |
|  | LoRA config |  |  |  |  |  | - |
|  | loss function | CrossEntropyLoss | CrossEntropyLoss | CrossEntropyLoss | CrossEntropyLoss | CrossEntropyLoss | - |
|  | optimizer | AdamW | AdamW | AdamW | AdamW | AdamW | - |
|  | prompt |  | - |  |  |  | - |
| Train (full fine-tuning) | epoch | 10 | 10 | 10 |  |  | 10 |
|  | loss function | CrossEntropyLoss | CrossEntropyLoss | CrossEntropyLoss | CrossEntropyLoss | CrossEntropyLoss |  |
|  | optimizer | AdamW | AdamW | AdamW | AdamW | AdamW |  |
|  | prompt |  | - |  |  |  |  |
| Test | random seed |  |  |  |  |  |  |
| Output | accuracy |  |  |  |  |  |  |
|  | top-N accuracy |  |  |  |  |  |  |
|  | t-SNE |  |  |  |  |  |  |
- Staged training vs. single-pass training (mixed data)
|  |  | CLIP (staged) | CLIP (single pass) |
| --- | --- | --- | --- |
| Pretrain (PEFT) | loss | ≤ 0.1 | ≤ 0.1 |
|  | LoRA config |  |  |
|  | loss function | CrossEntropyLoss | CrossEntropyLoss |
|  | optimizer | AdamW | AdamW |
|  | prompt |  | - |
| Train (full fine-tuning) | epoch | 10 | 10 |
|  | loss function | CrossEntropyLoss | CrossEntropyLoss |
|  | optimizer | AdamW | AdamW |
|  | prompt |  | - |
| Test | random seed |  |  |
| Output | accuracy |  |  |
|  | top-N accuracy |  |  |
|  | t-SNE |  |  |
- Parameter comparison
Run the parameter comparison on whichever setting above performs best.
diffusion
Look into diffusion → seems like it can be done fairly easily.
t-SNE
Code - UGRP train (presumably with all parameters trainable) / UGRP test: 60%
from transformers import CLIPProcessor, CLIPModel from datasets import load_dataset from torch.utils.data import DataLoader import torch from transformers import AdamW from tqdm import tqdm from sklearn.manifold import TSNE import matplotlib.pyplot as plt import numpy as np # 1. 모델 및 프로세서 로드 model_name = "openai/clip-vit-base-patch32" model = CLIPModel.from_pretrained(model_name) processor = CLIPProcessor.from_pretrained(model_name) # 2. 데이터셋 로드 및 분리 dataset = load_dataset("JANGJIWON/UGRP_sketchset_textbook") split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42) # 80% train, 20% test train_dataset = split_dataset["train"] test_dataset = split_dataset["test"] # 3. 레이블을 감정 텍스트로 매핑 possible_labels = ["Happiness", "Sadness", "Disgust", "Fear", "Anger", "Surprise"] # 4. 데이터셋 처리 함수 정의 def collate_fn(samples): images = [s['image'] for s in samples] labels = [s['label'] for s in samples] inputs = processor(images=images, text=possible_labels, return_tensors="pt", padding=True) inputs['labels'] = torch.tensor(labels) return inputs # DataLoader 설정 train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn) test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn) # 5. 학습 설정 optimizer = AdamW(model.parameters(), lr=5e-5) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) num_epochs = 5 # 6. 학습 및 평가 함수 def evaluate(model, loader): model.eval() correct = 0 total = 0 with torch.no_grad(): for batch in loader: inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"} labels = batch['labels'].to(device) outputs = model(**inputs) logits = outputs.logits_per_image # 이미지-텍스트 유사도 preds = logits.argmax(dim=1) correct += (preds == labels).sum().item() total += labels.size(0) return 100 * correct / total # 7. 학습 루프 for epoch in range(1, num_epochs + 1): model.train() epoch_loss = 0 for batch in tqdm(train_loader, desc=f"Epoch {epoch}/{num_epochs}"): inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"} labels = batch['labels'].to(device) outputs = model(**inputs) loss = torch.nn.functional.cross_entropy(outputs.logits_per_image, labels) loss.backward() optimizer.step() optimizer.zero_grad() epoch_loss += loss.item() print(f"Epoch {epoch} Loss: {epoch_loss / len(train_loader):.4f}") # Train 정확도 train_acc = evaluate(model, train_loader) print(f"Train Accuracy: {train_acc:.2f}%") # Test 정확도 test_acc = evaluate(model, test_loader) print(f"Test Accuracy: {test_acc:.2f}%") # 8. 
t-SNE 시각화를 위한 임베딩 추출 def extract_embeddings(model, loader): model.eval() image_embeddings = [] labels = [] with torch.no_grad(): for batch in loader: inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"} batch_labels = batch['labels'].numpy() outputs = model(**inputs) image_embeds = outputs.image_embeds.cpu().numpy() image_embeddings.append(image_embeds) labels.extend(batch_labels) image_embeddings = np.vstack(image_embeddings) return image_embeddings, np.array(labels) # Train 데이터에서 임베딩 추출 image_embeds, embed_labels = extract_embeddings(model, train_loader) # t-SNE 적용 tsne = TSNE(n_components=2, random_state=42) image_tsne = tsne.fit_transform(image_embeds) # 시각화 plt.figure(figsize=(10, 7)) scatter = plt.scatter(image_tsne[:, 0], image_tsne[:, 1], c=embed_labels, cmap='viridis', alpha=0.7) plt.colorbar(scatter, ticks=range(len(possible_labels)), label='Emotion Labels') plt.title('t-SNE Visualization of Image Embeddings') plt.xlabel('t-SNE 1') plt.ylabel('t-SNE 2') plt.show() # Test 데이터에서 임베딩 추출 test_image_embeds, test_embed_labels = extract_embeddings(model, test_loader) # t-SNE 적용 test_image_tsne = tsne.fit_transform(test_image_embeds) # 시각화 plt.figure(figsize=(10, 7)) scatter = plt.scatter(test_image_tsne[:, 0], test_image_tsne[:, 1], c=test_embed_labels, cmap='viridis', alpha=0.7) plt.colorbar(scatter, ticks=range(len(possible_labels)), label='Emotion Labels') plt.title('t-SNE Visualization of Test Image Embeddings') plt.xlabel('t-SNE 1') plt.ylabel('t-SNE 2') plt.show()

Code - EmoSet (Monet style) train / UGRP train (presumably with all parameters trainable) / UGRP test: 60%
from transformers import CLIPProcessor, CLIPModel from datasets import load_dataset from torch.utils.data import DataLoader import torch from transformers import AdamW from tqdm import tqdm from peft import get_peft_model, LoraConfig from peft import PeftModel from huggingface_hub import hf_hub_download # 1. 모델 및 프로세서 로드 model_name = "openai/clip-vit-base-patch32" model = CLIPModel.from_pretrained(model_name) processor = CLIPProcessor.from_pretrained(model_name) model_weights_path = hf_hub_download(repo_id="JANGJIWON/EmoSet118K_MonetStyle_CLIP_student", filename="best_model_epoch_17.pth") model.load_state_dict(torch.load(model_weights_path, map_location='cpu'), strict=False) # 2. 데이터셋 로드 및 분리 dataset = load_dataset("JANGJIWON/UGRP_sketchset_textbook") split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42) # 80% train, 20% test train_dataset = split_dataset["train"] test_dataset = split_dataset["test"] # 3. 레이블을 감정 텍스트로 매핑 possible_labels = ["Happiness", "Sadness", "Disgust", "Fear", "Anger", "Surprise"] # 4. 데이터셋 처리 함수 정의 def collate_fn(samples): images = [s['image'] for s in samples] labels = [s['label'] for s in samples] inputs = processor(images=images, text=possible_labels, return_tensors="pt", padding=True) inputs['labels'] = torch.tensor(labels) return inputs # DataLoader 설정 train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn) test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn) # 5. 학습 설정 optimizer = AdamW(model.parameters(), lr=5e-5) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) num_epochs = 1 # 6. 학습 및 평가 함수 def evaluate(model, loader): model.eval() correct = 0 total = 0 with torch.no_grad(): for batch in loader: inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"} labels = batch['labels'].to(device) outputs = model(**inputs) logits = outputs.logits_per_image # 이미지-텍스트 유사도 preds = logits.argmax(dim=1) correct += (preds == labels).sum().item() total += labels.size(0) return 100 * correct / total # 7. 학습 루프 for epoch in range(1, num_epochs + 1): model.train() epoch_loss = 0 for batch in tqdm(train_loader, desc=f"Epoch {epoch}/{num_epochs}"): inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"} labels = batch['labels'].to(device) outputs = model(**inputs) loss = torch.nn.functional.cross_entropy(outputs.logits_per_image, labels) loss.backward() optimizer.step() optimizer.zero_grad() epoch_loss += loss.item() print(f"Epoch {epoch} Loss: {epoch_loss / len(train_loader):.4f}") # Train 정확도 train_acc = evaluate(model, train_loader) print(f"Train Accuracy: {train_acc:.2f}%") # Test 정확도 test_acc = evaluate(model, test_loader) print(f"Test Accuracy: {test_acc:.2f}%") # 8. 
t-SNE 시각화를 위한 임베딩 추출 def extract_embeddings(model, loader): model.eval() image_embeddings = [] labels = [] with torch.no_grad(): for batch in loader: inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"} batch_labels = batch['labels'].numpy() outputs = model(**inputs) image_embeds = outputs.image_embeds.cpu().numpy() image_embeddings.append(image_embeds) labels.extend(batch_labels) image_embeddings = np.vstack(image_embeddings) return image_embeddings, np.array(labels) # Train 데이터에서 임베딩 추출 image_embeds, embed_labels = extract_embeddings(model, train_loader) # t-SNE 적용 tsne = TSNE(n_components=2, random_state=42) image_tsne = tsne.fit_transform(image_embeds) # 시각화 plt.figure(figsize=(10, 7)) scatter = plt.scatter(image_tsne[:, 0], image_tsne[:, 1], c=embed_labels, cmap='viridis', alpha=0.7) plt.colorbar(scatter, ticks=range(len(possible_labels)), label='Emotion Labels') plt.title('t-SNE Visualization of Image Embeddings') plt.xlabel('t-SNE 1') plt.ylabel('t-SNE 2') plt.show() # Test 데이터에서 임베딩 추출 test_image_embeds, test_embed_labels = extract_embeddings(model, test_loader) # t-SNE 적용 test_image_tsne = tsne.fit_transform(test_image_embeds) # 시각화 plt.figure(figsize=(10, 7)) scatter = plt.scatter(test_image_tsne[:, 0], test_image_tsne[:, 1], c=test_embed_labels, cmap='viridis', alpha=0.7) plt.colorbar(scatter, ticks=range(len(possible_labels)), label='Emotion Labels') plt.title('t-SNE Visualization of Test Image Embeddings') plt.xlabel('t-SNE 1') plt.ylabel('t-SNE 2') plt.show()



3. Silhouette Score
- The Silhouette Score is a metric for clustering quality: it measures how well each data point fits within its assigned cluster.
- Computing the Silhouette Score on the t-SNE-projected data evaluates both within-cluster cohesion and between-cluster separation.
- This makes it a useful way to assess cluster quality.
from sklearn.metrics import silhouette_score

silhouette_avg = silhouette_score(image_tsne, embed_labels)
print(f'Silhouette Score: {silhouette_avg:.4f}')
Payment
Todo
Rewrite the GPT (OpenAI API) code
Collect the crawled images
Build the mixed dataset (see the sketch below)
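For the mixed-dataset todo, a minimal sketch that concatenates two of the datasets already used on this page with Hugging Face `datasets`. It assumes their `label` columns share the same index-to-emotion mapping and compatible features; the scripts above use different label orders for EmoSet and the UGRP sketch set, so a remapping step would be needed first (not shown).

```python
# Build a mixed training set by concatenating existing datasets (sketch; label
# remapping / feature alignment between the two sets is assumed, not shown)
from datasets import load_dataset, concatenate_datasets

monet = load_dataset("xodhks/EmoSet118K_MonetStyle", split="train")
sketch = load_dataset("JANGJIWON/UGRP_sketchset_textbook", split="train")

mixed = concatenate_datasets([monet, sketch]).shuffle(seed=42)
print(mixed)
```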