import os
import torch
from torchvision import models, transforms
from torch.utils.data import DataLoader
from torch.optim import Adam
from torchvision.models import resnet50, ResNet50_Weights
import torch.nn as nn
from datasets import load_dataset
from PIL import Image
# GPU setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device:", device)
# Load datasets
train_dataset = load_dataset("crawling-emotions-in-google-train", split="train")
test_dataset = load_dataset("JANGJIWON/UGRP_sketchset_textbook", split="train")
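# Both datasets are assumed to expose 'image' and 'label' columns, which the code below relies on.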
# Valid label indices for the test dataset
test_valid_label_indices = [0, 1, 2, 3, 4, 5]
# Load an ImageNet-pretrained ResNet50
weights = ResNet50_Weights.DEFAULT
model = resnet50(weights=weights)
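# Assumption: replace the 1000-class ImageNet head with a 6-way emotion classifier
# matching test_valid_label_indices; the pretrained head does not match this label space.
model.fc = nn.Linear(model.fc.in_features, len(test_valid_label_indices))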
model = model.to(device)
# Create a directory for model checkpoints
os.makedirs("top_models", exist_ok=True)
top_models = []
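# top_models tracks (accuracy, checkpoint_path) tuples for the best epochs seen so far.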
# Preprocessing pipeline (224x224 inputs, ImageNet normalization statistics)
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
# Dataset preprocessing function
def preprocess_data(batch):
    # Images may arrive as PIL objects or as file paths; single images are wrapped
    # in a list so the transform step below always receives a list
    if isinstance(batch['image'], list):
        batch['image'] = [img.convert("RGB") if isinstance(img, Image.Image) else Image.open(img).convert("RGB") for img in batch['image']]
    elif isinstance(batch['image'], Image.Image):
        batch['image'] = [batch['image'].convert("RGB")]
    else:
        batch['image'] = [Image.open(batch['image']).convert("RGB")]
    # Apply the torchvision transform
    batch['image'] = [transform(img) for img in batch['image']]
    return batch
train_dataset = train_dataset.with_transform(preprocess_data)
test_dataset = test_dataset.with_transform(preprocess_data)
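# Note: with_transform applies preprocess_data lazily at access time instead of
# materializing transformed copies of the datasets.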
# DataLoader setup
def collate_fn(batch):
    images = [item['image'] for item in batch]
    labels = [item['label'] for item in batch]
    # Convert to tensors
    images = torch.stack(images, dim=0)
    labels = torch.tensor(labels, dtype=torch.long)
    return images, labels
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn, num_workers=2)
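# Optional sanity check (hypothetical; assumes the loaders above were built correctly):
#   images, labels = next(iter(train_loader))
#   print(images.shape, labels.shape)  # expected: torch.Size([32, 3, 224, 224]) torch.Size([32])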
# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=1e-4)
# Evaluation function
def evaluate(model, data_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in data_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, preds = torch.max(outputs, 1)
            for pred, label in zip(preds, labels):
                # Only predictions within the valid test label range can count as correct;
                # every sample contributes to the total
                if pred.item() in test_valid_label_indices:
                    if pred.item() == label.item():
                        correct += 1
                total += 1
    accuracy = 100 * correct / total
    return accuracy
# Model checkpoint saving function
def save_top_models(epoch, accuracy, model, top_models):
    model_filename = f"model_epoch_{epoch + 1}_accuracy_{accuracy:.2f}.pth"
    model_path = os.path.join("top_models", model_filename)
    top_models.append((accuracy, model_path))
    top_models = sorted(top_models, key=lambda x: x[0], reverse=True)[:10]
    torch.save(model.state_dict(), model_path)
    print("\nTop 10 Models (by accuracy):")
    for i, (acc, path) in enumerate(top_models, 1):
        print(f"Rank {i}: Accuracy = {acc:.2f}%, Model Path = {path}")
    return top_models
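# Note: checkpoints that drop out of the ranked list are not deleted from disk;
# only the in-memory top-10 ranking is truncated.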
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")
    test_accuracy = evaluate(model, test_loader)
    print(f"Test Accuracy after Epoch {epoch+1}: {test_accuracy:.2f}%")
    top_models = save_top_models(epoch, test_accuracy, model, top_models)
print("Finished Training")
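# Example (hypothetical usage): reload the best-ranked checkpoint for later inference.
#   best_accuracy, best_path = top_models[0]
#   model.load_state_dict(torch.load(best_path, map_location=device))
#   model.eval()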
import torch
import clip
from PIL import Image
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# Device setup
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the CLIP model and its preprocessing pipeline
model, preprocess = clip.load("ViT-B/32", device=device)
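# preprocess is the transform pipeline returned by clip.load; it resizes, center-crops,
# and normalizes images to the input format expected by ViT-B/32.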
# Load and preprocess the image
image_path = "path_to_your_image.jpg"  # path to the image to analyze
image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)
# Candidate text descriptions (emotion labels)
text_descriptions = [
    "happy",
    "sadness",
    "anger",
    "surprise",
    "fear",
    "disgust"
]
text_tokens = clip.tokenize(text_descriptions).to(device)
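# clip.tokenize pads/truncates each description to CLIP's fixed 77-token context length.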
# Extract image and text features
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text_tokens)
    # Compute image-text similarity
    logits_per_image, logits_per_text = model(image, text_tokens)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()
# Select the text description with the highest probability
predicted_description = text_descriptions[probs.argmax()]
# Load the LLM (GPT-2) -> can be swapped for another model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
# Prompt for generating emotion and intention
prompt = f"The image shows {predicted_description}. The person's emotion and intention are:"
# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors="pt").to(device)
# Generate text
gpt2_model.eval()
with torch.no_grad():
    outputs = gpt2_model.generate(
        inputs.input_ids,
        max_length=50,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        early_stopping=True,
        pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token; reuse EOS to avoid a warning
    )
# Decode the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)