What exactly is the FL (plain fine-tuning) that LoRA and Adapter papers use as their comparison baseline?
[FL]
what
The fine-tuning used as the comparison baseline usually means "traditional" fine-tuning, i.e., tuning that updates some or all of the model's weights.
Using the four methods covered on the page above, freezing parameters should still give a memory benefit!?
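As a quick sanity check on that memory claim, here is a small sketch I added myself (not from the page above) that counts trainable vs. total parameters of a timm ViT before and after freezing everything except the head:

import timm

# Hypothetical helper for comparing parameter counts (my addition, not from the notes above)
def count_params(model):
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total, trainable

model = timm.create_model('vit_base_patch16_224', pretrained=False, num_classes=8)

# Before freezing: everything is trainable (traditional full fine-tuning)
print(count_params(model))   # roughly (86M, 86M) for ViT-Base

# Freeze everything except the classification head
for name, param in model.named_parameters():
    if 'head' not in name:
        param.requires_grad = False

# After freezing: only the head's weight and bias remain trainable (768 * 8 + 8 params)
print(count_params(model))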
code → add FC layer
Code I got by asking GPT
import math
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import Subset, DataLoader
from torch.utils.data.dataset import Dataset
from PIL import Image
import timm

# Load the ViT model and its preprocessing
def load_vit_model():
    model = timm.create_model('vit_base_patch16_224', pretrained=True)
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ])
    return model, preprocess

class FilteredCIFAR10(Dataset):
    def __init__(self, root, train=True, transform=None, download=False):
        self.cifar10 = datasets.CIFAR10(root=root, train=train, transform=transform, download=download)
        self.data = []
        self.targets = []
        for img, target in zip(self.cifar10.data, self.cifar10.targets):
            if target < 8:  # Only keep classes 0-7
                self.data.append(img)
                self.targets.append(target)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img, target = self.data[idx], self.targets[idx]
        img = Image.fromarray(img)
        if self.cifar10.transform:
            img = self.cifar10.transform(img)
        return img, target

class ViTFineTuning(nn.Module):
    def __init__(self, base_model, num_classes):
        super(ViTFineTuning, self).__init__()
        self.base_model = base_model
        self.fc = nn.Linear(self.base_model.head.in_features, num_classes)  # Add a new fully connected layer
        self.base_model.head = self.fc  # Replace the model's head with the new FC layer

    def forward(self, x):
        x = self.base_model.forward_features(x)
        x = x[:, 0]  # Use only the first (class) token
        x = self.fc(x)
        return x

# Use the GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the ViT model and the preprocessing function
base_model, preprocess = load_vit_model()
model = ViTFineTuning(base_model, num_classes=8).to(device)  # num_classes changed

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)  # Learning rate scheduler

# Load the data with the preprocessing function
train_dataset = FilteredCIFAR10(root='./data', train=True, download=True, transform=preprocess)
test_dataset = FilteredCIFAR10(root='./data', train=False, download=True, transform=preprocess)

# Select 100 samples each from the training and test datasets
train_subset = Subset(train_dataset, range(100))
test_subset = Subset(test_dataset, range(100))

# Data loaders
train_loader = DataLoader(train_subset, batch_size=64, shuffle=True, num_workers=2)
test_loader = DataLoader(test_subset, batch_size=64, shuffle=False, num_workers=2)

# Training loop (kept simple)
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    scheduler.step()  # Learning rate scheduler step
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    # Compute accuracy after each epoch
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        accuracy = 100 * correct / total
        print(f'Accuracy after epoch {epoch+1}: {accuracy:.2f} %')

print('Finished Training')

# Final accuracy
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    print(f'Final Test Accuracy: {100 * correct / total:.2f} %')
Code modified so that some layers are frozen and only the rest are trained
import math
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import Subset, DataLoader
from torch.utils.data.dataset import Dataset
from PIL import Image
import timm

# Load the ViT model and its preprocessing
def load_vit_model():
    model = timm.create_model('vit_base_patch16_224', pretrained=True)
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ])
    return model, preprocess

class FilteredCIFAR10(Dataset):
    def __init__(self, root, train=True, transform=None, download=False):
        self.cifar10 = datasets.CIFAR10(root=root, train=train, transform=transform, download=download)
        self.data = []
        self.targets = []
        for img, target in zip(self.cifar10.data, self.cifar10.targets):
            if target < 8:  # Only keep classes 0-7
                self.data.append(img)
                self.targets.append(target)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img, target = self.data[idx], self.targets[idx]
        img = Image.fromarray(img)
        if self.cifar10.transform:
            img = self.cifar10.transform(img)
        return img, target

class ViTFineTuning(nn.Module):
    def __init__(self, base_model, num_classes):
        super(ViTFineTuning, self).__init__()
        self.base_model = base_model
        self.fc = nn.Linear(self.base_model.head.in_features, num_classes)  # Add a new fully connected layer
        self.base_model.head = self.fc  # Replace the model's head with the new FC layer

    def forward(self, x):
        x = self.base_model.forward_features(x)
        x = x[:, 0]  # Use only the first (class) token
        x = self.fc(x)
        return x

# Use the GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the ViT model and the preprocessing function
base_model, preprocess = load_vit_model()
model = ViTFineTuning(base_model, num_classes=8).to(device)  # num_classes changed

# Train only some layers and freeze the rest
for name, param in model.named_parameters():
    if 'head' not in name:  # Freeze every parameter except the head
        param.requires_grad = False

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=0.01, momentum=0.9)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)  # Learning rate scheduler

# Load the data with the preprocessing function
train_dataset = FilteredCIFAR10(root='./data', train=True, download=True, transform=preprocess)
test_dataset = FilteredCIFAR10(root='./data', train=False, download=True, transform=preprocess)

# Select 100 samples each from the training and test datasets
train_subset = Subset(train_dataset, range(100))
test_subset = Subset(test_dataset, range(100))

# Data loaders
train_loader = DataLoader(train_subset, batch_size=64, shuffle=True, num_workers=2)
test_loader = DataLoader(test_subset, batch_size=64, shuffle=False, num_workers=2)

# Training loop (kept simple)
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    scheduler.step()  # Learning rate scheduler step
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    # Compute accuracy after each epoch
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        accuracy = 100 * correct / total
        print(f'Accuracy after epoch {epoch+1}: {accuracy:.2f} %')

print('Finished Training')

# Final accuracy
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    print(f'Final Test Accuracy: {100 * correct / total:.2f} %')
Brief explanation of the code
: The ViT model's head is replaced with a new FC layer, and that layer is trained.
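(Side note I added: assuming timm's ViT, the same head swap can be done without the wrapper class by assigning a new nn.Linear to model.head directly. A minimal sketch, not the code actually used above:)

import timm
import torch
import torch.nn as nn

model = timm.create_model('vit_base_patch16_224', pretrained=True)

# Replace only the classification head with a fresh FC layer for 8 classes
model.head = nn.Linear(model.head.in_features, 8)

# The rest of the network is untouched; timm's own forward() now ends in the new head
x = torch.randn(1, 3, 224, 224)
print(model(x).shape)  # torch.Size([1, 8])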
result → add FC layer
It actually works well?
It works really well..
Epoch [1/20], Loss: 1.7136    Accuracy after epoch 1: 45.00 %
Epoch [2/20], Loss: 1.1403    Accuracy after epoch 2: 73.00 %
Epoch [3/20], Loss: 0.2301    Accuracy after epoch 3: 89.00 %
Epoch [4/20], Loss: 0.0677    Accuracy after epoch 4: 93.00 %
Epoch [5/20], Loss: 0.0132    Accuracy after epoch 5: 95.00 %
Epoch [6/20], Loss: 0.0053    Accuracy after epoch 6: 93.00 %
Epoch [7/20], Loss: 0.0082    Accuracy after epoch 7: 93.00 %
Epoch [8/20], Loss: 0.0074    Accuracy after epoch 8: 93.00 %
Epoch [9/20], Loss: 0.0075    Accuracy after epoch 9: 93.00 %
Epoch [10/20], Loss: 0.0085   Accuracy after epoch 10: 93.00 %
Epoch [11/20], Loss: 0.0031   Accuracy after epoch 11: 93.00 %
Epoch [12/20], Loss: 0.0016   Accuracy after epoch 12: 93.00 %
Epoch [13/20], Loss: 0.0075   Accuracy after epoch 13: 94.00 %
Epoch [14/20], Loss: 0.0049   Accuracy after epoch 14: 94.00 %
Epoch [15/20], Loss: 0.0034   Accuracy after epoch 15: 94.00 %
Epoch [16/20], Loss: 0.0045   Accuracy after epoch 16: 94.00 %
Epoch [17/20], Loss: 0.0067   Accuracy after epoch 17: 94.00 %
Epoch [18/20], Loss: 0.0019   Accuracy after epoch 18: 94.00 %
Epoch [19/20], Loss: 0.0058   Accuracy after epoch 19: 94.00 %
Epoch [20/20], Loss: 0.0061   Accuracy after epoch 20: 94.00 %
Finished Training
Final Test Accuracy: 94.00 %
→ Here I replaced and trained only the FC layer, i.e., the output layer. This high accuracy is presumably possible because the ImageNet and CIFAR-10 images are fairly similar to each other.
→ And since only the FC layer was trained, training was naturally bound to be faster than with LoRA!
Q
Is FL always worse than LoRA or Adapter?
Nope. If the dataset is similar to the pretrained model's dataset, FL can give better results.
In this case, the new dataset (CIFAR-10) is similar to the original ImageNet data, which seems to be why this result came out.
Why was FL faster than LoRA? / Why did it use less memory?
Probably because only the FC layer was trained this time.
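For contrast with the Q&A above, a minimal LoRA-style linear layer sketch I wrote myself (a simplified illustration, not the code from the LoRA page): the pretrained weight stays frozen and only the small low-rank matrices A and B are trained, so the trainable parameter count stays small even when many layers are adapted.

import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Wrap a frozen pretrained linear layer with a trainable low-rank update (simplified sketch)."""
    def __init__(self, base_linear: nn.Linear, r: int = 4, alpha: float = 8.0):
        super().__init__()
        self.base = base_linear
        self.base.weight.requires_grad = False          # freeze the pretrained weight
        if self.base.bias is not None:
            self.base.bias.requires_grad = False        # freeze the pretrained bias
        in_f, out_f = base_linear.in_features, base_linear.out_features
        self.lora_A = nn.Parameter(torch.randn(r, in_f) * 0.01)  # trainable
        self.lora_B = nn.Parameter(torch.zeros(out_f, r))        # trainable, zero-initialized
        self.scale = alpha / r

    def forward(self, x):
        # y = frozen base output + scaled low-rank update (x A^T B^T)
        return self.base(x) + self.scale * (x @ self.lora_A.T @ self.lora_B.T)

# Example: adapt a 768 -> 768 projection like the ones inside a ViT block (hypothetical usage)
layer = LoRALinear(nn.Linear(768, 768), r=4)
print(sum(p.numel() for p in layer.parameters() if p.requires_grad))  # 2 * 4 * 768 = 6144 trainable params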
code → add FC layer and train some layer
Train some of the layers!
What's the difference between a block and a layer?
I don't know the details yet…
A block seems to be a term used in transformer models..
A bundle of several layers seems to be called a block.. (in ViT-Base there are 12 blocks, and each block contains a multi-head self-attention layer and an MLP)
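A quick way to check what timm means by a "block" (a small sketch I added, not from the notes): each entry of model.blocks is one transformer encoder block, and its parameter names show which prefixes the freezing loop below matches on.

import timm

model = timm.create_model('vit_base_patch16_224', pretrained=False)

print(len(model.blocks))   # 12 blocks in ViT-Base
print(model.blocks[11])    # the last block: norm1 -> attention -> norm2 -> MLP

# Full parameter names of the last block, as seen by model.named_parameters()
for name, _ in model.blocks[11].named_parameters():
    print('blocks.11.' + name)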
This is the part that trains only some of the blocks:
# Train only some layers and freeze the rest
for name, param in model.named_parameters():
    if 'head' not in name and 'blocks.10' not in name and 'blocks.11' not in name:
        # Freeze everything except the last two blocks and the head
        param.requires_grad = False
import math
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import Subset, DataLoader
from torch.utils.data.dataset import Dataset
from PIL import Image
import timm

# Load the ViT model and its preprocessing
def load_vit_model():
    model = timm.create_model('vit_base_patch16_224', pretrained=True)
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ])
    return model, preprocess

class FilteredCIFAR10(Dataset):
    def __init__(self, root, train=True, transform=None, download=False):
        self.cifar10 = datasets.CIFAR10(root=root, train=train, transform=transform, download=download)
        self.data = []
        self.targets = []
        for img, target in zip(self.cifar10.data, self.cifar10.targets):
            if target < 8:  # Only keep classes 0-7
                self.data.append(img)
                self.targets.append(target)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img, target = self.data[idx], self.targets[idx]
        img = Image.fromarray(img)
        if self.cifar10.transform:
            img = self.cifar10.transform(img)
        return img, target

class ViTFineTuning(nn.Module):
    def __init__(self, base_model, num_classes):
        super(ViTFineTuning, self).__init__()
        self.base_model = base_model
        self.fc = nn.Linear(self.base_model.head.in_features, num_classes)  # Add a new fully connected layer
        self.base_model.head = self.fc  # Replace the model's head with the new FC layer

    def forward(self, x):
        x = self.base_model.forward_features(x)
        x = x[:, 0]  # Use only the first (class) token
        x = self.fc(x)
        return x

# Use the GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the ViT model and the preprocessing function
base_model, preprocess = load_vit_model()
model = ViTFineTuning(base_model, num_classes=8).to(device)  # num_classes changed

# Train only some layers and freeze the rest
for name, param in model.named_parameters():
    if 'head' not in name and 'blocks.10' not in name and 'blocks.11' not in name:
        # Freeze everything except the last two blocks and the head
        param.requires_grad = False

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=0.01, momentum=0.9)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)  # Learning rate scheduler

# Load the data with the preprocessing function
train_dataset = FilteredCIFAR10(root='./data', train=True, download=True, transform=preprocess)
test_dataset = FilteredCIFAR10(root='./data', train=False, download=True, transform=preprocess)

# Select 100 samples each from the training and test datasets
train_subset = Subset(train_dataset, range(100))
test_subset = Subset(test_dataset, range(100))

# Data loaders
train_loader = DataLoader(train_subset, batch_size=64, shuffle=True, num_workers=2)
test_loader = DataLoader(test_subset, batch_size=64, shuffle=False, num_workers=2)

# Training loop (kept simple)
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    scheduler.step()  # Learning rate scheduler step
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    # Compute accuracy after each epoch
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        accuracy = 100 * correct / total
        print(f'Accuracy after epoch {epoch+1}: {accuracy:.2f} %')

print('Finished Training')

# Final accuracy
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    print(f'Final Test Accuracy: {100 * correct / total:.2f} %')
result → add FC layer and train some layer
Good!
Epoch [1/20], Loss: 1.8082    Accuracy after epoch 1: 52.00 %
Epoch [2/20], Loss: 0.6429    Accuracy after epoch 2: 93.00 %
Epoch [3/20], Loss: 0.0665    Accuracy after epoch 3: 92.00 %
Epoch [4/20], Loss: 0.0219    Accuracy after epoch 4: 94.00 %
Epoch [5/20], Loss: 0.0014    Accuracy after epoch 5: 94.00 %
Epoch [6/20], Loss: 0.0002    Accuracy after epoch 6: 94.00 %
Epoch [7/20], Loss: 0.0000    Accuracy after epoch 7: 95.00 %
Epoch [8/20], Loss: 0.0002    Accuracy after epoch 8: 95.00 %
Epoch [9/20], Loss: 0.0000    Accuracy after epoch 9: 95.00 %
Epoch [10/20], Loss: 0.0003   Accuracy after epoch 10: 95.00 %
Epoch [11/20], Loss: 0.0001   Accuracy after epoch 11: 95.00 %
Epoch [12/20], Loss: 0.0002   Accuracy after epoch 12: 95.00 %
Epoch [13/20], Loss: 0.0000   Accuracy after epoch 13: 95.00 %
Epoch [14/20], Loss: 0.0000   Accuracy after epoch 14: 95.00 %
Epoch [15/20], Loss: 0.0000   Accuracy after epoch 15: 95.00 %
Epoch [16/20], Loss: 0.0000   Accuracy after epoch 16: 95.00 %
Epoch [17/20], Loss: 0.0000   Accuracy after epoch 17: 95.00 %
Epoch [18/20], Loss: 0.0003   Accuracy after epoch 18: 95.00 %
Epoch [19/20], Loss: 0.0004   Accuracy after epoch 19: 95.00 %
Epoch [20/20], Loss: 0.0002   Accuracy after epoch 20: 95.00 %
Finished Training
code without GPT…
I really should try writing the code without GPT too…