[Open Source Internship] AltCLIP model fine-tuning #2049

Open · wants to merge 1 commit into base: master
179 changes: 179 additions & 0 deletions llm/finetune/altclip/altclip_mindnlp.py
@@ -0,0 +1,179 @@
import os
# Use the Hugging Face mirror endpoint to speed up model downloads
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import mindspore as ms
from mindspore import nn, context, Tensor, save_checkpoint
from mindspore.common.initializer import Normal
from mindspore.dataset import Cifar10Dataset, GeneratorDataset
from mindspore.dataset.vision import Resize, ToTensor, Normalize
from mindspore.dataset.transforms import TypeCast
import mindspore.common.dtype as mstype
from mindnlp.transformers import CLIPImageProcessor, AltCLIPVisionModel, AltCLIPVisionConfig

# Run in PyNative mode on Ascend (change device_target to "GPU" or "CPU" if no NPU is available)
context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend")


# Load the pretrained AltCLIP image processor and vision encoder
model_name = "BAAI/AltCLIP"
image_processor = CLIPImageProcessor.from_pretrained(model_name)
vision_model = AltCLIPVisionModel.from_pretrained(model_name)
vision_config = AltCLIPVisionConfig.from_pretrained(model_name)

# Unfreeze all vision encoder parameters for full fine-tuning
for param in vision_model.get_parameters():
    param.requires_grad = True

# AltCLIP vision backbone with a linear classification head on top
class ImageClassifier(nn.Cell):
    def __init__(self, vision_model, num_classes=10):
        super().__init__()
        self.vision_model = vision_model
        self.classifier = nn.Dense(vision_model.config.hidden_size, num_classes, weight_init=Normal(0.02))

    def construct(self, pixel_values):
        outputs = self.vision_model(pixel_values=pixel_values)
        pooled_output = outputs.pooler_output
        logits = self.classifier(pooled_output)
        return logits

# Instantiate the classification model
model = ImageClassifier(vision_model, num_classes=10)

# Image preprocessing: resize to 224x224, convert to CHW tensor, normalize with the CLIP statistics
mean = image_processor.image_mean
std = image_processor.image_std

transform = [
    Resize((224, 224)),
    ToTensor(),
    TypeCast(mstype.float32),
    Normalize(mean=mean, std=std, is_hwc=False)
]

# Data loading: read CIFAR-10 (binary format) and split it into train/validation sets
def load_cifar10_data(batch_size=64):
    type_cast_label = TypeCast(mstype.int32)
    dataset = Cifar10Dataset("./cifar10_data_bin", usage='train', shuffle=False)
    dataset = dataset.map(operations=transform, input_columns="image")

    # Note: the entire train split is decoded into memory before the train/val split
    images, labels = [], []
    for item in dataset.create_dict_iterator():
        images.append(item["image"].asnumpy())
        labels.append(item["label"].asnumpy())

    images = np.array(images)
    labels = np.array(labels)

    train_idx, val_idx = train_test_split(np.arange(len(images)), test_size=0.2, random_state=42)
    train_data = [(images[i], labels[i]) for i in train_idx]
    val_data = [(images[i], labels[i]) for i in val_idx]

    train_loader = GeneratorDataset(train_data, column_names=["image", "label"], shuffle=True)
    train_loader = train_loader.map(operations=type_cast_label, input_columns="label").batch(batch_size)

    val_loader = GeneratorDataset(val_data, column_names=["image", "label"], shuffle=False)
    val_loader = val_loader.map(operations=type_cast_label, input_columns="label").batch(batch_size)

    return train_loader, val_loader

# Custom learning-rate scheduler: linear warmup followed by linear decay
class WarmupDecayLR:
    def __init__(self, base_lr, total_steps, warmup_steps):
        self.base_lr = base_lr
        self.total_steps = total_steps
        self.warmup_steps = warmup_steps
        self.step_num = 0

    def step(self):
        self.step_num += 1
        if self.step_num < self.warmup_steps:
            return self.base_lr * self.step_num / self.warmup_steps
        else:
            decay_factor = (self.total_steps - self.step_num) / max(self.total_steps - self.warmup_steps, 1)
            return self.base_lr * decay_factor

# Training loop: fine-tune with AdamWeightDecay and evaluate after every epoch
def train_model(model, train_loader, val_loader, epochs=10, base_lr=2e-5):
    loss_fn = nn.CrossEntropyLoss()

    # Parameter grouping: no weight decay on biases and LayerNorm parameters
    decay_params = []
    no_decay_params = []
    for param in model.trainable_params():
        pname = param.name.lower()
        if "bias" in pname or "layernorm" in pname:
            no_decay_params.append(param)
        else:
            decay_params.append(param)

    group_params = [
        {"params": decay_params, "weight_decay": 0.01},
        {"params": no_decay_params, "weight_decay": 0.0},
        {"order_params": model.trainable_params()}
    ]

    optimizer = nn.AdamWeightDecay(params=group_params, learning_rate=base_lr)

    total_steps = train_loader.get_dataset_size() * epochs
    scheduler = WarmupDecayLR(base_lr, total_steps, warmup_steps=int(0.1 * total_steps))

    net_with_loss = nn.WithLossCell(model, loss_fn)
    train_network = nn.TrainOneStepCell(net_with_loss, optimizer)
    train_network.set_train()

    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")
        total_loss = 0

        for images, labels in tqdm(train_loader.create_tuple_iterator(), desc="Training"):
            # Write the scheduled value into the optimizer's learning-rate parameter in place
            lr = scheduler.step()
            optimizer.learning_rate.set_data(Tensor(lr, dtype=ms.float32))

            loss = train_network(images, labels)
            total_loss += loss.asnumpy()

        avg_loss = total_loss / train_loader.get_dataset_size()
        print(f"Train Loss: {avg_loss:.4f}")

        # Validation phase
        model.set_train(False)
        all_preds, all_labels = [], []
        val_loss_total = 0

        for images, labels in tqdm(val_loader.create_tuple_iterator(), desc="Validation"):
            logits = model(images)
            loss = loss_fn(logits, labels)
            val_loss_total += loss.asnumpy()

            preds = logits.argmax(axis=1).asnumpy()
            all_preds.extend(preds)
            all_labels.extend(labels.asnumpy())

        val_loss_avg = val_loss_total / val_loader.get_dataset_size()
        acc = accuracy_score(all_labels, all_preds)
        f1 = f1_score(all_labels, all_preds, average="macro")

        print(f"Validation Loss: {val_loss_avg:.4f}")
        print(f"Validation Accuracy: {acc:.4f}")
        print(f"Validation F1 Score: {f1:.4f}")
        model.set_train(True)

    return model

# Entry point
def main():
    print("Loading CIFAR-10 data...")
    train_loader, val_loader = load_cifar10_data(batch_size=32)

    print("Training the AltCLIP image classification model (MindNLP)...")
    trained_model = train_model(model, train_loader, val_loader, epochs=10)

    save_checkpoint(trained_model, "altclip_mindnlp.ckpt")
    print("Model saved to altclip_mindnlp.ckpt")

if __name__ == "__main__":
    main()
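
Note: the MindSpore script assumes the CIFAR-10 binary release is already unpacked under ./cifar10_data_bin, and the diff does not include a download step. A minimal preparation sketch is shown below; it assumes network access to the standard University of Toronto mirror, and the prepare_cifar10_binary helper name is only illustrative, not part of the PR.

import os
import tarfile
import urllib.request

# Illustrative helper (not in this PR): fetch and unpack the CIFAR-10 binary
# archive so that mindspore.dataset.Cifar10Dataset can read it.
def prepare_cifar10_binary(target_dir="./cifar10_data_bin"):
    url = "https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz"
    archive = "cifar-10-binary.tar.gz"
    if not os.path.exists(archive):
        urllib.request.urlretrieve(url, archive)
    os.makedirs(target_dir, exist_ok=True)
    with tarfile.open(archive, "r:gz") as tar:
        # The archive unpacks into a cifar-10-batches-bin/ subdirectory;
        # Cifar10Dataset expects the directory that directly contains the *.bin
        # files, so point the script at that subdirectory if needed.
        tar.extractall(target_dir)

if __name__ == "__main__":
    prepare_cifar10_binary()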
133 changes: 133 additions & 0 deletions llm/finetune/altclip/altclip_pytorch.py
@@ -0,0 +1,133 @@
import os
# Use the Hugging Face mirror endpoint to speed up model downloads
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader, Subset
from torchvision import datasets, transforms
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from transformers import (
    CLIPImageProcessor,
    AltCLIPVisionModel,
    AltCLIPVisionConfig,
    get_linear_schedule_with_warmup
)
from sklearn.metrics import accuracy_score, f1_score
from tqdm.auto import tqdm

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the pretrained model and image processor
model_name = "BAAI/AltCLIP"
image_processor = CLIPImageProcessor.from_pretrained(model_name)
vision_model = AltCLIPVisionModel.from_pretrained(model_name)
vision_config = AltCLIPVisionConfig.from_pretrained(model_name)
vision_model.to(device)

# Unfreeze all parameters for full fine-tuning
for param in vision_model.parameters():
    param.requires_grad = True

# Model with a linear classification head on top of the AltCLIP vision encoder
class ImageClassifier(torch.nn.Module):
    def __init__(self, vision_model, num_classes=10):
        super().__init__()
        self.vision_model = vision_model
        self.classifier = torch.nn.Linear(vision_config.hidden_size, num_classes)

    def forward(self, pixel_values):
        outputs = self.vision_model(pixel_values=pixel_values)
        pooled_output = outputs.pooler_output
        logits = self.classifier(pooled_output)
        return logits

model = ImageClassifier(vision_model, num_classes=10).to(device)

# Image preprocessing
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
])

# Load the dataset (train and validation splits only)
def load_cifar10_data(root='./data', batch_size=64):
    full_train_dataset = datasets.CIFAR10(root=root, train=True, download=True, transform=transform)
    indices = list(range(len(full_train_dataset)))
    train_indices, val_indices = train_test_split(indices, test_size=0.2, random_state=42)

    train_dataset = Subset(full_train_dataset, train_indices)
    val_dataset = Subset(full_train_dataset, val_indices)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

    return train_loader, val_loader

# Training + validation loop
def train_model(model, train_loader, val_loader, epochs=10):
    optimizer = AdamW(model.parameters(), lr=2e-5)
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")
        model.train()
        total_loss = 0

        for images, labels in tqdm(train_loader, desc="Training"):
            images, labels = images.to(device), labels.to(device)
            logits = model(images)
            loss = torch.nn.functional.cross_entropy(logits, labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Train Loss: {avg_train_loss:.4f}")

        model.eval()
        all_preds, all_labels = [], []
        val_loss = 0

        with torch.no_grad():
            for images, labels in tqdm(val_loader, desc="Validation"):
                images, labels = images.to(device), labels.to(device)
                logits = model(images)
                loss = torch.nn.functional.cross_entropy(logits, labels)
                val_loss += loss.item()

                preds = torch.argmax(logits, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = val_loss / len(val_loader)
        val_acc = accuracy_score(all_labels, all_preds)
        val_f1 = f1_score(all_labels, all_preds, average='macro')
        print(f"Val Loss: {avg_val_loss:.4f}, Accuracy: {val_acc:.4f}, F1 Score: {val_f1:.4f}")

    return model

# Entry point
def main():
    print("Loading the CIFAR-10 dataset...")
    train_loader, val_loader = load_cifar10_data(batch_size=32)

    print("Training the AltCLIP image classification model...")
    trained_model = train_model(model, train_loader, val_loader, epochs=10)

    output_dir = "./cifar10_altclip_model"
    os.makedirs(output_dir, exist_ok=True)
    torch.save(trained_model.state_dict(), os.path.join(output_dir, "altclip_cifar10.pth"))
    print(f"Model saved to {output_dir}")

if __name__ == "__main__":
    main()
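
Note: neither script shows how to reload the fine-tuned weights. For the PyTorch checkpoint, a minimal inference sketch could look like the following; it assumes altclip_pytorch.py is on the import path (importing it re-creates the model, transform, and device at module level, while training only runs under the __main__ guard), and example.png is a placeholder image path.

import os
import torch
from PIL import Image

# Reuse the model, preprocessing, and device defined at module level in altclip_pytorch.py
from altclip_pytorch import model, transform, device

# Load the fine-tuned weights saved by main()
state_dict = torch.load(os.path.join("./cifar10_altclip_model", "altclip_cifar10.pth"), map_location=device)
model.load_state_dict(state_dict)
model.eval()

# Classify a single image
image = Image.open("example.png").convert("RGB")
pixel_values = transform(image).unsqueeze(0).to(device)  # add a batch dimension
with torch.no_grad():
    pred = model(pixel_values).argmax(dim=1).item()
print(f"Predicted CIFAR-10 class index: {pred}")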