diff --git a/.gitignore b/.gitignore
index 4d96fbb..d0176a9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,3 +18,6 @@ __pycache__
 
 # ignore .idea folder
 .idea
+
+# pycaret log
+*.log
diff --git a/No Caffeine No Gain.pdf b/No Caffeine No Gain.pdf
new file mode 100644
index 0000000..c0c7e78
Binary files /dev/null and b/No Caffeine No Gain.pdf differ
diff --git a/args.py b/args.py
index 45aecc0..37bf28e 100644
--- a/args.py
+++ b/args.py
@@ -71,7 +71,11 @@ def parse_args(mode='train'):
     # Pseudo Labeling
     parser.add_argument('--use_pseudo', default=False, type=bool, help='Using Pseudo labeling')
    parser.add_argument('--pseudo_label_file', default='', type=str, help='file path for pseudo labeling')
-    
+
+    # Finetuning
+    parser.add_argument('--use_finetune', default=False, type=bool, help='Using Fine Tuning')
+    parser.add_argument('--trained_model', default='/opt/ml/code/p4-dkt-no_caffeine_no_gain/models/re_pse_Bert_40_5/model_epoch7.pt', type=str, help='pretrained model path')
+
 
     # log
     parser.add_argument('--log_steps', default=50, type=int, help='print log per n steps')
diff --git a/baseline.ipynb b/baseline.ipynb
deleted file mode 100644
index 6e2541f..0000000
--- a/baseline.ipynb
+++ /dev/null
@@ -1,1326 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "Nv5EvIVPnz0y"
-   },
-   "source": [
-    "# Baseline using an LSTM"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!pip install easydict"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {
-    "id": "wtJhitPznz06"
-   },
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "import os\n",
-    "import torch\n",
-    "import easydict\n",
-    "import numpy as np\n",
-    "from sklearn.preprocessing import LabelEncoder\n",
-    "import time\n",
-    "import datetime\n",
-    "from datetime import datetime\n",
-    "import random\n",
-    "import wandb"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "6w3E-ACunz07"
-   },
-   "source": [
-    "## 1. Data loading and preprocessing components"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 224,
-   "metadata": {
-    "id": "od9O-ttAnz08"
-   },
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "from datetime import datetime\n",
-    "import time\n",
-    "import tqdm\n",
-    "import pandas as pd\n",
-    "import random\n",
-    "from sklearn.preprocessing import LabelEncoder\n",
-    "import numpy as np\n",
-    "import torch\n",
-    "\n",
-    "class Preprocess:\n",
-    "    def __init__(self, args):\n",
-    "        self.args = args\n",
-    "        self.train_data = None\n",
-    "        self.test_data = None\n",
-    "\n",
-    "    def get_train_data(self):\n",
-    "        return self.train_data\n",
-    "\n",
-    "    def get_test_data(self):\n",
-    "        return self.test_data\n",
-    "\n",
-    "    def split_data(self, data, ratio=0.7, shuffle=True, seed=0):\n",
-    "        \"\"\"\n",
-    "        Split data into two parts with a given ratio.\n",
-    "        \"\"\"\n",
-    "        if shuffle:\n",
-    "            random.seed(seed)  # fix to default seed 0\n",
-    "            random.shuffle(data)\n",
-    "\n",
-    "        size = int(len(data) * ratio)\n",
-    "        data_1 = data[:size]\n",
-    "        data_2 = data[size:]\n",
-    "\n",
-    "        return data_1, data_2\n",
-    "\n",
-    "    def __save_labels(self, encoder, name):\n",
-    "        le_path = os.path.join(self.args.asset_dir, name + '_classes.npy')\n",
-    "        np.save(le_path, encoder.classes_)\n",
-    "\n",
-    "    def __preprocessing(self, df, is_train=True):\n",
-    "        cate_cols = ['assessmentItemID', 'testId', 'KnowledgeTag']\n",
-    "\n",
-    "        if not os.path.exists(self.args.asset_dir):\n",
-    "            os.makedirs(self.args.asset_dir)\n",
-    "\n",
-    "        for col in cate_cols:\n",
-    "            le = LabelEncoder()\n",
-    "            if is_train:\n",
-    "                # for the UNKNOWN class\n",
-    "                a = df[col].unique().tolist() + ['unknown']\n",
-    "                le.fit(a)\n",
-    "                self.__save_labels(le, col)\n",
-    "            else:\n",
-    "                label_path = os.path.join(self.args.asset_dir, col + '_classes.npy')\n",
-    "                le.classes_ = np.load(label_path)\n",
-    "\n",
-    "                df[col] = df[col].apply(lambda x: x if x in le.classes_ else 'unknown')\n",
-    "\n",
-    "            # assume every column is categorical\n",
-    "            df[col] = df[col].astype(str)\n",
-    "            test = le.transform(df[col])\n",
-    "            df[col] = test\n",
-    "\n",
-    "        def convert_time(s):\n",
-    "            timestamp = time.mktime(datetime.strptime(s, '%Y-%m-%d %H:%M:%S').timetuple())\n",
-    "            return int(timestamp)\n",
-    "\n",
-    "        df['Timestamp'] = df['Timestamp'].apply(convert_time)\n",
-    "\n",
-    "        return df\n",
-    "\n",
-    "    def __feature_engineering(self, df):\n",
-    "        # TODO\n",
-    "        return df\n",
-    "\n",
-    "    def load_data_from_file(self, file_name, is_train=True):\n",
-    "        csv_file_path = os.path.join(self.args.data_dir, file_name)\n",
-    "        df = pd.read_csv(csv_file_path)  # , nrows=100000)\n",
-    "        df = self.__feature_engineering(df)\n",
-    "        # df = self.__preprocessing(df, is_train)\n",
-    "\n",
-    "        # used later to decide the input size of each embedding layer when embedding features\n",
-    "        self.args.n_questions = len(np.load(os.path.join(self.args.asset_dir, 'assessmentItemID_classes.npy')))\n",
-    "        self.args.n_test = len(np.load(os.path.join(self.args.asset_dir, 'testId_classes.npy')))\n",
-    "        self.args.n_tag = len(np.load(os.path.join(self.args.asset_dir, 'KnowledgeTag_classes.npy')))\n",
-    "\n",
-    "        df = df.sort_values(by=['userID', 'Timestamp'], axis=0)\n",
-    "        columns = ['userID', 'assessmentItemID', 'testId', 'answerCode', 'KnowledgeTag']\n",
-    "        group = df[columns].groupby('userID').apply(\n",
-    "            lambda r: (\n",
-    "                r['testId'].values,\n",
-    "                r['assessmentItemID'].values,\n",
-    "                r['KnowledgeTag'].values,\n",
-    "                r['answerCode'].values\n",
-    "            )\n",
-    "        )\n",
-    "\n",
-    "        return df[columns]\n",
-    "\n",
-    "    def load_train_data(self, file_name):\n",
-    "        self.train_data = self.load_data_from_file(file_name)\n",
-    "\n",
-    "    def load_test_data(self, file_name):\n",
-    "        self.test_data = self.load_data_from_file(file_name, is_train=False)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "E-MQhPevnz08"
-   },
-   "source": [
-    "## 2. Dataset / DataLoader"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-    "id": "h29rn8YNnz09"
-   },
-   "outputs": [],
-   "source": [
-    "class DKTDataset(torch.utils.data.Dataset):\n",
-    "    def __init__(self, data, args):\n",
-    "        self.data = data\n",
-    "        self.args = args\n",
-    "\n",
-    "    def __getitem__(self, index):\n",
-    "        row = self.data[index]\n",
-    "\n",
-    "        # sequence length of each sample\n",
-    "        seq_len = len(row[0])\n",
-    "\n",
-    "        test, question, tag, correct = row[0], row[1], row[2], row[3]\n",
-    "\n",
-    "        cate_cols = [test, question, tag, correct]\n",
-    "\n",
-    "        # truncate sequences longer than max_seq_len; otherwise leave them as they are\n",
-    "        if seq_len > self.args.max_seq_len:\n",
-    "            for i, col in enumerate(cate_cols):\n",
-    "                cate_cols[i] = col[-self.args.max_seq_len:]\n",
-    "            mask = np.ones(self.args.max_seq_len, dtype=np.int16)\n",
-    "        else:\n",
-    "            mask = np.zeros(self.args.max_seq_len, dtype=np.int16)\n",
-    "            mask[-seq_len:] = 1\n",
-    "\n",
-    "        # include the mask in the list of columns\n",
-    "        cate_cols.append(mask)\n",
-    "\n",
-    "        # convert np.array -> torch.tensor\n",
-    "        for i, col in enumerate(cate_cols):\n",
-    "            cate_cols[i] = torch.tensor(col)\n",
-    "\n",
-    "        return cate_cols\n",
-    "\n",
-    "    def __len__(self):\n",
-    "        return len(self.data)\n",
-    "\n",
-    "\n",
-    "def collate(batch):\n",
-    "    col_n = len(batch[0])\n",
-    "    col_list = [[] for _ in range(col_n)]\n",
-    "    max_seq_len = len(batch[0][-1])\n",
-    "\n",
-    "    # group the values in the batch by column\n",
-    "    for row in batch:\n",
-    "        for i, col in enumerate(row):\n",
-    "            pre_padded = torch.zeros(max_seq_len)\n",
-    "            pre_padded[-len(col):] = col\n",
-    "            col_list[i].append(pre_padded)\n",
-    "\n",
-    "    for i, _ in enumerate(col_list):\n",
-    "        col_list[i] = torch.stack(col_list[i])\n",
-    "\n",
-    "    return tuple(col_list)\n",
-    "\n",
-    "\n",
-    "def get_loaders(args, train, valid):\n",
-    "\n",
-    "    pin_memory = False\n",
-    "    train_loader, valid_loader = None, None\n",
-    "\n",
-    "    if train is not None:\n",
-    "        trainset = DKTDataset(train, args)\n",
-    "        train_loader = torch.utils.data.DataLoader(trainset, num_workers=args.num_workers, shuffle=True,\n",
-    "                                                   batch_size=args.batch_size, pin_memory=pin_memory, collate_fn=collate)\n",
-    "    if valid is not None:\n",
-    "        valset = DKTDataset(valid, args)\n",
-    "        valid_loader = torch.utils.data.DataLoader(valset, num_workers=args.num_workers, shuffle=False,\n",
-    "                                                   batch_size=args.batch_size, pin_memory=pin_memory, collate_fn=collate)\n",
-    "\n",
-    "    return train_loader, valid_loader"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "QyiplxY6nz0-"
-   },
-   "source": [
-    "## 3. LSTM-based model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {
-    "id": "aO72oKAgnz0-"
-   },
-   "outputs": [],
-   "source": [
-    "import torch\n",
-    "import torch.nn as nn\n",
-    "import torch.nn.functional as F\n",
-    "import numpy as np\n",
-    "import copy\n",
-    "import math\n",
-    "\n",
-    "try:\n",
-    "    from transformers.modeling_bert import BertConfig, BertEncoder, BertModel\n",
-    "except:\n",
-    "    from transformers.models.bert.modeling_bert import BertConfig, BertEncoder, BertModel\n",
-    "\n",
-    "\n",
-    "class LSTM(nn.Module):\n",
-    "\n",
-    "    def __init__(self, args):\n",
-    "        super(LSTM, self).__init__()\n",
-    "        self.args = args\n",
-    "        self.device = args.device\n",
-    "\n",
-    "        self.hidden_dim = self.args.hidden_dim\n",
-    "        self.n_layers = self.args.n_layers\n",
-    "\n",
-    "        # Embedding\n",
-    "        # interaction is currently built from correct: correct (1, 2) + padding (0)\n",
-    "        self.embedding_interaction = nn.Embedding(3, self.hidden_dim//3)\n",
-    "        self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim//3)\n",
-    "        self.embedding_question = nn.Embedding(self.args.n_questions + 1, self.hidden_dim//3)\n",
-    "        self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim//3)\n",
-    "\n",
-    "        # embedding combination projection\n",
-    "        self.comb_proj = nn.Linear((self.hidden_dim//3)*4, self.hidden_dim)\n",
-    "\n",
-    "        self.lstm = nn.LSTM(self.hidden_dim,\n",
-    "                            self.hidden_dim,\n",
-    "                            self.n_layers,\n",
-    "                            batch_first=True)\n",
-    "\n",
-    "        # Fully connected layer\n",
-    "        self.fc = nn.Linear(self.hidden_dim, 1)\n",
-    "\n",
-    "        self.activation = nn.Sigmoid()\n",
-    "\n",
-    "    def init_hidden(self, batch_size):\n",
-    "        h = torch.zeros(\n",
-    "            self.n_layers,\n",
-    "            batch_size,\n",
-    "            self.hidden_dim)\n",
-    "        h = h.to(self.device)\n",
-    "\n",
-    "        c = torch.zeros(\n",
-    "            self.n_layers,\n",
-    "            batch_size,\n",
-    "            self.hidden_dim)\n",
-    "        c = c.to(self.device)\n",
-    "\n",
-    "        return (h, c)\n",
-    "\n",
-    "    def forward(self, input):\n",
-    "\n",
-    "        test, question, tag, _, mask, interaction, _ = input\n",
-    "\n",
-    "        batch_size = interaction.size(0)\n",
-    "\n",
-    "        # Embedding\n",
-    "        embed_interaction = self.embedding_interaction(interaction)\n",
-    "        embed_test = self.embedding_test(test)\n",
-    "        embed_question = self.embedding_question(question)\n",
-    "        embed_tag = self.embedding_tag(tag)\n",
-    "\n",
-    "        embed = torch.cat([embed_interaction,\n",
-    "                           embed_test,\n",
-    "                           embed_question,\n",
-    "                           embed_tag,], 2)\n",
-    "\n",
-    "        X = self.comb_proj(embed)\n",
-    "\n",
-    "        hidden = self.init_hidden(batch_size)\n",
-    "        out, hidden = self.lstm(X, hidden)\n",
-    "        out = out.contiguous().view(batch_size, -1, self.hidden_dim)\n",
-    "\n",
-    "        out = self.fc(out)\n",
-    "        preds = self.activation(out).view(batch_size, -1)\n",
-    "\n",
-    "        return preds"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "NEaAa6Prnz0_"
-   },
-   "source": [
-    "## 4. Functions for model training"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {
-    "id": "r_wU37QGnz0_"
-   },
-   "outputs": [],
-   "source": [
-    "import os, sys\n",
-    "\n",
-    "import numpy as np\n",
-    "\n",
-    "import tarfile\n",
-    "import torch\n",
-    "from torch import nn\n",
-    "import torch.nn.functional as F\n",
-    "from torch.optim import Adam, AdamW\n",
-    "\n",
-    "from torch.optim.lr_scheduler import ReduceLROnPlateau\n",
-    "\n",
-    "from transformers import get_linear_schedule_with_warmup\n",
-    "from transformers import get_cosine_schedule_with_warmup\n",
-    "\n",
-    "from sklearn.metrics import roc_auc_score\n",
-    "from sklearn.metrics import accuracy_score\n",
-    "import scipy.stats\n",
-    "\n",
-    "\n",
-    "# setup needed for training\n",
-    "def get_optimizer(model, args):\n",
-    "    if args.optimizer == 'adam':\n",
-    "        optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=0.01)\n",
-    "    if args.optimizer == 'adamW':\n",
-    "        optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=0.01)\n",
-    "\n",
-    "    # reset the gradients of all parameters to zero\n",
-    "    optimizer.zero_grad()\n",
-    "\n",
-    "    return optimizer\n",
-    "\n",
-    "def get_scheduler(optimizer, args):\n",
-    "    if args.scheduler == 'plateau':\n",
-    "        scheduler = ReduceLROnPlateau(optimizer, patience=10, factor=0.5, mode='max', verbose=True)\n",
-    "    elif args.scheduler == 'linear_warmup':\n",
-    "        scheduler = get_linear_schedule_with_warmup(optimizer,\n",
-    "                                                    num_warmup_steps=args.warmup_steps,\n",
-    "                                                    num_training_steps=args.total_steps)\n",
-    "    return scheduler\n",
-    "\n",
-    "def get_criterion(pred, target):\n",
-    "    loss = nn.BCELoss(reduction=\"none\")\n",
-    "    return loss(pred, target)\n",
-    "\n",
-    "def get_metric(targets, preds):\n",
-    "    auc = roc_auc_score(targets, preds)\n",
-    "    acc = accuracy_score(targets, np.where(preds >= 0.5, 1, 0))\n",
-    "\n",
-    "    return auc, acc\n",
-    "\n",
-    "def get_model(args):\n",
-    "    \"\"\"\n",
-    "    Load the model and move tensors to the given device.\n",
-    "    \"\"\"\n",
-    "    if args.model == 'lstm': model = LSTM(args)\n",
-    "\n",
-    "    model.to(args.device)\n",
-    "\n",
-    "    return model\n",
-    "\n",
-    "\n",
-    "# batch preprocessing\n",
-    "def process_batch(batch, args):\n",
-    "\n",
-    "    test, question, tag, correct, mask = batch\n",
-    "\n",
-    "    # change to float\n",
-    "    mask = mask.type(torch.FloatTensor)\n",
-    "    correct = correct.type(torch.FloatTensor)\n",
-    "\n",
-    "    # for now, use correct shifted one step to the right as interaction\n",
-    "    # for SAINT this is the decoder input\n",
-    "    interaction = correct + 1  # add 1 to correct to make room for padding\n",
-    "    interaction = interaction.roll(shifts=1, dims=1)\n",
-    "    interaction[:, 0] = 0  # set padding index to the first sequence\n",
-    "    interaction = (interaction * mask).to(torch.int64)\n",
-    "    # print(interaction)\n",
-    "    # exit()\n",
-    "    # test_id, question_id, tag\n",
-    "    test = ((test + 1) * mask).to(torch.int64)\n",
-    "    question = ((question + 1) * mask).to(torch.int64)\n",
-    "    tag = ((tag + 1) * mask).to(torch.int64)\n",
-    "\n",
-    "    # gather index\n",
-    "    # index used to take only the last step of each sequence\n",
-    "    gather_index = torch.tensor(np.count_nonzero(mask, axis=1))\n",
-    "    gather_index = gather_index.view(-1, 1) - 1\n",
-    "\n",
-    "    # move to device memory\n",
-    "    test = test.to(args.device)\n",
-    "    question = question.to(args.device)\n",
-    "    tag = tag.to(args.device)\n",
-    "    correct = correct.to(args.device)\n",
-    "    mask = mask.to(args.device)\n",
-    "\n",
-    "    interaction = interaction.to(args.device)\n",
-    "    gather_index = gather_index.to(args.device)\n",
-    "\n",
-    "    return (test, question,\n",
-    "            tag, correct, mask,\n",
-    "            interaction, gather_index)\n",
-    "\n",
-    "\n",
-    "# compute the loss and update the parameters!\n",
-    "def compute_loss(preds, targets):\n",
-    "    \"\"\"\n",
-    "    Args :\n",
-    "        preds   : (batch_size, max_seq_len)\n",
-    "        targets : (batch_size, max_seq_len)\n",
-    "    \"\"\"\n",
-    "    loss = get_criterion(preds, targets)\n",
-    "    # compute the loss only on the last step of each sequence\n",
-    "    loss = loss[:, -1]\n",
-    "    loss = torch.mean(loss)\n",
-    "    return loss\n",
-    "\n",
-    "def update_params(loss, model, optimizer, args):\n",
-    "    loss.backward()\n",
-    "    torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)\n",
-    "    optimizer.step()\n",
-    "    optimizer.zero_grad()\n",
-    "\n",
-    "\n",
-    "def save_checkpoint(state, model_dir, model_filename):\n",
-    "    print('saving model ...')\n",
-    "    if not os.path.exists(model_dir):\n",
-    "        os.makedirs(model_dir)\n",
-    "    torch.save(state, os.path.join(model_dir, model_filename))\n",
-    "\n",
-    "\n",
-    "def load_model(args):\n",
-    "\n",
-    "    model_path = os.path.join(args.model_dir, args.model_name)\n",
-    "    print(\"Loading Model from:\", model_path)\n",
-    "    load_state = torch.load(model_path)\n",
-    "    model = get_model(args)\n",
-    "\n",
-    "    # 1. load model state\n",
-    "    model.load_state_dict(load_state['state_dict'], strict=True)\n",
-    "\n",
-    "    print(\"Loading Model from:\", model_path, \"...Finished.\")\n",
-    "    return model"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "YO_xFaJYnz1B"
-   },
-   "source": [
-    "## 5. Functions that drive the overall process"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {
-    "id": "BMiIOHgJnz1D"
-   },
-   "outputs": [],
-   "source": [
-    "def run(args, train_data, valid_data):\n",
-    "    train_loader, valid_loader = get_loaders(args, train_data, valid_data)\n",
-    "\n",
-    "    # only when using warmup scheduler\n",
-    "    args.total_steps = int(len(train_loader.dataset) / args.batch_size) * (args.n_epochs)\n",
-    "    args.warmup_steps = args.total_steps // 10\n",
-    "\n",
-    "    model = get_model(args)\n",
-    "    optimizer = get_optimizer(model, args)\n",
-    "    scheduler = get_scheduler(optimizer, args)\n",
-    "\n",
-    "    best_auc = -1\n",
-    "    early_stopping_counter = 0\n",
-    "    for epoch in range(args.n_epochs):\n",
-    "\n",
-    "        print(f\"Start Training: Epoch {epoch + 1}\")\n",
-    "\n",
-    "        ### TRAIN\n",
-    "        train_auc, train_acc, train_loss = train(train_loader, model, optimizer, args)\n",
-    "\n",
-    "        ### VALID\n",
-    "        auc, acc, _, _ = validate(valid_loader, model, args)\n",
-    "\n",
-    "        ### TODO: model save or early stopping\n",
-    "        wandb.log({\"epoch\": epoch, \"train_loss\": train_loss, \"train_auc\": train_auc, \"train_acc\": train_acc,\n",
-    "                   \"valid_auc\": auc, \"valid_acc\": acc})\n",
-    "        if auc > best_auc:\n",
-    "            best_auc = auc\n",
-    "            # if wrapped in torch.nn.DataParallel, unwrap to get the original model\n",
-    "            model_to_save = model.module if hasattr(model, 'module') else model\n",
-    "            save_checkpoint({\n",
-    "                'epoch': epoch + 1,\n",
-    "                'state_dict': model_to_save.state_dict(),\n",
-    "                },\n",
-    "                args.model_dir, 'model.pt',\n",
-    "            )\n",
-    "            early_stopping_counter = 0\n",
-    "        else:\n",
-    "            early_stopping_counter += 1\n",
-    "            if early_stopping_counter >= args.patience:\n",
-    "                print(f'EarlyStopping counter: {early_stopping_counter} out of {args.patience}')\n",
-    "                break\n",
-    "\n",
-    "        # scheduler\n",
-    "        if args.scheduler == 'plateau':\n",
-    "            scheduler.step(best_auc)\n",
-    "        else:\n",
-    "            scheduler.step()\n",
-    "\n",
-    "\n",
-    "def train(train_loader, model, optimizer, args):\n",
-    "    model.train()\n",
-    "\n",
-    "    total_preds = []\n",
-    "    total_targets = []\n",
-    "    losses = []\n",
-    "    for step, batch in enumerate(train_loader):\n",
-    "        input = process_batch(batch, args)\n",
-    "        preds = model(input)\n",
-    "        targets = input[3]  # correct\n",
-    "\n",
-    "        loss = compute_loss(preds, targets)\n",
-    "        update_params(loss, model, optimizer, args)\n",
-    "\n",
-    "        if step % args.log_steps == 0:\n",
-    "            print(f\"Training steps: {step} Loss: {str(loss.item())}\")\n",
-    "\n",
-    "        # predictions\n",
-    "        preds = preds[:, -1]\n",
-    "        targets = targets[:, -1]\n",
-    "\n",
-    "        if args.device == 'cuda':\n",
-    "            preds = preds.to('cpu').detach().numpy()\n",
-    "            targets = targets.to('cpu').detach().numpy()\n",
-    "        else:  # cpu\n",
-    "            preds = preds.detach().numpy()\n",
-    "            targets = targets.detach().numpy()\n",
-    "\n",
-    "        total_preds.append(preds)\n",
-    "        total_targets.append(targets)\n",
-    "        losses.append(loss)\n",
-    "\n",
-    "    total_preds = np.concatenate(total_preds)\n",
-    "    total_targets = np.concatenate(total_targets)\n",
-    "\n",
-    "    # Train AUC / ACC\n",
-    "    auc, acc = get_metric(total_targets, total_preds)\n",
-    "    loss_avg = sum(losses)/len(losses)\n",
-    "    print(f'TRAIN AUC : {auc} ACC : {acc}')\n",
-    "    return auc, acc, loss_avg\n",
-    "\n",
-    "\n",
-    "def validate(valid_loader, model, args):\n",
-    "    model.eval()\n",
-    "\n",
-    "    total_preds = []\n",
-    "    total_targets = []\n",
-    "    for step, batch in enumerate(valid_loader):\n",
-    "        input = process_batch(batch, args)\n",
-    "\n",
-    "        preds = model(input)\n",
-    "        targets = input[3]  # correct\n",
-    "\n",
-    "        # predictions\n",
-    "        preds = preds[:, -1]\n",
-    "        targets = targets[:, -1]\n",
-    "\n",
-    "        if args.device == 'cuda':\n",
-    "            preds = preds.to('cpu').detach().numpy()\n",
-    "            targets = targets.to('cpu').detach().numpy()\n",
-    "        else:  # cpu\n",
-    "            preds = preds.detach().numpy()\n",
-    "            targets = targets.detach().numpy()\n",
-    "\n",
-    "        total_preds.append(preds)\n",
-    "        total_targets.append(targets)\n",
-    "\n",
-    "    total_preds = np.concatenate(total_preds)\n",
-    "    total_targets = np.concatenate(total_targets)\n",
-    "\n",
-    "    # Valid AUC / ACC\n",
-    "    auc, acc = get_metric(total_targets, total_preds)\n",
-    "\n",
-    "    print(f'VALID AUC : {auc} ACC : {acc}\\\\n')\n",
-    "\n",
-    "    return auc, acc, total_preds, total_targets\n",
-    "\n",
-    "\n",
-    "def inference(args, test_data):\n",
-    "\n",
-    "    model = load_model(args)\n",
-    "    model.eval()\n",
-    "    _, test_loader = get_loaders(args, None, test_data)\n",
-    "\n",
-    "    total_preds = []\n",
-    "\n",
-    "    for step, batch in enumerate(test_loader):\n",
-    "        input = process_batch(batch, args)\n",
-    "\n",
-    "        preds = model(input)\n",
-    "\n",
-    "        # predictions\n",
-    "        preds = preds[:, -1]\n",
-    "\n",
-    "        if args.device == 'cuda':\n",
-    "            preds = preds.to('cpu').detach().numpy()\n",
-    "        else:  # cpu\n",
-    "            preds = preds.detach().numpy()\n",
-    "\n",
-    "        total_preds += list(preds)\n",
-    "\n",
-    "    write_path = os.path.join(args.output_dir, \"output.csv\")\n",
-    "    if not os.path.exists(args.output_dir):\n",
-    "        os.makedirs(args.output_dir)\n",
-    "    with open(write_path, 'w', encoding='utf8') as w:\n",
-    "        print(\"writing prediction : {}\".format(write_path))\n",
-    "        w.write(\"id,prediction\\\\n\")\n",
-    "        for id, p in enumerate(total_preds):\n",
-    "            w.write('{},{}\\\\n'.format(id, p))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "gPEE00qUnz1E"
-   },
-   "source": [
-    "## 6. Execution"
-   ]
-  },
"execution_count": 94, - "metadata": { - "id": "qZmwQenqnz1E" - }, - "outputs": [], - "source": [ - "data_dir = '/opt/ml/input/data/train_dataset'\n", - "file_name = 'train_data.csv'\n", - "test_file_name = 'test_data.csv'\n", - "\n", - "config = {}\n", - "\n", - "# 설정\n", - "config['seed'] = 42\n", - "config['device'] = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", - "config['data_dir'] = data_dir\n", - "config['asset_dir'] = 'asset'\n", - "config['model_dir'] = 'models'\n", - "config['model_name'] = 'model.pt'\n", - "config['output_dir'] = 'output'\n", - "\n", - "# 데이터\n", - "config['max_seq_len'] = 20\n", - "config['num_workers'] = 1\n", - "\n", - "\n", - "# 모델\n", - "config['hidden_dim'] = 64\n", - "config['n_layers'] = 2\n", - "config['dropout'] = 0.2\n", - "\n", - "# 훈련\n", - "config['n_epochs'] = 20\n", - "config['batch_size'] = 64\n", - "config['lr'] = 0.0001\n", - "config['clip_grad'] = 10\n", - "config['log_steps'] = 50\n", - "config['patience'] = 5\n", - "\n", - "\n", - "\n", - "### 중요 ###\n", - "config['model'] = 'lstm'\n", - "config['optimizer'] = 'adam'\n", - "config['scheduler'] = 'plateau'\n", - "\n", - "\n", - "args = easydict.EasyDict(config)" - ] - }, - { - "cell_type": "code", - "execution_count": 95, - "metadata": {}, - "outputs": [], - "source": [ - "def setSeeds(seed = 42):\n", - " # 랜덤 시드를 설정하여 매 코드를 실행할 때마다 동일한 결과를 얻게 합니다.\n", - " os.environ['PYTHONHASHSEED'] = str(seed)\n", - " random.seed(seed)\n", - " np.random.seed(seed)\n", - " torch.manual_seed(seed) \n", - " torch.cuda.manual_seed(seed)\n", - " torch.backends.cudnn.deterministic = True" - ] - }, - { - "cell_type": "code", - "execution_count": 225, - "metadata": { - "id": "rNaRoFrLnz1E" - }, - "outputs": [], - "source": [ - "setSeeds(42)\n", - "\n", - "preprocess = Preprocess(args)\n", - "preprocess.load_train_data(file_name)\n", - "\n", - "train_data = preprocess.get_train_data()\n", - "# train_data, valid_data = preprocess.split_data(train_data)" - ] - }, - { - "cell_type": "code", - "execution_count": 226, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - " | userID | \n", - "assessmentItemID | \n", - "testId | \n", - "answerCode | \n", - "KnowledgeTag | \n", - "
---|---|---|---|---|---|
0 | \n", - "0 | \n", - "A060001001 | \n", - "A060000001 | \n", - "1 | \n", - "7224 | \n", - "
1 | \n", - "0 | \n", - "A060001002 | \n", - "A060000001 | \n", - "1 | \n", - "7225 | \n", - "
2 | \n", - "0 | \n", - "A060001003 | \n", - "A060000001 | \n", - "1 | \n", - "7225 | \n", - "
3 | \n", - "0 | \n", - "A060001004 | \n", - "A060000001 | \n", - "1 | \n", - "7225 | \n", - "
4 | \n", - "0 | \n", - "A060001005 | \n", - "A060000001 | \n", - "1 | \n", - "7225 | \n", - "
... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "
2266581 | \n", - "7441 | \n", - "A030071005 | \n", - "A030000071 | \n", - "0 | \n", - "438 | \n", - "
2266582 | \n", - "7441 | \n", - "A040165001 | \n", - "A040000165 | \n", - "1 | \n", - "8836 | \n", - "
2266583 | \n", - "7441 | \n", - "A040165002 | \n", - "A040000165 | \n", - "1 | \n", - "8836 | \n", - "
2266584 | \n", - "7441 | \n", - "A040165003 | \n", - "A040000165 | \n", - "1 | \n", - "8836 | \n", - "
2266585 | \n", - "7441 | \n", - "A040165004 | \n", - "A040000165 | \n", - "1 | \n", - "8836 | \n", - "
2266586 rows × 5 columns
\n", - "\n", - " | userID | \n", - "assessmentItemID | \n", - "testId | \n", - "KnowledgeTag | \n", - "
---|---|---|---|---|
0 | \n", - "0 | \n", - "5354 | \n", - "975 | \n", - "618 | \n", - "
1 | \n", - "0 | \n", - "5355 | \n", - "975 | \n", - "619 | \n", - "
2 | \n", - "0 | \n", - "5356 | \n", - "975 | \n", - "619 | \n", - "
3 | \n", - "0 | \n", - "5357 | \n", - "975 | \n", - "619 | \n", - "
4 | \n", - "0 | \n", - "5358 | \n", - "975 | \n", - "619 | \n", - "
... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "
2266581 | \n", - "7441 | \n", - "2373 | \n", - "456 | \n", - "375 | \n", - "
2266582 | \n", - "7441 | \n", - "3909 | \n", - "748 | \n", - "784 | \n", - "
2266583 | \n", - "7441 | \n", - "3910 | \n", - "748 | \n", - "784 | \n", - "
2266584 | \n", - "7441 | \n", - "3911 | \n", - "748 | \n", - "784 | \n", - "
2266585 | \n", - "7441 | \n", - "3912 | \n", - "748 | \n", - "784 | \n", - "
2266586 rows × 4 columns
\n", - "