diff --git a/.gitignore b/.gitignore index 4d96fbb..d0176a9 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,6 @@ __pycache__ # ignore .idea folder .idea + +# pycaret log +*.log diff --git a/No Caffeine No Gain.pdf b/No Caffeine No Gain.pdf new file mode 100644 index 0000000..c0c7e78 Binary files /dev/null and b/No Caffeine No Gain.pdf differ diff --git a/args.py b/args.py index 45aecc0..37bf28e 100644 --- a/args.py +++ b/args.py @@ -71,7 +71,11 @@ def parse_args(mode='train'): # Pseudo Labeling parser.add_argument('--use_pseudo', default=False, type=bool, help='Using Pseudo labeling') parser.add_argument('--pseudo_label_file', default='', type=str, help='file path for pseudo labeling') - + + # Finetuning + parser.add_argument('--use_finetune', default=False, type=bool, help='Using Fine Tuning') + parser.add_argument('--trained_model', default='/opt/ml/code/p4-dkt-no_caffeine_no_gain/models/re_pse_Bert_40_5/model_epoch7.pt', type=str, help='pretrained model path') + # log parser.add_argument('--log_steps', default=50, type=int, help='print log per n steps') diff --git a/baseline.ipynb b/baseline.ipynb deleted file mode 100644 index 6e2541f..0000000 --- a/baseline.ipynb +++ /dev/null @@ -1,1326 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "Nv5EvIVPnz0y" - }, - "source": [ - "# LSTM 활용한 베이스라인" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install easydict" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "wtJhitPznz06" - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import os\n", - "import torch\n", - "import easydict\n", - "import numpy as np\n", - "from sklearn.preprocessing import LabelEncoder\n", - "import time\n", - "import datetime\n", - "from datetime import datetime\n", - "import random\n", - "import wandb" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6w3E-ACunz07" - }, - "source": [ - "## 1. 데이터 로드 및 전처리 컴포넌트" - ] - }, - { - "cell_type": "code", - "execution_count": 224, - "metadata": { - "id": "od9O-ttAnz08" - }, - "outputs": [], - "source": [ - "import os\n", - "from datetime import datetime\n", - "import time\n", - "import tqdm\n", - "import pandas as pd\n", - "import random\n", - "from sklearn.preprocessing import LabelEncoder\n", - "import numpy as np\n", - "import torch\n", - "\n", - "class Preprocess:\n", - " def __init__(self,args):\n", - " self.args = args\n", - " self.train_data = None\n", - " self.test_data = None\n", - " \n", - "\n", - " def get_train_data(self):\n", - " return self.train_data\n", - "\n", - " def get_test_data(self):\n", - " return self.test_data\n", - "\n", - " def split_data(self, data, ratio=0.7, shuffle=True, seed=0):\n", - " \"\"\"\n", - " split data into two parts with a given ratio.\n", - " \"\"\"\n", - " if shuffle:\n", - " random.seed(seed) # fix to default seed 0\n", - " random.shuffle(data)\n", - "\n", - " size = int(len(data) * ratio)\n", - " data_1 = data[:size]\n", - " data_2 = data[size:]\n", - "\n", - " return data_1, data_2\n", - "\n", - " def __save_labels(self, encoder, name):\n", - " le_path = os.path.join(self.args.asset_dir, name + '_classes.npy')\n", - " np.save(le_path, encoder.classes_)\n", - "\n", - " def __preprocessing(self, df, is_train = True):\n", - " cate_cols = ['assessmentItemID', 'testId', 'KnowledgeTag']\n", - "\n", - " if not os.path.exists(self.args.asset_dir):\n", - " os.makedirs(self.args.asset_dir)\n", - " \n", - " for col in cate_cols:\n", - " \n", - " \n", - " le = LabelEncoder()\n", - " if is_train:\n", - " #For UNKNOWN class\n", - " a = df[col].unique().tolist() + ['unknown']\n", - " le.fit(a)\n", - " self.__save_labels(le, col)\n", - " else:\n", - " label_path = os.path.join(self.args.asset_dir,col+'_classes.npy')\n", - " le.classes_ = np.load(label_path)\n", - " \n", - " df[col] = df[col].apply(lambda x: x if x in le.classes_ else 'unknown')\n", - "\n", - " #모든 컬럼이 범주형이라고 가정\n", - " df[col]= df[col].astype(str)\n", - " test = le.transform(df[col])\n", - " df[col] = test\n", - " \n", - "\n", - " def convert_time(s):\n", - " timestamp = time.mktime(datetime.strptime(s, '%Y-%m-%d %H:%M:%S').timetuple())\n", - " return int(timestamp)\n", - "\n", - " df['Timestamp'] = df['Timestamp'].apply(convert_time)\n", - " \n", - " return df\n", - "\n", - " def __feature_engineering(self, df):\n", - " #TODO\n", - " return df\n", - "\n", - " def load_data_from_file(self, file_name, is_train=True):\n", - " csv_file_path = os.path.join(self.args.data_dir, file_name)\n", - " df = pd.read_csv(csv_file_path)#, nrows=100000)\n", - " df = self.__feature_engineering(df)\n", - " # df = self.__preprocessing(df, is_train)\n", - "\n", - " # 추후 feature를 embedding할 시에 embedding_layer의 input 크기를 결정할때 사용\n", - "\n", - " \n", - " self.args.n_questions = len(np.load(os.path.join(self.args.asset_dir,'assessmentItemID_classes.npy')))\n", - " self.args.n_test = len(np.load(os.path.join(self.args.asset_dir,'testId_classes.npy')))\n", - " self.args.n_tag = len(np.load(os.path.join(self.args.asset_dir,'KnowledgeTag_classes.npy')))\n", - " \n", - "\n", - "\n", - " df = df.sort_values(by=['userID','Timestamp'], axis=0)\n", - " columns = ['userID', 'assessmentItemID', 'testId', 'answerCode', 'KnowledgeTag']\n", - " group = df[columns].groupby('userID').apply(\n", - " lambda r: (\n", - " r['testId'].values, \n", - " r['assessmentItemID'].values,\n", - " r['KnowledgeTag'].values,\n", - " r['answerCode'].values\n", - " )\n", - " )\n", - "\n", - " return df[columns]\n", - "\n", - "\n", - " def load_train_data(self, file_name):\n", - " self.train_data = self.load_data_from_file(file_name)\n", - "\n", - " def load_test_data(self, file_name):\n", - " self.test_data = self.load_data_from_file(file_name, is_train= False)\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "E-MQhPevnz08" - }, - "source": [ - "## 2. 데이터 셋 / 데이터 로더" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "h29rn8YNnz09" - }, - "outputs": [], - "source": [ - "class DKTDataset(torch.utils.data.Dataset):\n", - " def __init__(self, data, args):\n", - " self.data = data\n", - " self.args = args\n", - "\n", - " def __getitem__(self, index):\n", - " row = self.data[index]\n", - "\n", - " # 각 data의 sequence length\n", - " seq_len = len(row[0])\n", - "\n", - " test, question, tag, correct = row[0], row[1], row[2], row[3]\n", - " \n", - "\n", - " cate_cols = [test, question, tag, correct]\n", - "\n", - " # max seq len을 고려하여서 이보다 길면 자르고 아닐 경우 그대로 냅둔다\n", - " if seq_len > self.args.max_seq_len:\n", - " for i, col in enumerate(cate_cols):\n", - " cate_cols[i] = col[-self.args.max_seq_len:]\n", - " mask = np.ones(self.args.max_seq_len, dtype=np.int16)\n", - " else:\n", - " mask = np.zeros(self.args.max_seq_len, dtype=np.int16)\n", - " mask[-seq_len:] = 1\n", - "\n", - " # mask도 columns 목록에 포함시킴\n", - " cate_cols.append(mask)\n", - "\n", - " # np.array -> torch.tensor 형변환\n", - " for i, col in enumerate(cate_cols):\n", - " cate_cols[i] = torch.tensor(col)\n", - "\n", - " return cate_cols\n", - "\n", - " def __len__(self):\n", - " return len(self.data)\n", - "\n", - "\n", - "\n", - "\n", - "def collate(batch):\n", - " col_n = len(batch[0])\n", - " col_list = [[] for _ in range(col_n)]\n", - " max_seq_len = len(batch[0][-1])\n", - "\n", - " \n", - " # batch의 값들을 각 column끼리 그룹화\n", - " for row in batch:\n", - " for i, col in enumerate(row):\n", - " pre_padded = torch.zeros(max_seq_len)\n", - " pre_padded[-len(col):] = col\n", - " col_list[i].append(pre_padded)\n", - "\n", - "\n", - " for i, _ in enumerate(col_list):\n", - " col_list[i] =torch.stack(col_list[i])\n", - " \n", - " return tuple(col_list)\n", - "\n", - "\n", - "def get_loaders(args, train, valid):\n", - "\n", - " pin_memory = False\n", - " train_loader, valid_loader = None, None\n", - " \n", - " if train is not None:\n", - " trainset = DKTDataset(train, args)\n", - " train_loader = torch.utils.data.DataLoader(trainset, num_workers=args.num_workers, shuffle=True,\n", - " batch_size=args.batch_size, pin_memory=pin_memory, collate_fn=collate)\n", - " if valid is not None:\n", - " valset = DKTDataset(valid, args)\n", - " valid_loader = torch.utils.data.DataLoader(valset, num_workers=args.num_workers, shuffle=False,\n", - " batch_size=args.batch_size, pin_memory=pin_memory, collate_fn=collate)\n", - "\n", - " return train_loader, valid_loader" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QyiplxY6nz0-" - }, - "source": [ - "## 3. LSTM 기반의 모델" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "aO72oKAgnz0-" - }, - "outputs": [], - "source": [ - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F \n", - "import numpy as np\n", - "import copy\n", - "import math\n", - "\n", - "try:\n", - " from transformers.modeling_bert import BertConfig, BertEncoder, BertModel \n", - "except:\n", - " from transformers.models.bert.modeling_bert import BertConfig, BertEncoder, BertModel \n", - "\n", - "\n", - "\n", - "\n", - "class LSTM(nn.Module):\n", - "\n", - " def __init__(self, args):\n", - " super(LSTM, self).__init__()\n", - " self.args = args\n", - " self.device = args.device\n", - "\n", - " self.hidden_dim = self.args.hidden_dim\n", - " self.n_layers = self.args.n_layers\n", - "\n", - " # Embedding \n", - " # interaction은 현재 correct로 구성되어있다. correct(1, 2) + padding(0)\n", - " self.embedding_interaction = nn.Embedding(3, self.hidden_dim//3)\n", - " self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim//3)\n", - " self.embedding_question = nn.Embedding(self.args.n_questions + 1, self.hidden_dim//3)\n", - " self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim//3)\n", - "\n", - " # embedding combination projection\n", - " self.comb_proj = nn.Linear((self.hidden_dim//3)*4, self.hidden_dim)\n", - "\n", - " self.lstm = nn.LSTM(self.hidden_dim,\n", - " self.hidden_dim,\n", - " self.n_layers,\n", - " batch_first=True)\n", - " \n", - " # Fully connected layer\n", - " self.fc = nn.Linear(self.hidden_dim, 1)\n", - "\n", - " self.activation = nn.Sigmoid()\n", - "\n", - " def init_hidden(self, batch_size):\n", - " h = torch.zeros(\n", - " self.n_layers,\n", - " batch_size,\n", - " self.hidden_dim)\n", - " h = h.to(self.device)\n", - "\n", - " c = torch.zeros(\n", - " self.n_layers,\n", - " batch_size,\n", - " self.hidden_dim)\n", - " c = c.to(self.device)\n", - "\n", - " return (h, c)\n", - "\n", - " def forward(self, input):\n", - "\n", - " test, question, tag, _, mask, interaction, _ = input\n", - "\n", - " batch_size = interaction.size(0)\n", - "\n", - " # Embedding\n", - "\n", - " embed_interaction = self.embedding_interaction(interaction)\n", - " embed_test = self.embedding_test(test)\n", - " embed_question = self.embedding_question(question)\n", - " embed_tag = self.embedding_tag(tag)\n", - " \n", - "\n", - " embed = torch.cat([embed_interaction,\n", - " embed_test,\n", - " embed_question,\n", - " embed_tag,], 2)\n", - "\n", - " X = self.comb_proj(embed)\n", - "\n", - " hidden = self.init_hidden(batch_size)\n", - " out, hidden = self.lstm(X, hidden)\n", - " out = out.contiguous().view(batch_size, -1, self.hidden_dim)\n", - "\n", - " out = self.fc(out)\n", - " preds = self.activation(out).view(batch_size, -1)\n", - "\n", - " return preds\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NEaAa6Prnz0_" - }, - "source": [ - "## 4. 모델 훈련을 위한 함수들" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "r_wU37QGnz0_" - }, - "outputs": [], - "source": [ - "import os, sys\n", - "\n", - "import numpy as np\n", - "\n", - "import tarfile\n", - "import torch\n", - "from torch import nn\n", - "import torch.nn.functional as F\n", - "from torch.optim import Adam, AdamW\n", - "\n", - "from torch.optim.lr_scheduler import ReduceLROnPlateau\n", - "\n", - "from transformers import get_linear_schedule_with_warmup\n", - "from transformers import get_cosine_schedule_with_warmup\n", - "\n", - "from sklearn.metrics import roc_auc_score\n", - "from sklearn.metrics import accuracy_score\n", - "import scipy.stats\n", - "\n", - "\n", - "# 훈련을 하기 위한 세팅\n", - "def get_optimizer(model, args):\n", - " if args.optimizer == 'adam':\n", - " optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=0.01)\n", - " if args.optimizer == 'adamW':\n", - " optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=0.01)\n", - " \n", - " # 모든 parameter들의 grad값을 0으로 초기화\n", - " optimizer.zero_grad()\n", - " \n", - " return optimizer\n", - "\n", - "def get_scheduler(optimizer, args):\n", - " if args.scheduler == 'plateau':\n", - " scheduler = ReduceLROnPlateau(optimizer, patience=10, factor=0.5, mode='max', verbose=True)\n", - " elif args.scheduler == 'linear_warmup':\n", - " scheduler = get_linear_schedule_with_warmup(optimizer,\n", - " num_warmup_steps=args.warmup_steps,\n", - " num_training_steps=args.total_steps)\n", - " return scheduler\n", - "\n", - "def get_criterion(pred, target):\n", - " loss = nn.BCELoss(reduction=\"none\")\n", - " return loss(pred, target)\n", - "\n", - "def get_metric(targets, preds):\n", - " auc = roc_auc_score(targets, preds)\n", - " acc = accuracy_score(targets, np.where(preds >= 0.5, 1, 0))\n", - "\n", - " return auc, acc\n", - "\n", - "def get_model(args):\n", - " \"\"\"\n", - " Load model and move tensors to a given devices.\n", - " \"\"\"\n", - " if args.model == 'lstm': model = LSTM(args)\n", - " \n", - "\n", - " model.to(args.device)\n", - "\n", - " return model\n", - "\n", - "\n", - "# 배치 전처리\n", - "def process_batch(batch, args):\n", - "\n", - " test, question, tag, correct, mask = batch\n", - " \n", - " \n", - " # change to float\n", - " mask = mask.type(torch.FloatTensor)\n", - " correct = correct.type(torch.FloatTensor)\n", - "\n", - " # interaction을 임시적으로 correct를 한칸 우측으로 이동한 것으로 사용\n", - " # saint의 경우 decoder에 들어가는 input이다\n", - " interaction = correct + 1 # 패딩을 위해 correct값에 1을 더해준다.\n", - " interaction = interaction.roll(shifts=1, dims=1)\n", - " interaction[:, 0] = 0 # set padding index to the first sequence\n", - " interaction = (interaction * mask).to(torch.int64)\n", - " # print(interaction)\n", - " # exit()\n", - " # test_id, question_id, tag\n", - " test = ((test + 1) * mask).to(torch.int64)\n", - " question = ((question + 1) * mask).to(torch.int64)\n", - " tag = ((tag + 1) * mask).to(torch.int64)\n", - "\n", - " # gather index\n", - " # 마지막 sequence만 사용하기 위한 index\n", - " gather_index = torch.tensor(np.count_nonzero(mask, axis=1))\n", - " gather_index = gather_index.view(-1, 1) - 1\n", - "\n", - "\n", - " # device memory로 이동\n", - "\n", - " test = test.to(args.device)\n", - " question = question.to(args.device)\n", - "\n", - "\n", - " tag = tag.to(args.device)\n", - " correct = correct.to(args.device)\n", - " mask = mask.to(args.device)\n", - "\n", - " interaction = interaction.to(args.device)\n", - " gather_index = gather_index.to(args.device)\n", - "\n", - " return (test, question,\n", - " tag, correct, mask,\n", - " interaction, gather_index)\n", - "\n", - "\n", - "# loss계산하고 parameter update!\n", - "def compute_loss(preds, targets):\n", - " \"\"\"\n", - " Args :\n", - " preds : (batch_size, max_seq_len)\n", - " targets : (batch_size, max_seq_len)\n", - "\n", - " \"\"\"\n", - " loss = get_criterion(preds, targets)\n", - " #마지막 시퀀드에 대한 값만 loss 계산\n", - " loss = loss[:,-1]\n", - " loss = torch.mean(loss)\n", - " return loss\n", - "\n", - "def update_params(loss, model, optimizer, args):\n", - " loss.backward()\n", - " torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)\n", - " optimizer.step()\n", - " optimizer.zero_grad()\n", - "\n", - "\n", - "\n", - "def save_checkpoint(state, model_dir, model_filename):\n", - " print('saving model ...')\n", - " if not os.path.exists(model_dir):\n", - " os.makedirs(model_dir) \n", - " torch.save(state, os.path.join(model_dir, model_filename))\n", - "\n", - "\n", - "\n", - "def load_model(args):\n", - " \n", - " \n", - " model_path = os.path.join(args.model_dir, args.model_name)\n", - " print(\"Loading Model from:\", model_path)\n", - " load_state = torch.load(model_path)\n", - " model = get_model(args)\n", - "\n", - " # 1. load model state\n", - " model.load_state_dict(load_state['state_dict'], strict=True)\n", - " \n", - " \n", - " print(\"Loading Model from:\", model_path, \"...Finished.\")\n", - " return model\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YO_xFaJYnz1B" - }, - "source": [ - "## 5. 전체 프로세스를 담당하는 함수들" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "BMiIOHgJnz1D" - }, - "outputs": [], - "source": [ - "\n", - "def run(args, train_data, valid_data):\n", - " train_loader, valid_loader = get_loaders(args, train_data, valid_data)\n", - " \n", - " # only when using warmup scheduler\n", - " args.total_steps = int(len(train_loader.dataset) / args.batch_size) * (args.n_epochs)\n", - " args.warmup_steps = args.total_steps // 10\n", - " \n", - " model = get_model(args)\n", - " optimizer = get_optimizer(model, args)\n", - " scheduler = get_scheduler(optimizer, args)\n", - "\n", - " best_auc = -1\n", - " early_stopping_counter = 0\n", - " for epoch in range(args.n_epochs):\n", - "\n", - " print(f\"Start Training: Epoch {epoch + 1}\")\n", - " \n", - " ### TRAIN\n", - " train_auc, train_acc, train_loss = train(train_loader, model, optimizer, args)\n", - " \n", - " ### VALID\n", - " auc, acc, _, _ = validate(valid_loader, model, args)\n", - "\n", - " ### TODO: model save or early stopping\n", - " wandb.log({\"epoch\": epoch, \"train_loss\": train_loss, \"train_auc\": train_auc, \"train_acc\":train_acc,\n", - " \"valid_auc\":auc, \"valid_acc\":acc})\n", - " if auc > best_auc:\n", - " best_auc = auc\n", - " # torch.nn.DataParallel로 감싸진 경우 원래의 model을 가져옵니다.\n", - " model_to_save = model.module if hasattr(model, 'module') else model\n", - " save_checkpoint({\n", - " 'epoch': epoch + 1,\n", - " 'state_dict': model_to_save.state_dict(),\n", - " },\n", - " args.model_dir, 'model.pt',\n", - " )\n", - " early_stopping_counter = 0\n", - " else:\n", - " early_stopping_counter += 1\n", - " if early_stopping_counter >= args.patience:\n", - " print(f'EarlyStopping counter: {early_stopping_counter} out of {args.patience}')\n", - " break\n", - "\n", - " # scheduler\n", - " if args.scheduler == 'plateau':\n", - " scheduler.step(best_auc)\n", - " else:\n", - " scheduler.step()\n", - "\n", - "\n", - "def train(train_loader, model, optimizer, args):\n", - " model.train()\n", - "\n", - " total_preds = []\n", - " total_targets = []\n", - " losses = []\n", - " for step, batch in enumerate(train_loader):\n", - " input = process_batch(batch, args)\n", - " preds = model(input)\n", - " targets = input[3] # correct\n", - "\n", - "\n", - " loss = compute_loss(preds, targets)\n", - " update_params(loss, model, optimizer, args)\n", - "\n", - " if step % args.log_steps == 0:\n", - " print(f\"Training steps: {step} Loss: {str(loss.item())}\")\n", - " \n", - " # predictions\n", - " preds = preds[:,-1]\n", - " targets = targets[:,-1]\n", - "\n", - " if args.device == 'cuda':\n", - " preds = preds.to('cpu').detach().numpy()\n", - " targets = targets.to('cpu').detach().numpy()\n", - " else: # cpu\n", - " preds = preds.detach().numpy()\n", - " targets = targets.detach().numpy()\n", - " \n", - " total_preds.append(preds)\n", - " total_targets.append(targets)\n", - " losses.append(loss)\n", - " \n", - "\n", - " total_preds = np.concatenate(total_preds)\n", - " total_targets = np.concatenate(total_targets)\n", - "\n", - " # Train AUC / ACC\n", - " auc, acc = get_metric(total_targets, total_preds)\n", - " loss_avg = sum(losses)/len(losses)\n", - " print(f'TRAIN AUC : {auc} ACC : {acc}')\n", - " return auc, acc, loss_avg\n", - " \n", - "\n", - "def validate(valid_loader, model, args):\n", - " model.eval()\n", - "\n", - " total_preds = []\n", - " total_targets = []\n", - " for step, batch in enumerate(valid_loader):\n", - " input = process_batch(batch, args)\n", - "\n", - " preds = model(input)\n", - " targets = input[3] # correct\n", - "\n", - "\n", - " # predictions\n", - " preds = preds[:,-1]\n", - " targets = targets[:,-1]\n", - " \n", - " if args.device == 'cuda':\n", - " preds = preds.to('cpu').detach().numpy()\n", - " targets = targets.to('cpu').detach().numpy()\n", - " else: # cpu\n", - " preds = preds.detach().numpy()\n", - " targets = targets.detach().numpy()\n", - "\n", - " total_preds.append(preds)\n", - " total_targets.append(targets)\n", - "\n", - " total_preds = np.concatenate(total_preds)\n", - " total_targets = np.concatenate(total_targets)\n", - "\n", - " # Train AUC / ACC\n", - " auc, acc = get_metric(total_targets, total_preds)\n", - " \n", - " print(f'VALID AUC : {auc} ACC : {acc}\\n')\n", - "\n", - " return auc, acc, total_preds, total_targets\n", - "\n", - "\n", - "\n", - "def inference(args, test_data):\n", - " \n", - " model = load_model(args)\n", - " model.eval()\n", - " _, test_loader = get_loaders(args, None, test_data)\n", - " \n", - " \n", - " total_preds = []\n", - " \n", - " for step, batch in enumerate(test_loader):\n", - " input = process_batch(batch, args)\n", - "\n", - " preds = model(input)\n", - " \n", - "\n", - " # predictions\n", - " preds = preds[:,-1]\n", - " \n", - "\n", - " if args.device == 'cuda':\n", - " preds = preds.to('cpu').detach().numpy()\n", - " else: # cpu\n", - " preds = preds.detach().numpy()\n", - " \n", - " total_preds+=list(preds)\n", - "\n", - " write_path = os.path.join(args.output_dir, \"output.csv\")\n", - " if not os.path.exists(args.output_dir):\n", - " os.makedirs(args.output_dir) \n", - " with open(write_path, 'w', encoding='utf8') as w:\n", - " print(\"writing prediction : {}\".format(write_path))\n", - " w.write(\"id,prediction\\n\")\n", - " for id, p in enumerate(total_preds):\n", - " w.write('{},{}\\n'.format(id,p))\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gPEE00qUnz1E" - }, - "source": [ - "## 6.실행부분" - ] - }, - { - "cell_type": "code", - "execution_count": 94, - "metadata": { - "id": "qZmwQenqnz1E" - }, - "outputs": [], - "source": [ - "data_dir = '/opt/ml/input/data/train_dataset'\n", - "file_name = 'train_data.csv'\n", - "test_file_name = 'test_data.csv'\n", - "\n", - "config = {}\n", - "\n", - "# 설정\n", - "config['seed'] = 42\n", - "config['device'] = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", - "config['data_dir'] = data_dir\n", - "config['asset_dir'] = 'asset'\n", - "config['model_dir'] = 'models'\n", - "config['model_name'] = 'model.pt'\n", - "config['output_dir'] = 'output'\n", - "\n", - "# 데이터\n", - "config['max_seq_len'] = 20\n", - "config['num_workers'] = 1\n", - "\n", - "\n", - "# 모델\n", - "config['hidden_dim'] = 64\n", - "config['n_layers'] = 2\n", - "config['dropout'] = 0.2\n", - "\n", - "# 훈련\n", - "config['n_epochs'] = 20\n", - "config['batch_size'] = 64\n", - "config['lr'] = 0.0001\n", - "config['clip_grad'] = 10\n", - "config['log_steps'] = 50\n", - "config['patience'] = 5\n", - "\n", - "\n", - "\n", - "### 중요 ###\n", - "config['model'] = 'lstm'\n", - "config['optimizer'] = 'adam'\n", - "config['scheduler'] = 'plateau'\n", - "\n", - "\n", - "args = easydict.EasyDict(config)" - ] - }, - { - "cell_type": "code", - "execution_count": 95, - "metadata": {}, - "outputs": [], - "source": [ - "def setSeeds(seed = 42):\n", - " # 랜덤 시드를 설정하여 매 코드를 실행할 때마다 동일한 결과를 얻게 합니다.\n", - " os.environ['PYTHONHASHSEED'] = str(seed)\n", - " random.seed(seed)\n", - " np.random.seed(seed)\n", - " torch.manual_seed(seed) \n", - " torch.cuda.manual_seed(seed)\n", - " torch.backends.cudnn.deterministic = True" - ] - }, - { - "cell_type": "code", - "execution_count": 225, - "metadata": { - "id": "rNaRoFrLnz1E" - }, - "outputs": [], - "source": [ - "setSeeds(42)\n", - "\n", - "preprocess = Preprocess(args)\n", - "preprocess.load_train_data(file_name)\n", - "\n", - "train_data = preprocess.get_train_data()\n", - "# train_data, valid_data = preprocess.split_data(train_data)" - ] - }, - { - "cell_type": "code", - "execution_count": 226, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
userIDassessmentItemIDtestIdanswerCodeKnowledgeTag
00A060001001A06000000117224
10A060001002A06000000117225
20A060001003A06000000117225
30A060001004A06000000117225
40A060001005A06000000117225
..................
22665817441A030071005A0300000710438
22665827441A040165001A04000016518836
22665837441A040165002A04000016518836
22665847441A040165003A04000016518836
22665857441A040165004A04000016518836
\n", - "

2266586 rows × 5 columns

\n", - "
" - ], - "text/plain": [ - " userID assessmentItemID testId answerCode KnowledgeTag\n", - "0 0 A060001001 A060000001 1 7224\n", - "1 0 A060001002 A060000001 1 7225\n", - "2 0 A060001003 A060000001 1 7225\n", - "3 0 A060001004 A060000001 1 7225\n", - "4 0 A060001005 A060000001 1 7225\n", - "... ... ... ... ... ...\n", - "2266581 7441 A030071005 A030000071 0 438\n", - "2266582 7441 A040165001 A040000165 1 8836\n", - "2266583 7441 A040165002 A040000165 1 8836\n", - "2266584 7441 A040165003 A040000165 1 8836\n", - "2266585 7441 A040165004 A040000165 1 8836\n", - "\n", - "[2266586 rows x 5 columns]" - ] - }, - "execution_count": 226, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_data[:]" - ] - }, - { - "cell_type": "code", - "execution_count": 228, - "metadata": {}, - "outputs": [], - "source": [ - "train_data['assessmentItemID'] = train_data['assessmentItemID'].str[1:]" - ] - }, - { - "cell_type": "code", - "execution_count": 233, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dtype('O')" - ] - }, - "execution_count": 233, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_data.values.dtype" - ] - }, - { - "cell_type": "code", - "execution_count": 209, - "metadata": {}, - "outputs": [], - "source": [ - "t = train_data['answerCode'].values" - ] - }, - { - "cell_type": "code", - "execution_count": 222, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 1\n", - "1 1\n", - "2 1\n", - "3 1\n", - "4 1\n", - " ..\n", - "2266581 0\n", - "2266582 1\n", - "2266583 1\n", - "2266584 1\n", - "2266585 1\n", - "Name: answerCode, Length: 2266586, dtype: int64" - ] - }, - "execution_count": 222, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_data.pop('answerCode')" - ] - }, - { - "cell_type": "code", - "execution_count": 223, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
userIDassessmentItemIDtestIdKnowledgeTag
005354975618
105355975619
205356975619
305357975619
405358975619
...............
226658174412373456375
226658274413909748784
226658374413910748784
226658474413911748784
226658574413912748784
\n", - "

2266586 rows × 4 columns

\n", - "
" - ], - "text/plain": [ - " userID assessmentItemID testId KnowledgeTag\n", - "0 0 5354 975 618\n", - "1 0 5355 975 619\n", - "2 0 5356 975 619\n", - "3 0 5357 975 619\n", - "4 0 5358 975 619\n", - "... ... ... ... ...\n", - "2266581 7441 2373 456 375\n", - "2266582 7441 3909 748 784\n", - "2266583 7441 3910 748 784\n", - "2266584 7441 3911 748 784\n", - "2266585 7441 3912 748 784\n", - "\n", - "[2266586 rows x 4 columns]" - ] - }, - "execution_count": 223, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_data['ass'] = df['testId'].str[-3:]" - ] - }, - { - "cell_type": "code", - "execution_count": 215, - "metadata": {}, - "outputs": [], - "source": [ - "t2 = train_data.values" - ] - }, - { - "cell_type": "code", - "execution_count": 217, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([1, 1, 1, ..., 1, 1, 1])" - ] - }, - "execution_count": 217, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "t" - ] - }, - { - "cell_type": "code", - "execution_count": 123, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mahnyujin\u001b[0m (use `wandb login --relogin` to force relogin)\n" - ] - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 123, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "wandb.login()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "wandb.init(project='dkt', config=config)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "v9qV6aXonz1E", - "outputId": "0d36ac2e-7ca2-4fc0-cf4c-ea296bc40ce4" - }, - "outputs": [], - "source": [ - "run(args, train_data, valid_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QFY0zXGFnz1F" - }, - "source": [ - "## Inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PcTCBhrZnz1G" - }, - "outputs": [], - "source": [ - "preprocess = Preprocess(args)\n", - "preprocess.load_test_data(test_file_name)\n", - "test_data = preprocess.get_test_data()\n", - "inference(args, test_data)" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "3강_lstm_baseline.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/dkt/dataloader.py b/dkt/dataloader.py index 2b22d8c..0be1fb1 100644 --- a/dkt/dataloader.py +++ b/dkt/dataloader.py @@ -101,7 +101,7 @@ def __preprocessing(self, main_df, sub_df=None, is_train=True): le.fit(a) self.__save_labels(le, col) else: - label_path = os.path.join(self.args.asset_dir, col + '_classes.npy') + label_path = os.path.join(self.args.asset_dir,col+'_classes.npy') le.classes_ = np.load(label_path) main_df[col] = main_df[col].apply(lambda x: x if x in le.classes_ else 'unknown') @@ -109,23 +109,16 @@ def __preprocessing(self, main_df, sub_df=None, is_train=True): main_df[col]= main_df[col].astype(str) trans = le.transform(main_df[col]) main_df[col] = trans - - - def convert_time(s): - timestamp = time.mktime(datetime.strptime(s, '%Y-%m-%d %H:%M:%S').timetuple()) - return int(timestamp) - - main_df['Timestamp'] = main_df['Timestamp'].apply(convert_time) return main_df def __feature_engineering(self, df): self.args.USERID_COLUMN = ['userID'] + self.args.USE_COLUMN = ['elapsed', 'time_bin', 'classification', 'paperNum', 'problemNum', 'user_total_acc', 'test_acc', 'assessment_acc', 'tag_acc', 'past_correct', 'past_content_count', 'correct_per_hour', 'same_tag', 'cont_tag'] + self.args.EXCLUDE_COLUMN = ['assessmentItemID', 'testId', 'Timestamp', 'KnowledgeTag', 'item', 'item_order', 'user_total_correct_cnt', 'user_total_ans_cnt', 'test_size', 'retest', 'user_test_ans_cnt', 'user_test_correct_cnt', 'user_acc', 'test_mean', 'test_sum', 'ItemID_mean', 'ItemID_sum','tag_mean', 'tag_sum', 'hours', 'time', 'correct_shift_-2', 'correct_shift_-1', 'correct_shift_1', 'correct_shift_2', 'total_used_time', 'shift', 'future_correct', 'past_content_correct', 'past_count', 'average_correct', 'average_content_correct', 'mean_time', 'time_median', 'hour', 'hour_mode', 'is_night', 'normalized_time', 'relative_time', 'time_cut', 'time_qcut'] self.args.ANSWER_COLUMN = ['answerCode'] - self.args.USE_COLUMN = ['assessmentItemID', 'testId', 'Timestamp', 'KnowledgeTag', 'elapsed', 'item', 'item_order', 'user_total_correct_cnt', 'user_total_ans_cnt', 'user_total_acc', 'test_size', 'retest', 'user_test_ans_cnt', 'user_test_correct_cnt', 'user_acc', 'test_mean', 'test_sum', 'ItemID_mean', 'ItemID_sum', 'tag_mean', 'tag_sum', 'classification', 'paperNum', 'problemNum', 'hours', 'time_bin', 'tag_acc', 'assessment_acc', 'test_acc', 'time', 'correct_shift_-2', 'correct_shift_-1', 'correct_shift_1', 'correct_shift_2', 'total_used_time', 'shift', 'past_correct', 'future_correct', 'past_content_correct', 'past_count', 'average_correct', 'past_content_count', 'average_content_correct', 'mean_time', 'assessmentItemID_mean', 'assessmentItemID_std', 'answerCode_mean', 'answerCode_std', 'KnowledgeTag_mean', 'KnowledgeTag_std', 'elapsed_mean', 'elapsed_std', 'item_mean', 'item_std', 'item_order_mean', 'item_order_std', 'user_total_correct_cnt_mean', 'user_total_correct_cnt_std', 'user_total_ans_cnt_mean', 'user_total_ans_cnt_std', 'user_total_acc_mean', 'user_total_acc_std', 'test_size_mean', 'test_size_std', 'retest_mean', 'retest_std', 'user_test_ans_cnt_mean', 'user_test_ans_cnt_std', 'user_test_correct_cnt_mean', 'user_test_correct_cnt_std', 'user_acc_mean', 'user_acc_std', 'test_mean_mean', 'test_mean_std', 'test_sum_mean', 'test_sum_std', 'ItemID_mean_mean', 'ItemID_mean_std', 'ItemID_sum_mean', 'ItemID_sum_std', 'tag_mean_mean', 'tag_mean_std', 'tag_sum_mean', 'tag_sum_std', 'classification_mean', 'classification_std', 'paperNum_mean', 'paperNum_std', 'problemNum_mean', 'problemNum_std', 'hours_mean', 'hours_std', 'time_bin_mean', 'time_bin_std', 'tag_acc_mean', 'tag_acc_std', 'assessment_acc_mean', 'assessment_acc_std', 'test_acc_mean', 'test_acc_std', 'time_mean', 'time_std', 'correct_shift_-2_mean', 'correct_shift_-2_std', 'correct_shift_-1_mean', 'correct_shift_-1_std', 'correct_shift_1_mean', 'correct_shift_1_std', 'correct_shift_2_mean', 'correct_shift_2_std', 'total_used_time_mean', 'total_used_time_std', 'shift_mean', 'shift_std', 'past_correct_mean', 'past_correct_std', 'future_correct_mean', 'future_correct_std', 'past_content_correct_mean', 'past_content_correct_std', 'past_count_mean', 'past_count_std', 'average_correct_mean', 'average_correct_std', 'past_content_count_mean', 'past_content_count_std', 'average_content_correct_mean', 'average_content_correct_std', 'mean_time_mean', 'mean_time_std', 'time_median', 'hour', 'correct_per_hour', 'hour_mode', 'is_night', 'normalized_time', 'relative_time', 'time_cut', 'time_qcut', 'same_tag', 'cont_tag', ] - self.args.EXCLUDE_COLUMN = [] - + assert df.head().shape[1] == len(self.args.USERID_COLUMN) + len(self.args.ANSWER_COLUMN) + len( self.args.USE_COLUMN) + len(self.args.EXCLUDE_COLUMN) @@ -141,7 +134,7 @@ def load_data_from_file(self, main_file_name, sub_file_name=None, is_train=True) # args.use_test_to_train이 True일때 test셋도 학습에 사용 if self.args.use_test_to_train: csv_file_path = os.path.join(self.args.data_dir, self.args.test_file_name) - test_df = pd.read_csv(csv_file) + test_df = pd.read_csv(csv_file_path) test_df = test_df[test_df.answerCode != -1].copy() main_df += test_df print("test셋 학습에 추가!") @@ -168,7 +161,7 @@ def load_data_from_file(self, main_file_name, sub_file_name=None, is_train=True) self.df_apply_function ) if self.args.model =='tabnet': - g = main_df[self.args.USERID_COLUMN + self.args.USE_COLUMN+self.args.ANSWER_COLUMN] + g = main_df[columns] return g return group.values @@ -178,19 +171,7 @@ def load_train_data(self, train_file, valid_file): print() if self.args.window: - augmented_train_numpy_name = train_file.split('.')[0] + '_msl' + str(self.args.max_seq_len) + '_st' + str(self.args.stride) + '.npy' - augmented_train_numpy_path = os.path.join(self.args.data_dir, augmented_train_numpy_name) - - if os.path.exists(augmented_train_numpy_path): - print(f"{augmented_train_numpy_name} exists!") - self.train_data = np.load(augmented_train_numpy_path, allow_pickle=True) - print(f"{augmented_train_numpy_name} is loaded!") - else: - print(f"{augmented_train_numpy_name} doesn't exist!") - self.train_data = self.sliding_window() - np.save(augmented_train_numpy_path, self.train_data) - print(f"{augmented_train_numpy_name} is saved!") - print() + self.train_data = self.sliding_window() def load_valid_data(self, valid_file): diff --git a/dkt/model.py b/dkt/model.py index d8d994a..c9d3849 100644 --- a/dkt/model.py +++ b/dkt/model.py @@ -3,21 +3,22 @@ import torch.nn as nn import torch.nn.functional as F import numpy as np +import pandas as pd import copy import math import re import os from pytorch_tabnet.tab_model import TabNetClassifier from pytorch_tabnet.pretraining import TabNetPretrainer +from pycaret.classification import * +from pycaret.utils import check_metric +import random try: from transformers.modeling_bert import BertConfig, BertEncoder, BertModel except: from transformers.models.bert.modeling_bert import BertConfig, BertEncoder, BertModel - - - class LSTM(nn.Module): def __init__(self, args): @@ -36,11 +37,6 @@ def __init__(self, args): for value in self.args.n_embedding_layers: self.embedding_features.append(nn.Embedding(value + 1, self.hidden_dim // self.args.dim_div)) - #self.embedding_classification = nn.Embedding(self.args.n_class + 1, self.hidden_dim//3) - #self.embedding_paperNum = nn.Embedding(self.args.n_paper + 1, self.hidden_dim//3) - #self.embedding_problemNum = nn.Embedding(self.args.n_problem + 1, self.hidden_dim//3) - #self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim//3) - # embedding combination projection # +1은 interaction self.comb_proj = nn.Linear((self.hidden_dim//self.args.dim_div)*(len(self.args.n_embedding_layers)+1), self.hidden_dim) @@ -84,10 +80,6 @@ def forward(self, input): for _input, _embedding_feature in zip(input[:-4], self.embedding_features): value = _embedding_feature(_input) embed_features.append(value) - #embed_classification = self.embedding_classification(classification) - #embed_paperNum = self.embedding_paperNum(paperNum) - #embed_problemNum = self.embedding_problemNum(problemNum) - #embed_tag = self.embedding_tag(tag) embed_features = [embed_interaction] + embed_features @@ -418,7 +410,6 @@ def init_hidden(self, batch_size): def forward(self, input): -# test, question, tag, _, mask, interaction, index = input _, mask, interaction, index = input[-4:] batch_size = interaction.size(0) seq_len = interaction.size(1) @@ -733,58 +724,6 @@ def forward(self, input): return preds -class EncoderLayer(nn.Module): - def __init__(self, args): - super(EncoderLayer, self).__init__() - self.args = args - self.device = args.device - - # Defining some parameters - self.hidden_dim = self.args.hidden_dim - self.n_layers = self.args.n_layers - - self.query = nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim) - self.key = nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim) - self.value = nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim) - - self.attn = nn.MultiheadAttention(embed_dim=self.hidden_dim, num_heads=self.args.n_heads) - - self.ffn1 = nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim) - self.ffn2 = nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim) - - if self.args.layer_norm: - self.ln1 = nn.LayerNorm(self.hidden_dim) - self.ln2 = nn.LayerNorm(self.hidden_dim) - - - def forward(self, embed, mask): - q = self.query(embed).permute(1, 0, 2) - k = self.key(embed).permute(1, 0, 2) - v = self.value(embed).permute(1, 0, 2) - - ## attention - out, _ = self.attn(q, k, v, attn_mask=mask) - - ## residual + layer norm - out = out.permute(1, 0, 2) - out = embed + out - - if self.args.layer_norm: - out = self.ln1(out) - - ## feed forward network - out = self.ffn1(out) - out = F.relu(out) - out = self.ffn2(out) - - ## residual + layer norm - out = embed + out - - if self.args.layer_norm: - out = self.ln2(out) - - return out - class TabNet(nn.Module): def __init__(self, args): super(TabNet, self).__init__() @@ -833,4 +772,26 @@ def forward(self): if self.args.tabnet_pretrain: return self.unsupervised_model, self.clf return self.clf - \ No newline at end of file + + +class LGBM(nn.Module): + def __init__(self, args): + self.args = args + self.device = args.device + + def fit(self, X_train, y_train, X_valid, y_valid, FEATS, categorical_features=[],numeric_features=[],seed=47): + + X_trn = X_train.merge(y_train, on=X_train.index) + X_val = X_valid.merge(y_valid, on=X_valid.index) + + random.seed(seed) + settings = setup(data=X_trn[FEATS], target='answerCode', categorical_features=categorical_features, numeric_features=numeric_features, silent=True) + + lgbm = create_model('lightgbm', sort='AUC') + tuned_lgbm = tune_model(lgbm, optimize='AUC', fold=10) + final_lgbm = finalize_model(tuned_lgbm) + + log = [] + prediction = predict_model(final_lgbm, data=X_val[FEATS], raw_score=True) + log.append(f"{check_metric(prediction['answerCode'], prediction['Label'], metric='Accuracy')} ,{check_metric(prediction['answerCode'], prediction['Score_1'], metric='AUC')}") + return final_lgbm, log \ No newline at end of file diff --git a/dkt/trainer.py b/dkt/trainer.py index 5080b49..a43aae1 100644 --- a/dkt/trainer.py +++ b/dkt/trainer.py @@ -10,9 +10,11 @@ from .scheduler import get_scheduler from .criterion import get_criterion from .metric import get_metric -from .model import LSTM, LSTMATTN, Bert, LastQuery, TfixupBert, Saint, TabNet +from .model import LSTM, LSTMATTN, Bert, LastQuery, TfixupBert, Saint, TabNet, LGBM from pytorch_tabnet.tab_model import TabNetClassifier from pytorch_tabnet.pretraining import TabNetPretrainer +from pycaret.classification import * +from pycaret.utils import check_metric from datetime import timedelta, timezone, datetime import wandb @@ -48,8 +50,6 @@ def tabnet_run(args, train_data, valid_data, test_data): valid_data.pop('answerCode') x_v = valid_data.values - print(x_t.dtype) - if args.tabnet_pretrain: model = get_tabnet_model(args) pre_model, model = model.forward() @@ -104,7 +104,44 @@ def tabnet_run(args, train_data, valid_data, test_data): 'valid_full_logloss' : model.history['valid_logloss'][idx], }) - +def lgbm_run(args): + print(args) + + train = pd.read_csv(os.path.join(args.data_dir, args.train_file_name)) + valid = pd.read_csv(os.path.join(args.data_dir, args.valid_file_name)) + test = pd.read_csv(os.path.join(args.data_dir, args.test_file_name)) + + if args.use_pseudo: + pseudo_labels = pd.read_csv(args.pseudo_label_file) # '/opt/ml/p4-dkt-no_caffeine_no_gain/highest.csv' + pseudo_labels = pseudo_labels['prediction'].to_numpy() + pseudo_labels = np.where(pseudo_labels >= 0.5, 1, 0) + + pseudo_train_data = update_train_data(pseudo_labels, train, test) + train = pseudo_train_data + + model_dir = os.path.join(args.model_dir, args.model_name) + os.makedirs(model_dir, exist_ok=True) + json.dump( + vars(args), + open(f"{model_dir}/exp_config.json", "w"), + indent=2, + ensure_ascii=False, + ) + print(f"\n{model_dir}/exp_config.json is saved!\n") + + FEATS = args.ANSWER_COLUMN + args.USE_COLUMN + + X_train = train[args.USE_COLUMN] + y_train = train[args.ANSWER_COLUMN] + + X_valid = valid[args.USE_COLUMN] + y_valid = valid[args.ANSWER_COLUMN] + + model = LGBM(args) + model, log = model.fit(X_train, y_train, X_valid, y_valid, FEATS) + save_model(model, f"{model_dir}/model") + if args.use_wandb: + wandb.log({log}) def run(args, train_data, valid_data, test_data): if args.use_pseudo: @@ -114,6 +151,10 @@ def run(args, train_data, valid_data, test_data): pseudo_train_data = update_train_data(pseudo_labels, train_data, test_data) train_data = pseudo_train_data + + print(f"# of train data : {len(train_data)}") + print(f"# of valid data : {len(valid_data)}") + print() train_loader, valid_loader = get_loaders(args, train_data, valid_data) # only when using warmup scheduler @@ -133,6 +174,11 @@ def run(args, train_data, valid_data, test_data): print(f"\n{model_dir}/exp_config.json is saved!\n") model = get_model(args) + if args.use_finetune: + load_state = torch.load(args.trained_model) + model.load_state_dict(load_state['state_dict'], strict=True) + print(f"{args.trained_model} is loaded!") + optimizer = get_optimizer(model, args) scheduler = get_scheduler(optimizer, args) @@ -276,8 +322,8 @@ def tabnet_inference(args, test_data): prediction_name = datetime.now(timezone(timedelta(hours=9))).strftime('%m%d_%H%M') - output_dir = '/opt/ml/p4-dkt-no_caffeine_no_gain/output/' - write_path = os.path.join(output_dir, f"{prediction_name}.csv") + output_dir = args.output_dir + write_path = os.path.join(output_dir, f"{args.model_name}.csv") if not os.path.exists(output_dir): os.makedirs(output_dir) with open(write_path, 'w', encoding='utf8') as w: @@ -286,6 +332,31 @@ def tabnet_inference(args, test_data): for id, p in enumerate(preds): w.write('{},{}\n'.format(id,p)) +def lgbm_inference(args): + from pycaret.classification import load_model as py_load_model + + model_dir = os.path.join(args.model_dir, args.model_name) + loaded_clf = py_load_model(f"{model_dir}/model") + + test_data = pd.read_csv(os.path.join(args.data_dir, args.test_file_name)) + test = test_data[test_data['userID'] != test_data['userID'].shift(-1)] + + FEATS = args.ANSWER_COLUMN + args.USE_COLUMN + + prediction = predict_model(loaded_clf, data=test[FEATS], raw_score=True) + preds = prediction.Score_1.values + + prediction_name = datetime.now(timezone(timedelta(hours=9))).strftime('%m%d_%H%M') + + output_dir = args.output_dir + write_path = os.path.join(output_dir, f"{args.model_name}.csv") + if not os.path.exists(output_dir): + os.makedirs(output_dir) + with open(write_path, 'w', encoding='utf8') as w: + print("writing prediction : {}".format(write_path)) + w.write("id,prediction\n") + for id, p in enumerate(preds): + w.write('{},{}\n'.format(id,p)) def inference(args, test_data): @@ -321,9 +392,7 @@ def inference(args, test_data): w.write("id,prediction\n") for id, p in enumerate(total_preds): w.write('{},{}\n'.format(id,p)) - - - + def get_tabnet_model(args): if args.tabnet_pretrain: pretrain_model,model = TabNet(args) @@ -458,10 +527,12 @@ def update_train_data(pseudo_labels, train_data, test_data): pseudo_test_data = copy.deepcopy(test_data) # pseudo label 테스트 데이터 update - for test_data, pseudo_label in zip(pseudo_test_data, pseudo_labels): - test_data[-1][-1] = pseudo_label + for p_test_data, pseudo_label in zip(pseudo_test_data, pseudo_labels): + p_test_data[-1][-1] = pseudo_label # train data 업데이트 - pseudo_train_data = np.concatenate((train_data, pseudo_test_data)) + # pseudo_train_data = np.concatenate((train_data, pseudo_test_data)) + pseudo_train_data = pseudo_test_data + print("pseudo_trian is ready!") return pseudo_train_data diff --git a/ensemble.py b/ensemble.py new file mode 100644 index 0000000..9d2022a --- /dev/null +++ b/ensemble.py @@ -0,0 +1,18 @@ +import pandas as pd +import os + +# soft ensemble +output_path = 'your output path' +output_name_list = ['your out files in output path'] +outputs = [] + +for output_name in output_name_list: + outputs.append(pd.read_csv(os.join(output_path, output_name))) + +outputs_pd = outputs[-1].copy() +outputs_pd['prediction'] = 0.0 + +for i in range(len(outputs_pd)): + outputs_pd['prediction'] += outputs_pd[i]['prediction'] +outputs_pd['prediction'] = outputs_pd['prediction'] / len(outputs_pd) +outputs_pd.to_csv("soft_ensemble.csv", index=False) \ No newline at end of file diff --git a/inference.py b/inference.py index e49e871..238fbf7 100644 --- a/inference.py +++ b/inference.py @@ -11,7 +11,6 @@ def main(args): device = "cuda" if torch.cuda.is_available() else "cpu" args.device = device - preprocess = Preprocess(args) preprocess.load_test_data(args.test_file_name) test_data = preprocess.get_test_data() @@ -24,6 +23,8 @@ def main(args): if args.model == 'tabnet': test_data_shift = test_data[test_data['userID'] != test_data['userID'].shift(-1)] trainer.tabnet_inference(args, test_data_shift) + elif args.model == 'lgbm': + trainer.lgbm_inference(args) else: trainer.inference(args, test_data) diff --git a/make_elapsed.py b/make_custom_data/make_elapsed.py similarity index 100% rename from make_elapsed.py rename to make_custom_data/make_elapsed.py diff --git a/make_fixed_data.py b/make_custom_data/make_fixed_data.py similarity index 100% rename from make_fixed_data.py rename to make_custom_data/make_fixed_data.py diff --git a/make_original_fixed_data.py b/make_custom_data/make_original_fixed_data.py similarity index 100% rename from make_original_fixed_data.py rename to make_custom_data/make_original_fixed_data.py diff --git a/requirements.txt b/requirements.txt index 80d0982..41a6496 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ sklearn tqdm wandb transformers -easydict \ No newline at end of file +easydict +pytorch-tabnet \ No newline at end of file diff --git a/train.py b/train.py index aafa166..2aa0315 100644 --- a/train.py +++ b/train.py @@ -6,6 +6,8 @@ import torch from dkt.utils import setSeeds import wandb +import json +import argparse def main(args): if args.use_wandb: @@ -22,12 +24,16 @@ def main(args): train_data = preprocess.get_train_data() valid_data = preprocess.get_valid_data() test_data = None + + if args.use_pseudo: preprocess.load_test_data(args.test_file_name) test_data = preprocess.get_test_data() if args.model == 'tabnet': trainer.tabnet_run(args, train_data, valid_data, test_data) + elif args.model == 'lgbm': + trainer.lgbm_run(args) else: trainer.run(args, train_data, valid_data, test_data)