import datetime
import time

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from torch.utils.data import Dataset, DataLoader


def preprocess_transaction(new_trans: pd.DataFrame, merchants: pd.DataFrame):
    # Encode the year-week of each purchase as a single ordinal code.
    # (Series.dt.week was removed from pandas; use isocalendar().week.)
    year = new_trans['purchase_date'].dt.year.astype(str)
    week = new_trans['purchase_date'].dt.isocalendar().week.astype(str)
    new_trans['purchase_period'] = year.str.cat(week, sep="_")
    new_trans['purchase_period'] = new_trans['purchase_period'].astype('category').cat.codes
    new_trans['purchase_month'] = new_trans['purchase_date'].dt.month

    new_trans = pd.get_dummies(new_trans, columns=['category_2', 'category_3'])
    for col in ['authorized_flag', 'category_1']:
        new_trans[col] = new_trans[col].map({'Y': 1, 'N': 0})

    drop_columns = ['city_id', 'state_id', 'subsector_id', 'merchant_group_id',
                    'merchant_category_id', 'category_2', 'active_months_lag3',
                    'active_months_lag6', 'active_months_lag12']
    # Drop without inplace so the caller's frame is left untouched.
    merchants = merchants.drop(drop_columns, axis=1)
    for col in ['category_1', 'category_4']:
        merchants[col] = merchants[col].map({'Y': 1, 'N': 0})
    merchants = pd.get_dummies(merchants, columns=['most_recent_sales_range', 'most_recent_purchases_range'])

    return pd.merge(new_trans, merchants, on="merchant_id", how="left", suffixes=('_nt', '_mc'))
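
# A minimal usage sketch, assuming the Elo Merchant Category Recommendation CSV
# layout; the file names below are illustrative, not part of this script:
#
#   new_trans = pd.read_csv("new_merchant_transactions.csv", parse_dates=["purchase_date"])
#   merchants = pd.read_csv("merchants.csv")
#   trans = preprocess_transaction(new_trans, merchants)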


def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the smallest dtype that holds their range."""
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # Caution: float16 keeps only ~3 decimal digits, so IDs and
                # monetary amounts can lose precision in this branch.
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print('Mem. usage decreased to {:5.2f} MB ({:.1f}% reduction)'.format(
            end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
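
# Typical call right after loading a large CSV (a sketch; the variable name is
# illustrative):
#
#   trans = reduce_mem_usage(pd.read_csv("new_merchant_transactions.csv"))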


def train_lgbm(train: pd.DataFrame, test: pd.DataFrame, target, features, categorical_feats):
    param = {'num_leaves': 100,
             'min_data_in_leaf': 30,
             'objective': 'regression',
             'max_depth': 6,
             'learning_rate': 0.005,
             'min_child_samples': 20,
             'boosting': 'gbdt',
             'feature_fraction': 0.9,
             'bagging_freq': 1,
             'bagging_fraction': 0.9,
             'bagging_seed': 11,
             'metric': 'rmse',
             'lambda_l1': 0.1,
             'verbosity': -1}
    folds = KFold(n_splits=5, shuffle=True, random_state=15)
    oof = np.zeros(len(train))
    predictions = np.zeros(len(test))
    start = time.time()
    feature_importance_df = pd.DataFrame()

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
        print("fold n°{}".format(fold_))
        trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
        val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx], categorical_feature=categorical_feats)

        num_round = 10000
        # LightGBM >= 4 removed verbose_eval/early_stopping_rounds from
        # lgb.train in favour of callbacks.
        clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data],
                        callbacks=[lgb.log_evaluation(100), lgb.early_stopping(200)])
        oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = features
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

    print("CV score: {:<8.5f} ({:.0f}s)".format(mean_squared_error(target, oof) ** 0.5, time.time() - start))
    return oof, predictions, feature_importance_df
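
# Sketch for inspecting the per-fold importances returned above (the variable
# names are illustrative):
#
#   oof, preds, importance = train_lgbm(train, test, target, features, [])
#   print(importance.groupby("feature")["importance"].mean()
#         .sort_values(ascending=False).head(20))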


def create_submission(test: pd.DataFrame, predictions):
    sub_df = pd.DataFrame({"card_id": test["card_id"].values})
    sub_df["target"] = predictions
    sub_df.to_csv("submit.csv", index=False)


def read_data(input_file: str):
    df = pd.read_csv(input_file, parse_dates=['first_active_month'])
    # Subtract datetimes directly so the result stays timedelta-typed and
    # supports .dt.days (going through .dt.date yields an object Series).
    df['elapsed_time'] = (pd.Timestamp(2018, 2, 1) - df['first_active_month']).dt.days
    return df


class SampleDataset(Dataset):
    """PyTorch dataset over a frame whose first column is card_id and whose
    remaining columns are numeric features."""

    def __init__(self, train_df: pd.DataFrame, trans_df: pd.DataFrame):
        self.train_df: pd.DataFrame = train_df
        self.trans_df: pd.DataFrame = trans_df

    def __len__(self):
        return len(self.train_df)

    def __getitem__(self, idx):
        card_id = self.train_df.iloc[idx, 0]
        # .as_matrix() was removed from pandas; cast to float32 so the default
        # DataLoader collate can stack the features into a tensor.
        card_features = self.train_df.iloc[idx, 1:].to_numpy(dtype=np.float32)

        return {
            'card_id': card_id,
            'features': card_features
        }
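

# A minimal end-to-end sketch; the CSV names and the feature/categorical lists
# are illustrative placeholders, not part of the original script.
if __name__ == "__main__":
    train = read_data("train.csv")
    test = read_data("test.csv")

    features = [c for c in train.columns
                if c not in ("card_id", "target", "first_active_month")]
    oof, predictions, importance = train_lgbm(train, test, train["target"],
                                              features, categorical_feats=[])
    create_submission(test, predictions)

    # Batch the training rows through the PyTorch dataset above; trans_df is
    # unused by SampleDataset as written, so None is passed.
    loader = DataLoader(SampleDataset(train[["card_id"] + features], None),
                        batch_size=32, shuffle=False)
    batch = next(iter(loader))
    print(batch["card_id"][:3], batch["features"].shape)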