import datetime
import time

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from torch.utils.data import Dataset, DataLoader


def preprocess_transaction(new_trans: pd.DataFrame, merchants: pd.DataFrame):
    # Encode the year-week of each purchase as a single ordinal code.
    # (Series.dt.week was removed from pandas; use isocalendar().week.)
    year = new_trans['purchase_date'].dt.year.astype(str)
    week = new_trans['purchase_date'].dt.isocalendar().week.astype(str)
    new_trans['purchase_period'] = year.str.cat(week, sep="_")
    new_trans['purchase_period'] = new_trans['purchase_period'].astype('category').cat.codes
    new_trans['purchase_month'] = new_trans['purchase_date'].dt.month

    new_trans = pd.get_dummies(new_trans, columns=['category_2', 'category_3'])
    for col in ['authorized_flag', 'category_1']:
        new_trans[col] = new_trans[col].map({'Y': 1, 'N': 0})

    drop_columns = ['city_id', 'state_id', 'subsector_id', 'merchant_group_id',
                    'merchant_category_id', 'category_2', 'active_months_lag3',
                    'active_months_lag6', 'active_months_lag12']
    # Drop without inplace so the caller's frame is left untouched.
    merchants = merchants.drop(drop_columns, axis=1)
    for col in ['category_1', 'category_4']:
        merchants[col] = merchants[col].map({'Y': 1, 'N': 0})
    merchants = pd.get_dummies(merchants, columns=['most_recent_sales_range', 'most_recent_purchases_range'])

    return pd.merge(new_trans, merchants, on="merchant_id", how="left", suffixes=('_nt', '_mc'))
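
# A minimal usage sketch, assuming the Elo Merchant Category Recommendation CSV
# layout; the file names below are illustrative, not part of this script:
#
#   new_trans = pd.read_csv("new_merchant_transactions.csv", parse_dates=["purchase_date"])
#   merchants = pd.read_csv("merchants.csv")
#   trans = preprocess_transaction(new_trans, merchants)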


def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the smallest dtype that holds their range."""
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # Caution: float16 keeps only ~3 decimal digits, so IDs and
                # monetary amounts can lose precision in this branch.
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print('Mem. usage decreased to {:5.2f} MB ({:.1f}% reduction)'.format(
            end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
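
# Typical call right after loading a large CSV (a sketch; the variable name is
# illustrative):
#
#   trans = reduce_mem_usage(pd.read_csv("new_merchant_transactions.csv"))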


def train_lgbm(train: pd.DataFrame, test: pd.DataFrame, target, features, categorical_feats):
    param = {'num_leaves': 100,
             'min_data_in_leaf': 30,
             'objective': 'regression',
             'max_depth': 6,
             'learning_rate': 0.005,
             'min_child_samples': 20,
             'boosting': 'gbdt',
             'feature_fraction': 0.9,
             'bagging_freq': 1,
             'bagging_fraction': 0.9,
             'bagging_seed': 11,
             'metric': 'rmse',
             'lambda_l1': 0.1,
             'verbosity': -1}
    folds = KFold(n_splits=5, shuffle=True, random_state=15)
    oof = np.zeros(len(train))
    predictions = np.zeros(len(test))
    start = time.time()
    feature_importance_df = pd.DataFrame()

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
        print("fold n°{}".format(fold_))
        trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
        val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx], categorical_feature=categorical_feats)

        num_round = 10000
        # LightGBM >= 4 removed verbose_eval/early_stopping_rounds from
        # lgb.train in favour of callbacks.
        clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data],
                        callbacks=[lgb.log_evaluation(100), lgb.early_stopping(200)])
        oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = features
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

    print("CV score: {:<8.5f} ({:.0f}s)".format(mean_squared_error(target, oof) ** 0.5, time.time() - start))
    return oof, predictions, feature_importance_df
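
# Sketch for inspecting the per-fold importances returned above (the variable
# names are illustrative):
#
#   oof, preds, importance = train_lgbm(train, test, target, features, [])
#   print(importance.groupby("feature")["importance"].mean()
#         .sort_values(ascending=False).head(20))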


def create_submission(test: pd.DataFrame, predictions):
    sub_df = pd.DataFrame({"card_id": test["card_id"].values})
    sub_df["target"] = predictions
    sub_df.to_csv("submit.csv", index=False)


def read_data(input_file: str):
    df = pd.read_csv(input_file, parse_dates=['first_active_month'])
    # Subtract datetimes directly so the result stays timedelta-typed and
    # supports .dt.days (going through .dt.date yields an object Series).
    df['elapsed_time'] = (pd.Timestamp(2018, 2, 1) - df['first_active_month']).dt.days
    return df


class SampleDataset(Dataset):
    """PyTorch dataset over a frame whose first column is card_id and whose
    remaining columns are numeric features."""

    def __init__(self, train_df: pd.DataFrame, trans_df: pd.DataFrame):
        self.train_df: pd.DataFrame = train_df
        self.trans_df: pd.DataFrame = trans_df

    def __len__(self):
        return len(self.train_df)

    def __getitem__(self, idx):
        card_id = self.train_df.iloc[idx, 0]
        # .as_matrix() was removed from pandas; cast to float32 so the default
        # DataLoader collate can stack the features into a tensor.
        card_features = self.train_df.iloc[idx, 1:].to_numpy(dtype=np.float32)

        return {
            'card_id': card_id,
            'features': card_features
        }
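

# A minimal end-to-end sketch; the CSV names and the feature/categorical lists
# are illustrative placeholders, not part of the original script.
if __name__ == "__main__":
    train = read_data("train.csv")
    test = read_data("test.csv")

    features = [c for c in train.columns
                if c not in ("card_id", "target", "first_active_month")]
    oof, predictions, importance = train_lgbm(train, test, train["target"],
                                              features, categorical_feats=[])
    create_submission(test, predictions)

    # Batch the training rows through the PyTorch dataset above; trans_df is
    # unused by SampleDataset as written, so None is passed.
    loader = DataLoader(SampleDataset(train[["card_id"] + features], None),
                        batch_size=32, shuffle=False)
    batch = next(iter(loader))
    print(batch["card_id"][:3], batch["features"].shape)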