
Commit 89cd180
Author: v-duc
Commit message: add files
Parent: 8801e8d

File tree
13 files changed: 624 additions & 0 deletions


.gitignore

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
examples/data
.ipynb_checkpoints
*.pyc
.idea
__pycache__
.vscode
data

environment-cpu.yml

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
name: vision
channels:
  - fastai
  - pytorch
  - defaults
dependencies:
  - blas=1.0=mkl
  - ca-certificates=2018.03.07=0
  - certifi=2018.10.15=py36_0
  - cffi=1.11.5=py36h6174b99_1
  - freetype=2.9.1=hb4e5f40_0
  - intel-openmp=2019.1=144
  - jpeg=9b=he5867d9_2
  - libcxx=4.0.1=hcfea43d_1
  - libcxxabi=4.0.1=hcfea43d_1
  - libedit=3.1.20170329=hb402a30_2
  - libffi=3.2.1=h475c297_4
  - libgfortran=3.0.1=h93005f0_2
  - libpng=1.6.35=ha441bb4_0
  - libtiff=4.0.9=hcb84e12_2
  - mkl=2018.0.3=1
  - mkl_fft=1.0.6=py36hb8a8100_0
  - mkl_random=1.0.1=py36h5d10147_1
  - ncurses=6.1=h0a44026_0
  - ninja=1.8.2=py36h04f5b5a_1
  - numpy=1.15.4=py36h6a91979_0
  - numpy-base=1.15.4=py36h8a80b8c_0
  - olefile=0.46=py36_0
  - openssl=1.1.1a=h1de35cc_0
  - pillow=5.3.0=py36hb68e598_0
  - pip=18.1=py36_0
  - pycparser=2.19=py36_0
  - python=3.6.7=haf84260_0
  - readline=7.0=h1de35cc_5
  - setuptools=40.6.2=py36_0
  - six=1.11.0=py36_1
  - sqlite=3.25.3=ha441bb4_0
  - tk=8.6.8=ha441bb4_0
  - wheel=0.32.3=py36_0
  - xz=5.2.4=h1de35cc_4
  - zlib=1.2.11=h1de35cc_3
  - torchvision-nightly-cpu=0.2.1=pyh19dea27_0
  - pytorch-nightly-cpu=1.0.0.dev20181014=py3.6_0
  - pip:
    - torch==1.0.0.dev20181014
    - torchvision==0.2.1
    - fastai==1.0.28

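Assuming a standard Anaconda/Miniconda setup, this CPU environment can be created with "conda env create -f environment-cpu.yml" and activated with "conda activate vision".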
environment.yml

Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
name: vision
channels:
  - pytorch
  - fastai
  - defaults
dependencies:
  - asn1crypto=0.24.0=py36_0
  - blas=1.0=mkl
  - bottleneck=1.2.1=py36h035aef0_1
  - ca-certificates=2018.03.07=0
  - certifi=2018.10.15=py36_0
  - cffi=1.11.5=py36he75722e_1
  - chardet=3.0.4=py36_1
  - cryptography=2.3.1=py36hc365091_0
  - cycler=0.10.0=py36_0
  - cymem=2.0.2=py36hfd86e86_0
  - cytoolz=0.9.0.1=py36h14c3975_1
  - dbus=1.13.2=h714fa37_1
  - dill=0.2.8.2=py36_0
  - expat=2.2.6=he6710b0_0
  - fontconfig=2.13.0=h9420a91_0
  - freetype=2.9.1=h8a8886c_1
  - glib=2.56.2=hd408876_0
  - gst-plugins-base=1.14.0=hbbd80ab_1
  - gstreamer=1.14.0=hb453b48_1
  - icu=58.2=h9c2bf20_1
  - idna=2.7=py36_0
  - intel-openmp=2019.1=144
  - jpeg=9b=h024ee3a_2
  - kiwisolver=1.0.1=py36hf484d3e_0
  - libedit=3.1.20170329=h6b74fdf_2
  - libffi=3.2.1=hd88cf55_4
  - libgcc-ng=8.2.0=hdf63c60_1
  - libgfortran-ng=7.3.0=hdf63c60_0
  - libpng=1.6.35=hbc83047_0
  - libstdcxx-ng=8.2.0=hdf63c60_1
  - libtiff=4.0.9=he85c1e1_2
  - libuuid=1.0.3=h1bed415_2
  - libxcb=1.13=h1bed415_1
  - libxml2=2.9.8=h26e45fe_1
  - matplotlib=3.0.1=py36h5429711_0
  - mkl=2018.0.3=1
  - mkl_fft=1.0.6=py36h7dd41cf_0
  - mkl_random=1.0.1=py36h4414c95_1
  - msgpack-numpy=0.4.3.2=py36_0
  - msgpack-python=0.5.6=py36h6bb024c_1
  - murmurhash=1.0.1=py36he6710b0_0
  - ncurses=6.1=hf484d3e_0
  - ninja=1.8.2=py36h6bb024c_1
  - numexpr=2.6.8=py36hd89afb7_0
  - numpy=1.15.4=py36h1d66e8a_0
  - numpy-base=1.15.4=py36h81de0dd_0
  - olefile=0.46=py36_0
  - openssl=1.0.2p=h14c3975_0
  - pandas=0.23.4=py36h04863e7_0
  - pcre=8.42=h439df22_0
  - pillow=5.3.0=py36h34e0f95_0
  - pip=18.1=py36_0
  - plac=0.9.6=py36_0
  - preshed=2.0.1=py36he6710b0_0
  - pycparser=2.19=py36_0
  - pyopenssl=18.0.0=py36_0
  - pyparsing=2.3.0=py36_0
  - pyqt=5.9.2=py36h05f1152_2
  - pysocks=1.6.8=py36_0
  - python=3.6.6=h6e4f718_2
  - python-dateutil=2.7.5=py36_0
  - pytz=2018.7=py36_0
  - pyyaml=3.13=py36h14c3975_0
  - qt=5.9.6=h8703b6f_2
  - readline=7.0=h7b6447c_5
  - regex=2018.08.29=py36h7b6447c_0
  - requests=2.20.1=py36_0
  - scipy=1.1.0=py36hfa4b5c9_1
  - setuptools=40.6.2=py36_0
  - sip=4.19.8=py36hf484d3e_0
  - six=1.11.0=py36_1
  - spacy=2.0.16=py36h962f231_0
  - sqlite=3.25.3=h7b6447c_0
  - thinc=6.12.0=py36h4989274_0
  - tk=8.6.8=hbc83047_0
  - toolz=0.9.0=py36_0
  - tornado=5.1.1=py36h7b6447c_0
  - tqdm=4.28.1=py36h28b3542_0
  - typing=3.6.4=py36_0
  - ujson=1.35=py36h14c3975_0
  - urllib3=1.23=py36_0
  - wheel=0.32.3=py36_0
  - wrapt=1.10.11=py36h14c3975_2
  - xz=5.2.4=h14c3975_4
  - yaml=0.1.7=had09818_2
  - zlib=1.2.11=h7b6447c_3
  - dataclasses=0.6=py_0
  - fastai=1.0.28=py_1
  - fastprogress=0.1.15=py_0
  - torchvision-nightly=0.2.1=py_0
  - cuda92=1.0=0
  - pytorch-nightly=1.0.0.dev20181127=py3.6_cuda9.2.148_cudnn7.4.1_0
  - pip:
    - msgpack==0.5.6
    - torch==1.0.0.dev20181127
    - torchvision==0.2.1
prefix: /home/ubuntu/anaconda3/envs/vision

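The GPU variant pins cuda92 and a CUDA 9.2 build of pytorch-nightly, so it assumes a machine with a CUDA 9.2-capable driver; it can be created the same way with "conda env create -f environment.yml". The hard-coded "prefix:" points at the committer's machine, so passing "-n vision" (or deleting the prefix line) keeps the environment in the local conda directory. Note that both files declare "name: vision", so only one of the two environments can exist under that name at a time.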
examples/elo.py

Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from torch.utils.data import Dataset


def preprocess_transaction(new_trans: pd.DataFrame, merchants: pd.DataFrame):
    """Encode the transaction and merchant frames, then left-join them on merchant_id."""
    # Encode each year_week pair as a single categorical period code.
    new_trans['purchase_period'] = new_trans['purchase_date'].dt.year.astype(str).str.cat(
        new_trans['purchase_date'].dt.week.astype(str), sep="_")
    new_trans['purchase_period'] = new_trans['purchase_period'].astype('category').cat.codes
    new_trans['purchase_month'] = new_trans['purchase_date'].dt.month

    new_trans = pd.get_dummies(new_trans, columns=['category_2', 'category_3'])
    for col in ['authorized_flag', 'category_1']:
        new_trans[col] = new_trans[col].map({'Y': 1, 'N': 0})

    drop_columns = ['city_id', 'state_id', 'subsector_id', 'merchant_group_id',
                    'merchant_category_id', 'category_2', 'active_months_lag3',
                    'active_months_lag6', 'active_months_lag12']
    merchants.drop(drop_columns, axis=1, inplace=True)
    for col in ['category_1', 'category_4']:
        merchants[col] = merchants[col].map({'Y': 1, 'N': 0})
    merchants = pd.get_dummies(merchants, columns=['most_recent_sales_range',
                                                   'most_recent_purchases_range'])

    return pd.merge(new_trans, merchants, on="merchant_id", how="left", suffixes=('_nt', '_mc'))


def reduce_mem_usage(df, verbose=True):
    """Downcast each numeric column to the smallest dtype that can hold its value range."""
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


def train_lgbm(train: pd.DataFrame, test: pd.DataFrame, target, features, categorical_feats):
    """Train a LightGBM regressor with 5-fold CV; return OOF and averaged test predictions."""
    param = {'num_leaves': 100,
             'min_data_in_leaf': 30,
             'objective': 'regression',
             'max_depth': 6,
             'learning_rate': 0.005,
             'min_child_samples': 20,
             'boosting': 'gbdt',
             'feature_fraction': 0.9,
             'bagging_freq': 1,
             'bagging_fraction': 0.9,
             'bagging_seed': 11,
             'metric': 'rmse',
             'lambda_l1': 0.1,
             'verbosity': -1}
    folds = KFold(n_splits=5, shuffle=True, random_state=15)
    oof = np.zeros(len(train))
    predictions = np.zeros(len(test))
    feature_importance_df = pd.DataFrame()

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
        print("fold n°{}".format(fold_))
        trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx],
                               categorical_feature=categorical_feats)
        val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx],
                               categorical_feature=categorical_feats)

        num_round = 10000
        clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data],
                        verbose_eval=100, early_stopping_rounds=200)
        oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = features
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        # Average the per-fold test predictions.
        predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

    print("CV score: {:<8.5f}".format(mean_squared_error(oof, target) ** 0.5))
    return oof, predictions, feature_importance_df


def create_submission(test: pd.DataFrame, predictions):
    """Write a Kaggle-style submission file with card_id and the predicted target."""
    sub_df = pd.DataFrame({"card_id": test["card_id"].values})
    sub_df["target"] = predictions
    sub_df.to_csv("submit.csv", index=False)


def read_data(input_file: str):
    """Read a train/test CSV and add the days elapsed since first_active_month."""
    df = pd.read_csv(input_file, parse_dates=['first_active_month'])
    # Datetime64 subtraction yields a timedelta series with a working .dt accessor.
    df['elapsed_time'] = (pd.Timestamp('2018-02-01') - df['first_active_month']).dt.days
    return df


class SampleDataset(Dataset):
    """Wraps the training frame as a torch Dataset of card_id / feature-vector pairs."""

    def __init__(self, train_df: pd.DataFrame, trans_df: pd.DataFrame):
        self.train_df: pd.DataFrame = train_df
        self.trans_df: pd.DataFrame = trans_df

    def __len__(self):
        return self.train_df.shape[0]

    def __getitem__(self, idx):
        card_id = self.train_df.iloc[idx, 0]
        card_features = self.train_df.iloc[idx, 1:].values

        return {
            'card_id': card_id,
            'features': card_features
        }

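elo.py defines its helpers but no driver. Below is a minimal sketch of how they might be chained, under assumptions not in this commit: Elo competition CSVs under data/ with a target column, and the competition's feature_1..feature_3 columns used as categoricals.

# Hypothetical driver for examples/elo.py; paths, column names, and the
# feature selection below are assumptions, not part of the commit.
train = reduce_mem_usage(read_data('data/train.csv'))
test = reduce_mem_usage(read_data('data/test.csv'))

target = train['target']
features = [c for c in train.columns
            if c not in ('card_id', 'first_active_month', 'target')]
categorical_feats = [c for c in features if c.startswith('feature_')]

oof, predictions, importances = train_lgbm(train, test, target, features,
                                           categorical_feats)
create_submission(test, predictions)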
examples/news_groups.py

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
import os
import sys

import pandas as pd

from fastai.imports.core import *
from fastai.text.data import TextLMDataBunch, TextClasDataBunch
from fastai.text.transform import Vocab

sys.path.append("../vision")
from vision.datasets import *
from vision.text.transform import TextTokenizer

URI = 'https://s3.amazonaws.com/datasart-ds/20_newsgroup.tgz'
TEXT_DATA_DIR = "./data/20_newsgroup"


def parse_text_data():
    """Walk the 20-newsgroups directory tree and collect texts with numeric labels."""
    texts = []  # list of text samples
    labels_index = {}  # dictionary mapping label name to numeric id
    labels = []  # list of label ids
    for name in sorted(os.listdir(TEXT_DATA_DIR)):
        path = os.path.join(TEXT_DATA_DIR, name)
        if os.path.isdir(path):
            label_id = len(labels_index)
            labels_index[name] = label_id
            for fname in sorted(os.listdir(path)):
                if fname.isdigit():
                    fpath = os.path.join(path, fname)
                    with open(fpath, encoding='latin-1') as f:
                        t = f.read()
                    i = t.find('\n\n')  # skip the message header
                    if i > 0:
                        t = t[i:]
                    texts.append(t)
                    labels.append(label_id)

    print('Found %s texts.' % len(texts))
    df: pd.DataFrame = pd.DataFrame.from_dict({'text': texts, 'label': labels})
    df.to_csv('./data/20_newsgroup.csv', index=False)
    return texts, labels, labels_index


def create_data_bunch():
    """
    A data bunch includes:
    - itos object
    - train_texts
    - train_labels
    - val_texts
    - val_labels
    - test_texts
    - test_labels
    :return:
    """
    pass


def tokenizer(texts):
    """Tokenize a list of raw texts with the project's English tokenizer."""
    tok = TextTokenizer('en')
    return tok.process_all(texts)


if __name__ == "__main__":
    # 1. Download data
    # untar_data(URI)

    # 2. Read data and save in a flat format: text, label
    # texts, labels, labels_index = parse_text_data()

    # 3. Tokenize text to create the vocabulary
    df = pd.read_csv('./data/20_newsgroup.csv')

    tokens = tokenizer(df[:10]['text'].tolist())
    vocab = Vocab.create(tokens, max_vocab=1000, min_freq=2)
    print(vocab.itos)
    print(vocab.stoi)

    # 4. Create an embedding matrix from pretrained word vectors

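The main block stops after printing the vocabulary. As a sketch of the natural next step, assuming fastai 1.0's Vocab API (numericalize/textify), which the commit builds but does not yet call:

# Hypothetical continuation: map the tokenized texts to ids with the Vocab
# built above, then map a few ids back to text as a round-trip check.
ids = [vocab.numericalize(t) for t in tokens]
print(ids[0][:10])
print(vocab.textify(ids[0][:10]))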