Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimizer merge #30

Closed
wants to merge 32 commits into from
Closed
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
3eed6df
ex1
abbymark Dec 4, 2021
0d65645
Merge branch 'baseline_dataloader' into amc_private
abbymark Dec 4, 2021
28a3829
backup for gpu problem
abbymark Dec 4, 2021
b3e8fee
title_generator
abbymark Dec 4, 2021
252737e
benchmark setup
abbymark Dec 5, 2021
89196ea
dynamic quantization
abbymark Dec 5, 2021
5bcac90
quantization
abbymark Dec 5, 2021
f411bc5
quantization continue
abbymark Dec 5, 2021
950fe8a
half quantization fix
abbymark Dec 5, 2021
9d7e262
Merge pull request #23 from boostcampaitech2/baseline_dataloader
changyong93 Dec 6, 2021
be94a63
kd continue
abbymark Dec 7, 2021
04ab2e4
kd continued
abbymark Dec 7, 2021
947426c
basic kd
abbymark Dec 7, 2021
9df9d1d
merge with main
abbymark Dec 7, 2021
4564c48
cleaning test
abbymark Dec 7, 2021
0c7a3bc
cleaning
abbymark Dec 7, 2021
e774b8e
기타 오류 수정
abbymark Dec 8, 2021
b04a9cf
오류 수정
abbymark Dec 9, 2021
b7447bc
dynamic quantization test fixed
abbymark Dec 9, 2021
ab8a928
tiny distillation
abbymark Dec 11, 2021
19d33ec
modeling
abbymark Dec 12, 2021
b83cc0e
refine tiny distillation
abbymark Dec 13, 2021
fe53aa1
기타
abbymark Dec 13, 2021
abc5cc1
Merge branch 'dev_optimizer' into optimizer_merge
abbymark Dec 13, 2021
a259c6b
quantization 수정
abbymark Dec 13, 2021
4755278
moved for merge
abbymark Dec 13, 2021
f96d0b5
Merge branch 'dev_optimizer' into optimizer_merge
abbymark Dec 13, 2021
8ba74dc
test
abbymark Dec 13, 2021
093cd06
Merge branch 'dev_optimizer' into optimizer_merge
abbymark Dec 13, 2021
71e0bb0
Merge branch 'dev' into optimizer_merge
abbymark Dec 13, 2021
ee4ae01
distillation method added to train
abbymark Dec 13, 2021
3e91aef
argument fix
abbymark Dec 13, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
__pycache__/**
data/**
**/wandb/**
.env
amc_/*/**
args/__pycache__
wandb/**
model/**
Expand Down
75 changes: 75 additions & 0 deletions performanceBenchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import numpy as np
import torch

from pathlib import Path
from time import perf_counter

from rouge import compute


class PerformanceBenchmark:
    """Benchmark a summarization pipeline for ROUGE, serialized size and latency.

    Attributes:
        pipeline: Callable summarization pipeline. It is called with either a
            list of texts or a single text, and ``pipeline.model`` is read
            when measuring the model size.
        dataset: Mapping-like dataset indexed with the ``'text'`` and
            ``'title'`` columns.
        tokenizer: Tokenizer forwarded to ``rouge.compute``.
        optim_type: Label used as the top-level key of ``run_benchmark``'s
            result.
    """

    def __init__(self, pipeline, dataset, tokenizer, optim_type='base line'):
        self.pipeline = pipeline
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.optim_type = optim_type

    def compute_rouge(self):
        """Summarize ``dataset['text']`` and score it against ``dataset['title']``.

        Returns:
            dict mapping each key returned by ``rouge.compute`` to its mid
            f-measure, scaled to percent and rounded to 4 decimals.
        """
        predictions = self.pipeline(self.dataset['text'])
        references = self.dataset['title']

        summaries = [item['summary_text'] for item in predictions]
        rouge_score = compute(summaries, references, self.tokenizer)

        # Keep only the mid f-measure of each metric (the original note lists
        # rouge1, rouge2, rougeL and rougeLsum).
        rouge_scores = {
            name: round(score.mid.fmeasure * 100, 4)
            for name, score in rouge_score.items()
        }

        print("====ROUGE score====")
        print(rouge_scores)
        return rouge_scores

    def compute_size(self):
        """Measure the on-disk size of the model's ``state_dict``.

        The state dict is serialized into a private temporary directory so an
        existing ``model.pt`` in the working directory is never overwritten or
        deleted, and the temporary file is removed even if saving fails.

        Returns:
            dict with the single key ``'size_mb'`` (size in MiB).
        """
        import tempfile

        state_dict = self.pipeline.model.state_dict()
        with tempfile.TemporaryDirectory() as tmp_dir:
            path = Path(tmp_dir) / "model.pt"
            torch.save(state_dict, path)
            size_mb = path.stat().st_size / (1024 * 1024)

        print(f"Model size (MB) = {size_mb}")
        return {'size_mb': size_mb}

    def compute_time(self, query='최근 대부분의 범죄에 디지털 매체가 사용되면서 디지털 데이터는 필수 조사 대상이 되었다. 하지만 디지털 데이터는 비교적 쉽게 삭제 및변조가 가능하다. 따라서 디지털 증거 획득을 위해 삭제된 데이터의 복구가 필요하며, 파일 카빙은 컴퓨터 포렌식 조사에서 증거를 획득할 수있는 중요한 요소이다. 하지만 현재 사용되는 파일 카빙 도구들은 포렌식 조사를 위한 데이터의 선별을 고려하지 않고 있다. 또 기존의 파일카빙 기법들은 파일의 일부 영역이 덮어써지거나 조각날 경우 복구가 불가능한 단점이 있다. 따라서 본 논문에서는 포렌식 조사시 유용한 정보를 획득할 수 있는 파일을 제안하고, 기존의 파일 카빙 기법보다 효과적으로 데이터를 복구할 수 있는 레코드 파일 카빙 기법을 제시한다.', n_warmup=10, n_runs=100):
        """Time repeated pipeline calls on a single query.

        Args:
            query: Text passed to the pipeline on every call.
            n_warmup: Number of untimed calls made before measuring.
            n_runs: Number of timed calls; must be at least 1.

        Returns:
            dict with ``'time_avg_ms'`` and ``'time_std_ms'``.

        Raises:
            ValueError: If ``n_runs`` is smaller than 1.
        """
        if n_runs < 1:
            raise ValueError("n_runs must be at least 1")

        # Warmup
        for _ in range(n_warmup):
            self.pipeline(query)

        # Timing
        times = []
        for _ in range(n_runs):
            start_time = perf_counter()
            self.pipeline(query)
            times.append(perf_counter() - start_time)

        # Compute statistics
        time_avg_ms = 1000 * np.mean(times)
        time_std_ms = 1000 * np.std(times)

        print(f"Average time took(ms) {time_avg_ms:.2f} +/- {time_std_ms:.2f}")
        return {"time_avg_ms": time_avg_ms, "time_std_ms": time_std_ms}

    def run_benchmark(self):
        """Run the size, latency and ROUGE measurements in one call.

        Convenience entry point meant for notebook use, where calling each
        ``compute_*`` method separately is cumbersome.

        Returns:
            dict ``{optim_type: metrics}`` where ``metrics`` merges the
            results of ``compute_size``, ``compute_time`` and
            ``compute_rouge``.
        """
        metrics = {}
        metrics[self.optim_type] = self.compute_size()
        metrics[self.optim_type].update(self.compute_time())
        metrics[self.optim_type].update(self.compute_rouge())
        return metrics
90 changes: 90 additions & 0 deletions performance_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import datasets
from dotenv import load_dotenv
import transformers
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
import torch

import os
import argparse

from performanceBenchmark import PerformanceBenchmark


load_dotenv(verbose=True)

def performance_test(
    *,
    check_point='gogamza/kobart-summarization',
    test_dataset='metamong1/summarization_paper',
    test_dataset_size=100,
    cpu_flag=False,
    test_categories='rouge,time,size',
    model=None,
    seed=42,
    args=None
):
    """Benchmark a summarization model on a sample of a validation split.

    Args:
        check_point: Checkpoint used for the tokenizer, and for the model
            when ``model`` is not given.
        test_dataset: Dataset name passed to ``datasets.load_dataset``.
        test_dataset_size: Maximum number of validation samples to use.
        cpu_flag: Force CPU execution even when CUDA is available.
        test_categories: Comma-separated subset of ``rouge``, ``size``,
            ``time`` selecting which measurements to run.
        model: Already-built model to benchmark instead of loading
            ``check_point``.
        seed: Seed for shuffling the validation split.
        args: Optional namespace; when given, its ``check_point``,
            ``test_dataset``, ``test_dataset_size``, ``cpu_flag`` and
            ``test_categories`` attributes override the keyword arguments.

    Returns:
        dict merging the results of every measurement that was run.
    """
    # Basic settings
    api_token = os.getenv('HF_DATASET_API_TOKEN')
    if args is not None:
        check_point = args.check_point  # 'kobart-summarization-finetuned-paper-sample-size-1000/checkpoint-1000'
        test_dataset = args.test_dataset
        test_dataset_size = args.test_dataset_size
        cpu_flag = args.cpu_flag
        test_categories = args.test_categories

    use_gpu = torch.cuda.is_available() and not cpu_flag
    device = torch.device('cuda:0' if use_gpu else 'cpu')

    # Prepare the dataset: shuffle, keep texts shorter than 500 characters,
    # then take at most `test_dataset_size` rows. The count is capped because
    # fewer rows than requested may survive the length filter.
    dataset = datasets.load_dataset(test_dataset, use_auth_token=api_token)
    candidates = dataset['validation'].shuffle(seed=seed).filter(lambda x: len(x['text']) < 500)
    sample_count = min(test_dataset_size, len(candidates))
    samples = candidates.select(range(sample_count))

    # Prepare the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(check_point)

    # Prepare the model
    if model is None:
        model = AutoModelForSeq2SeqLM.from_pretrained(check_point)
    model = model.to(device)

    # Prepare the pipeline that wraps the model
    summarizer = pipeline(
        'summarization',
        model=model,
        tokenizer=tokenizer,
        device=0 if use_gpu else -1,
    )

    # Prepare the benchmark
    performance_benchmark = PerformanceBenchmark(summarizer, samples, tokenizer, 'baseline')

    # Run the requested measurements; tolerate spaces around the commas.
    categories = {name.strip() for name in test_categories.split(',')}
    results = {}
    if 'rouge' in categories:
        results.update(performance_benchmark.compute_rouge())

    if 'size' in categories:
        results.update(performance_benchmark.compute_size())

    if 'time' in categories:
        results.update(performance_benchmark.compute_time())

    return results

def main(args):
    """Run ``performance_test`` with options taken from the parsed namespace."""
    performance_test(args=args)

if __name__ == '__main__':
    # Command-line entry point: declare every option in one table, register
    # them in order, then hand the parsed namespace to main().
    cli_options = (
        ('--check_point', dict(type=str, default='gogamza/kobart-summarization', help='model checkpoint (default: gogamza/kobart-summarization)')),
        ('--test_dataset', dict(type=str, default='metamong1/summarization_paper', help='test dataset (default: metamong1/summarization_paper)')),
        ('--test_dataset_size', dict(type=int, default=100, help='test dataset size (defualt: 100)')),
        ('--cpu_flag', dict(action='store_true', help='use cpu (default: gpu)')),
        ('--test_categories', dict(type=str, default='rouge,time,size', help='test categories seperated by , ex: time,size,rouge (defualt: rouge,time,size)')),
    )

    parser = argparse.ArgumentParser()
    for flag, options in cli_options:
        parser.add_argument(flag, **options)

    args = parser.parse_args()

    main(args)
110 changes: 110 additions & 0 deletions quantization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import torch
import torch.nn as nn
from torch.quantization import quantize_dynamic
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM

import argparse

import performance_test

def dynamic_quantization(
    *,
    check_point='gogamza/kobart-summarization',
    test_dataset='metamong1/summarization_paper',
    test_dataset_size=100,
    test_categories='rouge,time,size',
    model=None,
    test=True,
):
    """Dynamically quantize the ``nn.Linear`` layers of a model to int8.

    Args:
        check_point: Checkpoint to load when ``model`` is not given; also
            forwarded to the benchmark so its tokenizer matches the model.
        test_dataset: Dataset name forwarded to the benchmark.
        test_dataset_size: Sample count forwarded to the benchmark.
        test_categories: Comma-separated measurements forwarded to the
            benchmark.
        model: Model to quantize; takes precedence over ``check_point``.
        test: When true, benchmark the quantized model.

    Returns:
        The quantized model returned by ``quantize_dynamic``.

    Raises:
        ValueError: If neither ``model`` nor ``check_point`` is provided.
    """
    if model is None:
        if not check_point:
            raise ValueError('either model or check_point must be provided')
        model = AutoModelForSeq2SeqLM.from_pretrained(check_point)

    model_quantized = quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)

    if test:
        test_kwargs = dict(
            test_dataset=test_dataset,
            test_dataset_size=test_dataset_size,
            # The quantized model is always benchmarked on CPU.
            # NOTE(review): presumably because int8 dynamic kernels are
            # CPU-only -- confirm against the PyTorch quantization docs.
            cpu_flag=True,
            test_categories=test_categories,
            model=model_quantized,
        )
        if check_point:
            test_kwargs['check_point'] = check_point
        performance_test.performance_test(**test_kwargs)

    return model_quantized


def half_quantization(
    *,
    check_point='gogamza/kobart-summarization',
    test_dataset='metamong1/summarization_paper',
    test_dataset_size=100,
    cpu_flag=False,
    test_categories='rouge,time,size',
    model=None,
    test=True
):
    """Convert a model to half precision, keeping ``BatchNorm2d`` in float32.

    Args:
        check_point: Checkpoint to load when ``model`` is not given; also
            forwarded to the benchmark so its tokenizer matches the model.
        test_dataset: Dataset name forwarded to the benchmark.
        test_dataset_size: Sample count forwarded to the benchmark.
        cpu_flag: Forwarded to the benchmark to force CPU execution.
        test_categories: Comma-separated measurements forwarded to the
            benchmark.
        model: Model to convert in place; takes precedence over
            ``check_point``.
        test: When true, benchmark the converted model.

    Returns:
        The converted model (the same object as ``model`` when one was given).

    Raises:
        ValueError: If neither ``model`` nor ``check_point`` is provided.
    """
    if model is None:
        if not check_point:
            raise ValueError('either model or check_point must be provided')
        model = AutoModelForSeq2SeqLM.from_pretrained(check_point)

    model.half()
    # Cast every BatchNorm2d layer back to float32 after the half() call.
    for layer in model.modules():
        if isinstance(layer, nn.BatchNorm2d):
            layer.float()

    if test:
        test_kwargs = dict(
            test_dataset=test_dataset,
            test_dataset_size=test_dataset_size,
            # Honor the caller's device choice instead of always forcing CPU.
            cpu_flag=cpu_flag,
            test_categories=test_categories,
            model=model,
        )
        if check_point:
            test_kwargs['check_point'] = check_point
        performance_test.performance_test(**test_kwargs)

    return model


def main(args):
    """Quantize a model as selected on the command line and optionally save it.

    Args:
        args: Parsed namespace providing ``quantization_type``,
            ``check_point``, ``test_dataset``, ``test_dataset_size``,
            ``cpu_flag``, ``test_categories``, ``no_test_flag`` and
            ``save_dir``.

    Raises:
        ValueError: If ``args.quantization_type`` is not a supported type.
    """
    supported_types = ('half_quantization', 'dynamic_quantization')
    if args.quantization_type not in supported_types:
        raise ValueError(
            f'unknown quantization_type {args.quantization_type!r}; '
            f'expected one of {supported_types}'
        )

    shared_options = dict(
        check_point=args.check_point,
        test_dataset=args.test_dataset,
        test_dataset_size=args.test_dataset_size,
        test_categories=args.test_categories,
        # `no_test_flag` is a store_false option: it is True unless the flag
        # was passed, so it can be used directly as the `test` switch.
        test=args.no_test_flag,
    )

    if args.quantization_type == 'half_quantization':
        model = half_quantization(cpu_flag=args.cpu_flag, **shared_options)
    else:
        # dynamic_quantization has no cpu_flag parameter.
        model = dynamic_quantization(**shared_options)

    if args.save_dir:
        torch.save(model.state_dict(), args.save_dir)



if __name__ == '__main__':
    # Command-line entry point for quantizing (and optionally benchmarking
    # and saving) a summarization model.
    parser = argparse.ArgumentParser()

    # `choices` rejects a mistyped quantization type at parse time.
    parser.add_argument('--quantization_type', type=str, default='half_quantization', choices=['half_quantization', 'dynamic_quantization'], help='quantization type. ex: half_quantization, dynamic_quantization (default: half_quantization)')
    parser.add_argument('--check_point', type=str, default='gogamza/kobart-summarization', help='model checkpoint (default: gogamza/kobart-summarization)')
    parser.add_argument('--test_dataset', type=str, default='metamong1/summarization_paper', help='test dataset (default: metamong1/summarization_paper)')
    parser.add_argument('--test_dataset_size', type=int, default=100, help='test dataset size (defualt: 100)')
    parser.add_argument('--cpu_flag', action='store_true', help='use cpu (default: gpu)')
    parser.add_argument('--test_categories', type=str, default='rouge,time,size', help='test categories seperated by , ex: time,size,rouge (defualt: rouge,time,size)')
    # store_false: the attribute is True by default and becomes False when the
    # flag is passed, so passing the flag disables the performance test.
    parser.add_argument('--no_test_flag', action='store_false', help='skip the performance test (default: the test is run)')
    parser.add_argument('--save_dir', type=str, default='', help='save model directory (default: "" ')

    args = parser.parse_args()

    main(args)