diff --git a/.jenkins/build.sh b/.jenkins/build.sh deleted file mode 100755 index d8f9fc28c..000000000 --- a/.jenkins/build.sh +++ /dev/null @@ -1,192 +0,0 @@ -set -ex - -if [[ "$COMMIT_SOURCE" == master ]]; then - export BUCKET_NAME=pytorch-tutorial-build-master -else - export BUCKET_NAME=pytorch-tutorial-build-pull-request -fi - -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" - -sudo apt-get update -sudo apt-get install -y --no-install-recommends unzip p7zip-full sox libsox-dev libsox-fmt-all rsync - -export PATH=/opt/conda/bin:$PATH -rm -rf src -pip install -r $DIR/../requirements.txt - -export PATH=/opt/conda/bin:$PATH -pip install sphinx==1.8.2 pandas - -# For Tensorboard. Until 1.14 moves to the release channel. -pip install tb-nightly - -# Install two language tokenizers for Translation with TorchText tutorial -python -m spacy download en -python -m spacy download de - -# PyTorch Theme -rm -rf src -pip install -e git+git://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme -# pillow >= 4.2 will throw error when trying to write mode RGBA as JPEG, -# this is a workaround to the issue. -pip install sphinx-gallery==0.3.1 tqdm matplotlib ipython pillow==4.1.1 - -aws configure set default.s3.multipart_threshold 5120MB - -# Decide whether to parallelize tutorial builds, based on $JOB_BASE_NAME -export NUM_WORKERS=20 -if [[ "${JOB_BASE_NAME}" == *worker_* ]]; then - # Step 1: Remove runnable code from tutorials that are not supposed to be run - python $DIR/remove_runnable_code.py beginner_source/aws_distributed_training_tutorial.py beginner_source/aws_distributed_training_tutorial.py || true - # TODO: Fix bugs in these tutorials to make them runnable again - # python $DIR/remove_runnable_code.py beginner_source/audio_classifier_tutorial.py beginner_source/audio_classifier_tutorial.py || true - - # Step 2: Keep certain tutorials based on file count, and remove runnable code in all other tutorials - # IMPORTANT NOTE: We assume that each tutorial has a UNIQUE filename. 
- export WORKER_ID=$(echo "${JOB_BASE_NAME}" | tr -dc '0-9') - count=0 - FILES_TO_RUN=() - for filename in $(find beginner_source/ -name '*.py' -not -path '*/data/*'); do - if [ $(($count % $NUM_WORKERS)) != $WORKER_ID ]; then - echo "Removing runnable code from "$filename - python $DIR/remove_runnable_code.py $filename $filename - else - echo "Keeping "$filename - FILES_TO_RUN+=($(basename $filename .py)) - fi - count=$((count+1)) - done - for filename in $(find intermediate_source/ -name '*.py' -not -path '*/data/*'); do - if [ $(($count % $NUM_WORKERS)) != $WORKER_ID ]; then - echo "Removing runnable code from "$filename - python $DIR/remove_runnable_code.py $filename $filename - else - echo "Keeping "$filename - FILES_TO_RUN+=($(basename $filename .py)) - fi - count=$((count+1)) - done - for filename in $(find advanced_source/ -name '*.py' -not -path '*/data/*'); do - if [ $(($count % $NUM_WORKERS)) != $WORKER_ID ]; then - echo "Removing runnable code from "$filename - python $DIR/remove_runnable_code.py $filename $filename - else - echo "Keeping "$filename - FILES_TO_RUN+=($(basename $filename .py)) - fi - count=$((count+1)) - done - for filename in $(find recipes_source/ -name '*.py' -not -path '*/data/*'); do - if [ $(($count % $NUM_WORKERS)) != $WORKER_ID ]; then - echo "Removing runnable code from "$filename - python $DIR/remove_runnable_code.py $filename $filename - else - echo "Keeping "$filename - FILES_TO_RUN+=($(basename $filename .py)) - fi - count=$((count+1)) - done - echo "FILES_TO_RUN: " ${FILES_TO_RUN[@]} - - # Step 3: Run `make docs` to generate HTML files and static files for these tutorials - make docs - - # Step 4: If any of the generated files are not related the tutorial files we want to run, - # then we remove them - for filename in $(find docs/beginner docs/intermediate docs/advanced docs/recipes -name '*.html'); do - file_basename=$(basename $filename .html) - if [[ ! " ${FILES_TO_RUN[@]} " =~ " ${file_basename} " ]]; then - rm $filename - fi - done - for filename in $(find docs/beginner docs/intermediate docs/advanced docs/recipes -name '*.rst'); do - file_basename=$(basename $filename .rst) - if [[ ! " ${FILES_TO_RUN[@]} " =~ " ${file_basename} " ]]; then - rm $filename - fi - done - for filename in $(find docs/_downloads -name '*.py'); do - file_basename=$(basename $filename .py) - if [[ ! " ${FILES_TO_RUN[@]} " =~ " ${file_basename} " ]]; then - rm $filename - fi - done - for filename in $(find docs/_downloads -name '*.ipynb'); do - file_basename=$(basename $filename .ipynb) - if [[ ! " ${FILES_TO_RUN[@]} " =~ " ${file_basename} " ]]; then - rm $filename - fi - done - for filename in $(find docs/_sources/beginner docs/_sources/intermediate docs/_sources/advanced docs/_sources/recipes -name '*.rst.txt'); do - file_basename=$(basename $filename .rst.txt) - if [[ ! " ${FILES_TO_RUN[@]} " =~ " ${file_basename} " ]]; then - rm $filename - fi - done - for filename in $(find docs/.doctrees/beginner docs/.doctrees/intermediate docs/.doctrees/advanced docs/.doctrees/recipes -name '*.doctree'); do - file_basename=$(basename $filename .doctree) - if [[ ! 
" ${FILES_TO_RUN[@]} " =~ " ${file_basename} " ]]; then - rm $filename - fi - done - - # Step 5: Remove INVISIBLE_CODE_BLOCK from .html/.rst.txt/.ipynb/.py files - bash $DIR/remove_invisible_code_block_batch.sh docs - - # Step 6: Copy generated files to S3, tag with commit ID - 7z a worker_${WORKER_ID}.7z docs - aws s3 cp worker_${WORKER_ID}.7z s3://${BUCKET_NAME}/${COMMIT_ID}/worker_${WORKER_ID}.7z --acl public-read -elif [[ "${JOB_BASE_NAME}" == *manager ]]; then - # Step 1: Generate no-plot HTML pages for all tutorials - make html-noplot - cp -r _build/html docs - - # Step 2: Wait for all workers to finish - set +e - for ((worker_id=0;worker_id/dev/null && pwd )" - -# Remove INVISIBLE_CODE_BLOCK from .html/.rst/.rst.txt/.ipynb/.py files -for filename in $(find $BUILDDIR/beginner $BUILDDIR/intermediate $BUILDDIR/advanced -name '*.html'); do - echo "Removing INVISIBLE_CODE_BLOCK from " $filename - python $DIR/remove_invisible_code_block_from_html.py $filename $filename -done -for filename in $(find $BUILDDIR/_sources/beginner $BUILDDIR/_sources/intermediate $BUILDDIR/_sources/advanced -name '*.rst.txt'); do - echo "Removing INVISIBLE_CODE_BLOCK from " $filename - python $DIR/remove_invisible_code_block_from_rst_txt.py $filename $filename -done -for filename in $(find $BUILDDIR/_downloads -name '*.ipynb'); do - echo "Removing INVISIBLE_CODE_BLOCK from " $filename - python $DIR/remove_invisible_code_block_from_ipynb.py $filename $filename -done -for filename in $(find $BUILDDIR/_downloads -name '*.py'); do - echo "Removing INVISIBLE_CODE_BLOCK from " $filename - python $DIR/remove_invisible_code_block_from_py.py $filename $filename -done diff --git a/.jenkins/remove_invisible_code_block_from_html.py b/.jenkins/remove_invisible_code_block_from_html.py deleted file mode 100644 index 827b9802d..000000000 --- a/.jenkins/remove_invisible_code_block_from_html.py +++ /dev/null @@ -1,17 +0,0 @@ -import sys -from bs4 import BeautifulSoup - -html_file_path = sys.argv[1] -output_file_path = sys.argv[2] - -with open(html_file_path, 'r', encoding='utf-8') as html_file: - html = html_file.read() -html_soup = BeautifulSoup(html, 'html.parser') - -elems = html_soup.find_all("div", {"class": "highlight-default"}) -for elem in elems: - if "%%%%%%INVISIBLE_CODE_BLOCK%%%%%%" in str(elem): - elem.decompose() - -with open(output_file_path, "w", encoding='utf-8') as output_file: - output_file.write(str(html_soup)) diff --git a/.jenkins/remove_invisible_code_block_from_ipynb.py b/.jenkins/remove_invisible_code_block_from_ipynb.py deleted file mode 100644 index 69913efb0..000000000 --- a/.jenkins/remove_invisible_code_block_from_ipynb.py +++ /dev/null @@ -1,18 +0,0 @@ -import sys -from bs4 import BeautifulSoup - -ipynb_file_path = sys.argv[1] -output_file_path = sys.argv[2] - -with open(ipynb_file_path, 'r', encoding='utf-8') as ipynb_file: - ipynb_lines = ipynb_file.readlines() - -ipynb_out_lines = [] - -for line in ipynb_lines: - if not '%%%%%%INVISIBLE_CODE_BLOCK%%%%%%' in line: - ipynb_out_lines.append(line) - -with open(output_file_path, "w", encoding='utf-8') as output_file: - for line in ipynb_out_lines: - output_file.write(line) diff --git a/.jenkins/remove_invisible_code_block_from_py.py b/.jenkins/remove_invisible_code_block_from_py.py deleted file mode 100644 index d39e5f4bf..000000000 --- a/.jenkins/remove_invisible_code_block_from_py.py +++ /dev/null @@ -1,25 +0,0 @@ -import sys -from bs4 import BeautifulSoup - -py_file_path = sys.argv[1] -output_file_path = sys.argv[2] - -with open(py_file_path, 'r', 
encoding='utf-8') as py_file: - py_lines = py_file.readlines() - -py_out_lines = [] - -in_invisible_block = False -for line in py_lines: - if not in_invisible_block: - if '%%%%%%INVISIBLE_CODE_BLOCK%%%%%%' in line: - in_invisible_block = True - else: - py_out_lines.append(line) - else: - if '%%%%%%INVISIBLE_CODE_BLOCK%%%%%%' in line: - in_invisible_block = False - -with open(output_file_path, "w", encoding='utf-8') as output_file: - for line in py_out_lines: - output_file.write(line) diff --git a/.jenkins/remove_invisible_code_block_from_rst_txt.py b/.jenkins/remove_invisible_code_block_from_rst_txt.py deleted file mode 100644 index e6eb648e7..000000000 --- a/.jenkins/remove_invisible_code_block_from_rst_txt.py +++ /dev/null @@ -1,19 +0,0 @@ -import sys -from bs4 import BeautifulSoup - -rst_txt_file_path = sys.argv[1] -output_file_path = sys.argv[2] - -with open(rst_txt_file_path, 'r', encoding='utf-8') as rst_txt_file: - rst_txt = rst_txt_file.read() - -splits = rst_txt.split('.. code-block:: default\n\n\n # %%%%%%INVISIBLE_CODE_BLOCK%%%%%%\n') -if len(splits) == 2: - code_before_invisible_block = splits[0] - code_after_invisible_block = splits[1].split(' # %%%%%%INVISIBLE_CODE_BLOCK%%%%%%\n')[1] - rst_txt_out = code_before_invisible_block + code_after_invisible_block -else: - rst_txt_out = rst_txt - -with open(output_file_path, "w", encoding='utf-8') as output_file: - output_file.write(rst_txt_out) diff --git a/.jenkins/remove_runnable_code.py b/.jenkins/remove_runnable_code.py deleted file mode 100644 index 6a61cb656..000000000 --- a/.jenkins/remove_runnable_code.py +++ /dev/null @@ -1,47 +0,0 @@ -import sys - -STATE_IN_MULTILINE_COMMENT_BLOCK_DOUBLE_QUOTE = "STATE_IN_MULTILINE_COMMENT_BLOCK_DOUBLE_QUOTE" -STATE_IN_MULTILINE_COMMENT_BLOCK_SINGLE_QUOTE = "STATE_IN_MULTILINE_COMMENT_BLOCK_SINGLE_QUOTE" -STATE_NORMAL = "STATE_NORMAL" - -python_file_path = sys.argv[1] -output_file_path = sys.argv[2] - -with open(python_file_path, 'r', encoding='utf-8') as file: - lines = file.readlines() - ret_lines = [] - state = STATE_NORMAL - for line in lines: - if state == STATE_NORMAL: - if line.startswith('#'): - ret_lines.append(line) - state = STATE_NORMAL - elif line.startswith('"""') or line.startswith('r"""'): - ret_lines.append(line) - state = STATE_IN_MULTILINE_COMMENT_BLOCK_DOUBLE_QUOTE - elif line.startswith("'''") or line.startswith("r'''"): - ret_lines.append(line) - state = STATE_IN_MULTILINE_COMMENT_BLOCK_SINGLE_QUOTE - else: - ret_lines.append("\n") - state = STATE_NORMAL - elif state == STATE_IN_MULTILINE_COMMENT_BLOCK_DOUBLE_QUOTE: - if line.startswith('"""'): - ret_lines.append(line) - state = STATE_NORMAL - else: - ret_lines.append(line) - state = STATE_IN_MULTILINE_COMMENT_BLOCK_DOUBLE_QUOTE - elif state == STATE_IN_MULTILINE_COMMENT_BLOCK_SINGLE_QUOTE: - if line.startswith("'''"): - ret_lines.append(line) - state = STATE_NORMAL - else: - ret_lines.append(line) - state = STATE_IN_MULTILINE_COMMENT_BLOCK_SINGLE_QUOTE - -ret_lines.append("\n# %%%%%%RUNNABLE_CODE_REMOVED%%%%%%") - -with open(output_file_path, 'w', encoding='utf-8') as file: - for line in ret_lines: - file.write(line) diff --git a/.jenkins/replace_tutorial_html_content.py b/.jenkins/replace_tutorial_html_content.py deleted file mode 100644 index 587464cd9..000000000 --- a/.jenkins/replace_tutorial_html_content.py +++ /dev/null @@ -1,24 +0,0 @@ -import sys - -noplot_html_file_path = sys.argv[1] -hasplot_html_file_path = sys.argv[2] -output_html_file_path = sys.argv[3] - -from bs4 import BeautifulSoup -with 
open(noplot_html_file_path, 'r', encoding='utf-8') as noplot_html_file: - noplot_html = noplot_html_file.read() -with open(hasplot_html_file_path, 'r', encoding='utf-8') as hasplot_html_file: - hasplot_html = hasplot_html_file.read() - -noplot_html_soup = BeautifulSoup(noplot_html, 'html.parser') -elems = noplot_html_soup.find_all("div", {"class": "sphx-glr-example-title"}) -if len(elems) == 0: - print("No match found, not replacing HTML content in "+noplot_html_file_path) -elif len(elems) == 1: - print("Match found in "+noplot_html_file_path+". Replacing its content.") - elem = elems[0] - elem.replace_with(BeautifulSoup(hasplot_html, 'html.parser').find_all("div", {"class": "sphx-glr-example-title"})[0]) - with open(output_html_file_path, "w", encoding='utf-8') as output_html_file: - output_html_file.write(str(noplot_html_soup)) -else: - raise Exception("Found more than one match in "+noplot_html_file_path+". Aborting.") diff --git a/Makefile b/Makefile index d29691856..a865a545a 100644 --- a/Makefile +++ b/Makefile @@ -97,18 +97,10 @@ download: wget -N https://download.pytorch.org/models/mobilenet_v2-b0353104.pth -P $(DATADIR) cp $(DATADIR)/mobilenet_v2-b0353104.pth advanced_source/data/mobilenet_pretrained_float.pth - # Download dataset for advanced_source/static_quantization_tutorial.py - wget -N https://s3.amazonaws.com/pytorch-tutorial-assets/imagenet_1k.zip -P $(DATADIR) - unzip $(ZIPOPTS) $(DATADIR)/imagenet_1k.zip -d advanced_source/data/ - # Download model for prototype_source/graph_mode_static_quantization_tutorial.py wget -N https://download.pytorch.org/models/resnet18-5c106cde.pth -P $(DATADIR) cp $(DATADIR)/resnet18-5c106cde.pth prototype_source/data/resnet18_pretrained_float.pth - # Download dataset for prototype_source/graph_mode_static_quantization_tutorial.py - wget -N https://s3.amazonaws.com/pytorch-tutorial-assets/imagenet_1k.zip -P $(DATADIR) - unzip $(ZIPOPTS) $(DATADIR)/imagenet_1k.zip -d prototype_source/data/ - docs: make download make html diff --git a/README.md b/README.md index 5a753970d..301979114 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ PyTorch에서 제공하는 튜토리얼의 한국어 번역을 위한 저장소입니다.\ 번역의 결과물은 [https://tutorials.pytorch.kr](https://tutorials.pytorch.kr)에서 확인하실 수 있습니다. (번역을 진행하며 **불규칙적으로** 업데이트합니다.)\ -현재 번역 진행 중인 내용은 [#90 이슈](https://github.com/9bow/PyTorch-tutorials-kr/issues/90)를 참고해주세요. +현재 번역 진행 중인 내용은 [#210 이슈](https://github.com/9bow/PyTorch-tutorials-kr/issues/210)를 참고해주세요. ## 기여하기 @@ -15,20 +15,28 @@ PyTorch에서 제공하는 튜토리얼의 한국어 번역을 위한 저장소 2. 번역되지 않은 튜토리얼을 번역하는 기여 * [한국어 튜토리얼 사이트](http://tutorials.pytorch.kr/)에 아직 번역되지 않은 튜토리얼 번역하는 기여입니다. 3. 2로 번역된 문서를 리뷰하는 기여 :star: - * [본 저장소에 Pull Request된 튜토리얼 문서](https://github.com/9bow/PyTorch-tutorials-kr/pulls)를 리뷰하는 기여입니다. \ - (간절히 기다리는 기여입니다. :pray:) + * [본 저장소에 Pull Request된 튜토리얼 문서](https://github.com/9bow/PyTorch-tutorials-kr/pulls)의 번역이 적절한지 리뷰하는 기여입니다. \ + (많은 분들의 참여를 간절히 기다리고 있습니다. :pray:) 자세한 방법은 [기여하기 문서](CONTRIBUTING.md)를 참조해주세요. :) ## 원문 -현재 PyTorch v1.6 튜토리얼 번역이 진행 중입니다. ([pytorch/tutorials@bc9cac0](https://github.com/pytorch/tutorials/commit/bc9cac0a77512136d91d717e3c8f1e83165b196d) 기준) +현재 PyTorch v1.8.1 튜토리얼 번역이 진행 중입니다. ([pytorch/tutorials@e0938fb](https://github.com/pytorch/tutorials/commit/e0938fbd1faca724ee0448f6553f606c68b099e3) 기준) -최신의 튜토리얼(영어)은 [PyTorch tutorials](https://pytorch.org/tutorials) 및 [PyTorch tutorials 저장소](https://github.com/pytorch/tutorials)를 참고해주세요. 
+최신 버전의 튜토리얼(공식, 영어)은 [PyTorch tutorials 사이트](https://pytorch.org/tutorials) 및 [PyTorch tutorials 저장소](https://github.com/pytorch/tutorials)를 참고해주세요. ## 과거 버전 -### PyTorch 1.0 이전 (0.3 & 0.4) 버전 튜토리얼 보기 +### PyTorch v1.0 이상의 튜토리얼 보기 + +v1.0 이후 번역은 별도 저장소로 관리하지 않습니다. [이 저장소의 Release 메뉴](https://github.com/9bow/PyTorch-tutorials-kr/releases)를 확인해주세요. \ +`버전-base`(예. `1.6-base`)는 해당 버전을 시작할 때의 릴리즈이고, `버전-latest`(예. `1.6-latest`)는 해당 버전의 마지막 릴리즈입니다. + +해당 릴리즈의 문서를 내려받으신 후 빌드하시면 해당 버전의 문서를 확인하실 수 있습니다. \ +빌드 방법은 [기여하기 문서의 `2-5. (내 컴퓨터에서) 결과 확인하기`](https://github.com/9bow/PyTorch-tutorials-kr/blob/master/CONTRIBUTING.md#2-5-내-컴퓨터에서-결과-확인하기) 부분을 참고해주세요. + +### PyTorch v1.0 이전(v0.3 & v0.4)의 튜토리얼 보기 아래 링크에서 과거 버전의 튜토리얼 번역을 확인하실 수 있습니다. 현재는 번역이 이뤄지고 있지 않습니다. @@ -37,14 +45,6 @@ PyTorch에서 제공하는 튜토리얼의 한국어 번역을 위한 저장소 | 0.4.1 | [PyTorch-tutorials-kr-0.4](https://9bow.github.io/PyTorch-tutorials-kr-0.4) | [GitHub 저장소](https://github.com/9bow/PyTorch-tutorials-kr-0.4) | | 0.3.1 | [PyTorch-tutorials-kr-0.3.1](https://9bow.github.io/PyTorch-tutorials-kr-0.3.1) | [GitHub 저장소](https://github.com/9bow/PyTorch-tutorials-kr-0.3.1) | -### PyTorch 1.0 이상 버전 튜토리얼 보기 - -버전 1.0 이후 번역은 별도 저장소로 관리하지 않습니다. [이 저장소의 Release 메뉴](https://github.com/9bow/PyTorch-tutorials-kr/releases)를 확인해주세요. \ -`버전-base`(예. `1.4-base`)는 해당 버전을 시작할 때의 릴리즈이고, `버전-latest`(예. `1.4-latest`)는 해당 버전의 마지막 릴리즈입니다. - -해당 릴리즈의 문서를 내려받으신 후 빌드하시면 해당 버전의 문서를 확인하실 수 있습니다. \ -빌드 방법은 [기여하기 문서의 `2-5. (내 컴퓨터에서) 결과 확인하기`](https://github.com/9bow/PyTorch-tutorials-kr/blob/master/CONTRIBUTING.md#2-5-내-컴퓨터에서-결과-확인하기) 부분을 참고해주세요. - --- -This is a project to translate [pytorch/tutorials@bc9cac0](https://github.com/pytorch/tutorials/commit/bc9cac0a77512136d91d717e3c8f1e83165b196d) into Korean. -For the latest version, please visit to the [PyTorch tutorials repo](https://github.com/pytorch/tutorials). +This is a project to translate [pytorch/tutorials@e0938fb](https://github.com/pytorch/tutorials/commit/e0938fbd1faca724ee0448f6553f606c68b099e3) into Korean. +For the latest version, please visit to the [official PyTorch tutorials repo](https://github.com/pytorch/tutorials). 
diff --git a/_static/img/8_workers.png b/_static/img/8_workers.png new file mode 100644 index 000000000..9a51182eb Binary files /dev/null and b/_static/img/8_workers.png differ diff --git a/_static/img/basics/comp-graph.png b/_static/img/basics/comp-graph.png new file mode 100644 index 000000000..cfa6163d5 Binary files /dev/null and b/_static/img/basics/comp-graph.png differ diff --git a/_static/img/basics/fashion_mnist.png b/_static/img/basics/fashion_mnist.png new file mode 100644 index 000000000..213b1e1f1 Binary files /dev/null and b/_static/img/basics/fashion_mnist.png differ diff --git a/_static/img/basics/optimizationloops.png b/_static/img/basics/optimizationloops.png new file mode 100644 index 000000000..c43d83f27 Binary files /dev/null and b/_static/img/basics/optimizationloops.png differ diff --git a/_static/img/basics/typesdata.png b/_static/img/basics/typesdata.png new file mode 100644 index 000000000..5d0e0291e Binary files /dev/null and b/_static/img/basics/typesdata.png differ diff --git a/_static/img/dag_autograd.png b/_static/img/dag_autograd.png new file mode 100644 index 000000000..cdc50fed6 Binary files /dev/null and b/_static/img/dag_autograd.png differ diff --git a/_static/img/deeplabv3_android.png b/_static/img/deeplabv3_android.png new file mode 100644 index 000000000..e0a451be8 Binary files /dev/null and b/_static/img/deeplabv3_android.png differ diff --git a/_static/img/deeplabv3_android2.png b/_static/img/deeplabv3_android2.png new file mode 100644 index 000000000..0ae041479 Binary files /dev/null and b/_static/img/deeplabv3_android2.png differ diff --git a/_static/img/deeplabv3_ios.png b/_static/img/deeplabv3_ios.png new file mode 100644 index 000000000..c901179e1 Binary files /dev/null and b/_static/img/deeplabv3_ios.png differ diff --git a/_static/img/deeplabv3_ios2.png b/_static/img/deeplabv3_ios2.png new file mode 100644 index 000000000..3dc0073ca Binary files /dev/null and b/_static/img/deeplabv3_ios2.png differ diff --git a/_static/img/mario.gif b/_static/img/mario.gif new file mode 100644 index 000000000..95d8c0cb1 Binary files /dev/null and b/_static/img/mario.gif differ diff --git a/_static/img/mario_env.png b/_static/img/mario_env.png new file mode 100644 index 000000000..b6fc09c3c Binary files /dev/null and b/_static/img/mario_env.png differ diff --git a/_static/img/oneworker.png b/_static/img/oneworker.png new file mode 100644 index 000000000..255ec5848 Binary files /dev/null and b/_static/img/oneworker.png differ diff --git a/_static/img/ray-tune.png b/_static/img/ray-tune.png new file mode 100644 index 000000000..febd6de28 Binary files /dev/null and b/_static/img/ray-tune.png differ diff --git a/_static/img/thumbnails/cropped/Training-Transformer-Models-using-Distributed-Data-Parallel-and-Pipeline-Parallelism.png b/_static/img/thumbnails/cropped/Training-Transformer-Models-using-Distributed-Data-Parallel-and-Pipeline-Parallelism.png new file mode 100644 index 000000000..426a14d98 Binary files /dev/null and b/_static/img/thumbnails/cropped/Training-Transformer-Models-using-Distributed-Data-Parallel-and-Pipeline-Parallelism.png differ diff --git a/_static/img/thumbnails/cropped/Training-Transformer-models-using-Pipeline-Parallelism.png b/_static/img/thumbnails/cropped/Training-Transformer-models-using-Pipeline-Parallelism.png new file mode 100644 index 000000000..426a14d98 Binary files /dev/null and b/_static/img/thumbnails/cropped/Training-Transformer-models-using-Pipeline-Parallelism.png differ diff --git a/_static/img/thumbnails/cropped/amp.png 
b/_static/img/thumbnails/cropped/amp.png new file mode 100644 index 000000000..a6916ce56 Binary files /dev/null and b/_static/img/thumbnails/cropped/amp.png differ diff --git a/_static/img/thumbnails/cropped/generic-pytorch-logo.png b/_static/img/thumbnails/cropped/generic-pytorch-logo.png new file mode 100644 index 000000000..426a14d98 Binary files /dev/null and b/_static/img/thumbnails/cropped/generic-pytorch-logo.png differ diff --git a/_static/img/thumbnails/cropped/graph-mode-dynamic-bert.png b/_static/img/thumbnails/cropped/graph-mode-dynamic-bert.png new file mode 100644 index 000000000..34bbf8c7b Binary files /dev/null and b/_static/img/thumbnails/cropped/graph-mode-dynamic-bert.png differ diff --git a/_static/img/thumbnails/cropped/profile.png b/_static/img/thumbnails/cropped/profile.png new file mode 100644 index 000000000..372db8bbe Binary files /dev/null and b/_static/img/thumbnails/cropped/profile.png differ diff --git a/_static/img/thumbnails/cropped/torchaudio-speech.png b/_static/img/thumbnails/cropped/torchaudio-speech.png new file mode 100644 index 000000000..c874a6bb4 Binary files /dev/null and b/_static/img/thumbnails/cropped/torchaudio-speech.png differ diff --git a/_static/torchvision_finetuning_instance_segmentation.ipynb b/_static/torchvision_finetuning_instance_segmentation.ipynb index 79fef91f3..f4b58f7ec 100644 --- a/_static/torchvision_finetuning_instance_segmentation.ipynb +++ b/_static/torchvision_finetuning_instance_segmentation.ipynb @@ -1448,7 +1448,7 @@ " self.masks = list(sorted(os.listdir(os.path.join(root, \"PedMasks\"))))\n", "\n", " def __getitem__(self, idx):\n", - " # load images ad masks\n", + " # load images and masks\n", " img_path = os.path.join(self.root, \"PNGImages\", self.imgs[idx])\n", " mask_path = os.path.join(self.root, \"PedMasks\", self.masks[idx])\n", " img = Image.open(img_path).convert(\"RGB\")\n", diff --git a/_static/tv-training-code.py b/_static/tv-training-code.py index 116e9bdd2..5c1de33c7 100644 --- a/_static/tv-training-code.py +++ b/_static/tv-training-code.py @@ -25,7 +25,7 @@ def __init__(self, root, transforms): self.masks = list(sorted(os.listdir(os.path.join(root, "PedMasks")))) def __getitem__(self, idx): - # load images ad masks + # load images and masks img_path = os.path.join(self.root, "PNGImages", self.imgs[idx]) mask_path = os.path.join(self.root, "PedMasks", self.masks[idx]) img = Image.open(img_path).convert("RGB") @@ -160,6 +160,6 @@ def main(): evaluate(model, data_loader_test, device=device) print("That's it!") - + if __name__ == "__main__": main() diff --git a/_templates/layout.html b/_templates/layout.html index 09c664a23..c3a84da44 100644 --- a/_templates/layout.html +++ b/_templates/layout.html @@ -2,38 +2,47 @@ {% block footer %} {{ super() }} + - ga('create', 'UA-71919972-3', 'auto'); - ga('send', 'pageview'); + + + + {% endblock %} diff --git a/advanced_source/cpp_extension.rst b/advanced_source/cpp_extension.rst index 56b02dd18..96421c63b 100644 --- a/advanced_source/cpp_extension.rst +++ b/advanced_source/cpp_extension.rst @@ -115,13 +115,13 @@ PyTorch has no knowledge of the *algorithm* you are implementing. It knows only of the individual operations you use to compose your algorithm. As such, PyTorch must execute your operations individually, one after the other. Since each individual call to the implementation (or *kernel*) of an operation, which may -involve launch of a CUDA kernel, has a certain amount of overhead, this overhead -may become significant across many function calls. 
Furthermore, the Python -interpreter that is running our code can itself slow down our program. +involve the launch of a CUDA kernel, has a certain amount of overhead, this +overhead may become significant across many function calls. Furthermore, the +Python interpreter that is running our code can itself slow down our program. A definite method of speeding things up is therefore to rewrite parts in C++ (or CUDA) and *fuse* particular groups of operations. Fusing means combining the -implementations of many functions into a single functions, which profits from +implementations of many functions into a single function, which profits from fewer kernel launches as well as other optimizations we can perform with increased visibility of the global flow of data. @@ -313,7 +313,7 @@ Once you have your operation written in C++ and ATen, you can use pybind11 to bind your C++ functions or classes into Python in a very simple manner. Questions or issues you have about this part of PyTorch C++ extensions will largely be addressed by `pybind11 documentation -`_. +`_. For our extensions, the necessary binding code spans only four lines: @@ -326,7 +326,7 @@ For our extensions, the necessary binding code spans only four lines: One bit to note here is the macro ``TORCH_EXTENSION_NAME``. The torch extension build will define it as the name you give your extension in the ``setup.py`` -script. In this case, the value of ``TORCH_EXTENSION_NAME`` would be "lltm". +script. In this case, the value of ``TORCH_EXTENSION_NAME`` would be "lltm_cpp". This is to avoid having to maintain the name of the extension in two places (the build script and your C++ code), as a mismatch between the two can lead to nasty and hard to track issues. @@ -509,12 +509,12 @@ and with our new C++ version:: Forward: 349.335 us | Backward 443.523 us We can already see a significant speedup for the forward function (more than -30%). For the backward function a speedup is visible, albeit not major one. The -backward pass I wrote above was not particularly optimized and could definitely -be improved. Also, PyTorch's automatic differentiation engine can automatically -parallelize computation graphs, may use a more efficient flow of operations -overall, and is also implemented in C++, so it's expected to be fast. -Nevertheless, this is a good start. +30%). For the backward function, a speedup is visible, albeit not a major one. +The backward pass I wrote above was not particularly optimized and could +definitely be improved. Also, PyTorch's automatic differentiation engine can +automatically parallelize computation graphs, may use a more efficient flow of +operations overall, and is also implemented in C++, so it's expected to be +fast. Nevertheless, this is a good start. Performance on GPU Devices ************************** @@ -571,7 +571,7 @@ And C++/ATen:: That's a great overall speedup compared to non-CUDA code. However, we can pull even more performance out of our C++ code by writing custom CUDA kernels, which -we'll dive into soon. Before that, let's dicuss another way of building your C++ +we'll dive into soon. Before that, let's discuss another way of building your C++ extensions. JIT Compiling Extensions @@ -851,7 +851,7 @@ and ``Double``), you can use ``AT_DISPATCH_ALL_TYPES``. Note that we perform some operations with plain ATen. These operations will still run on the GPU, but using ATen's default implementations. 
This makes -sense, because ATen will use highly optimized routines for things like matrix +sense because ATen will use highly optimized routines for things like matrix multiplies (e.g. ``addmm``) or convolutions which would be much harder to implement and improve ourselves. @@ -903,7 +903,7 @@ You can see in the CUDA kernel that we work directly on pointers with the right type. Indeed, working directly with high level type agnostic tensors inside cuda kernels would be very inefficient. -However, this comes at a cost of ease of use and readibility, especially for +However, this comes at a cost of ease of use and readability, especially for highly dimensional data. In our example, we know for example that the contiguous ``gates`` tensor has 3 dimensions: @@ -920,7 +920,7 @@ arithmetic. gates.data()[n*3*state_size + row*state_size + column] -In addition to being verbose, this expression needs stride to be explicitely +In addition to being verbose, this expression needs stride to be explicitly known, and thus passed to the kernel function within its arguments. You can see that in the case of kernel functions accepting multiple tensors with different sizes you will end up with a very long list of arguments. @@ -1101,7 +1101,7 @@ on it: const int threads = 1024; const dim3 blocks((state_size + threads - 1) / threads, batch_size); - AT_DISPATCH_FLOATING_TYPES(X.type(), "lltm_forward_cuda", ([&] { + AT_DISPATCH_FLOATING_TYPES(X.type(), "lltm_backward_cuda", ([&] { lltm_cuda_backward_kernel<<>>( d_old_cell.packed_accessor32(), d_gates.packed_accessor32(), diff --git a/advanced_source/ddp_pipeline.py b/advanced_source/ddp_pipeline.py new file mode 100644 index 000000000..a3ac4b9d1 --- /dev/null +++ b/advanced_source/ddp_pipeline.py @@ -0,0 +1,523 @@ +""" +Training Transformer models using Distributed Data Parallel and Pipeline Parallelism +==================================================================================== + +**Author**: `Pritam Damania `_ + +This tutorial demonstrates how to train a large Transformer model across +multiple GPUs using `Distributed Data Parallel `__ and +`Pipeline Parallelism `__. This tutorial is an extension of the +`Sequence-to-Sequence Modeling with nn.Transformer and TorchText `__ tutorial +and scales up the same model to demonstrate how Distributed Data Parallel and +Pipeline Parallelism can be used to train Transformer models. + +Prerequisites: + + * `Pipeline Parallelism `__ + * `Sequence-to-Sequence Modeling with nn.Transformer and TorchText `__ + * `Getting Started with Distributed Data Parallel `__ +""" + + +###################################################################### +# Define the model +# ---------------- +# + +###################################################################### +# ``PositionalEncoding`` module injects some information about the +# relative or absolute position of the tokens in the sequence. The +# positional encodings have the same dimension as the embeddings so that +# the two can be summed. Here, we use ``sine`` and ``cosine`` functions of +# different frequencies. 
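For reference, the "different frequencies" referred to here are the sinusoidal encodings from the original Transformer paper ("Attention Is All You Need"); the ``div_term`` tensor in the implementation below is the ``1 / 10000^(2i/d_model)`` factor, computed through ``exp`` and ``log`` for numerical stability:

.. math::

   PE_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right), \qquad
   PE_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right)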
+ +import sys +import os +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +import tempfile +from torch.nn import TransformerEncoder, TransformerEncoderLayer + +class PositionalEncoding(nn.Module): + + def __init__(self, d_model, dropout=0.1, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0).transpose(0, 1) + self.register_buffer('pe', pe) + + def forward(self, x): + x = x + self.pe[:x.size(0), :] + return self.dropout(x) + + +###################################################################### +# In this tutorial, we will split a Transformer model across two GPUs and use +# pipeline parallelism to train the model. In addition to this, we use +# `Distributed Data Parallel `__ +# to train two replicas of this pipeline. We have one process driving a pipe across +# GPUs 0 and 1 and another process driving a pipe across GPUs 2 and 3. Both these +# processes then use Distributed Data Parallel to train the two replicas. The +# model is exactly the same model used in the `Sequence-to-Sequence Modeling with nn.Transformer and TorchText +# `__ tutorial, +# but is split into two stages. The largest number of parameters belong to the +# `nn.TransformerEncoder `__ layer. +# The `nn.TransformerEncoder `__ +# itself consists of ``nlayers`` of `nn.TransformerEncoderLayer `__. +# As a result, our focus is on ``nn.TransformerEncoder`` and we split the model +# such that half of the ``nn.TransformerEncoderLayer`` are on one GPU and the +# other half are on another. To do this, we pull out the ``Encoder`` and +# ``Decoder`` sections into seperate modules and then build an nn.Sequential +# representing the original Transformer module. 
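As a quick sanity check of the ``PositionalEncoding`` module defined above (a hypothetical snippet, not part of the original tutorial), note that it leaves the ``(seq_len, batch, d_model)`` shape of its input unchanged:

.. code-block:: python

    # illustrative only; assumes the PositionalEncoding class above is in scope
    pe = PositionalEncoding(d_model=16, dropout=0.0)
    x = torch.zeros(10, 2, 16)   # (seq_len, batch, d_model)
    print(pe(x).shape)           # torch.Size([10, 2, 16])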
+ + +if sys.platform == 'win32': + print('Windows platform is not supported for pipeline parallelism') + sys.exit(0) +if torch.cuda.device_count() < 4: + print('Need at least four GPU devices for this tutorial') + sys.exit(0) + +class Encoder(nn.Module): + def __init__(self, ntoken, ninp, dropout=0.5): + super(Encoder, self).__init__() + self.src_mask = None + self.pos_encoder = PositionalEncoding(ninp, dropout) + self.encoder = nn.Embedding(ntoken, ninp) + self.ninp = ninp + self.init_weights() + + def init_weights(self): + initrange = 0.1 + self.encoder.weight.data.uniform_(-initrange, initrange) + + def _generate_square_subsequent_mask(self, sz): + mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) + mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) + return mask + + def forward(self, src): + if self.src_mask is None or self.src_mask.size(0) != src.size(0): + device = src.device + mask = self._generate_square_subsequent_mask(src.size(0)).to(device) + self.src_mask = mask + + src = self.encoder(src) * math.sqrt(self.ninp) + return self.pos_encoder(src) + +class Decoder(nn.Module): + def __init__(self, ntoken, ninp): + super(Decoder, self).__init__() + self.decoder = nn.Linear(ninp, ntoken) + self.init_weights() + + def init_weights(self): + initrange = 0.1 + self.decoder.bias.data.zero_() + self.decoder.weight.data.uniform_(-initrange, initrange) + + def forward(self, inp): + return self.decoder(inp) + +###################################################################### +# Start multiple processes for training +# ------------------------------------- +# + + +###################################################################### +# We start two processes where each process drives its own pipeline across two +# GPUs. ``run_worker`` is executed for each process. + +def run_worker(rank, world_size): + + +###################################################################### +# Load and batch data +# ------------------- +# + + +###################################################################### +# The training process uses Wikitext-2 dataset from ``torchtext``. The +# vocab object is built based on the train dataset and is used to numericalize +# tokens into tensors. Starting from sequential data, the ``batchify()`` +# function arranges the dataset into columns, trimming off any tokens remaining +# after the data has been divided into batches of size ``batch_size``. +# For instance, with the alphabet as the sequence (total length of 26) +# and a batch size of 4, we would divide the alphabet into 4 sequences of +# length 6: +# +# .. math:: +# \begin{bmatrix} +# \text{A} & \text{B} & \text{C} & \ldots & \text{X} & \text{Y} & \text{Z} +# \end{bmatrix} +# \Rightarrow +# \begin{bmatrix} +# \begin{bmatrix}\text{A} \\ \text{B} \\ \text{C} \\ \text{D} \\ \text{E} \\ \text{F}\end{bmatrix} & +# \begin{bmatrix}\text{G} \\ \text{H} \\ \text{I} \\ \text{J} \\ \text{K} \\ \text{L}\end{bmatrix} & +# \begin{bmatrix}\text{M} \\ \text{N} \\ \text{O} \\ \text{P} \\ \text{Q} \\ \text{R}\end{bmatrix} & +# \begin{bmatrix}\text{S} \\ \text{T} \\ \text{U} \\ \text{V} \\ \text{W} \\ \text{X}\end{bmatrix} +# \end{bmatrix} +# +# These columns are treated as independent by the model, which means that +# the dependence of ``G`` and ``F`` can not be learned, but allows more +# efficient batch processing. 
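For concreteness, here is a tiny standalone sketch (illustrative only, not part of the tutorial script) of the reshaping performed by the ``batchify()`` function defined below, using the 26-letter alphabet example from above:

.. code-block:: python

    import torch

    data = torch.arange(26)                  # stand-in for the tokenized alphabet
    bsz = 4
    nbatch = data.size(0) // bsz             # 6 full columns
    data = data.narrow(0, 0, nbatch * bsz)   # trim the 2 leftover tokens (Y, Z)
    print(data.view(bsz, -1).t().shape)      # torch.Size([6, 4]): 4 columns of length 6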
+# + +# In 'run_worker' + def print_with_rank(msg): + print('[RANK {}]: {}'.format(rank, msg)) + + import io + from torchtext.utils import download_from_url, extract_archive + from torchtext.data.utils import get_tokenizer + from torchtext.vocab import build_vocab_from_iterator + + url = 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip' + test_filepath, valid_filepath, train_filepath = extract_archive(download_from_url(url, root=".data{}".format(rank))) + tokenizer = get_tokenizer('basic_english') + vocab = build_vocab_from_iterator(map(tokenizer, + iter(io.open(train_filepath, + encoding="utf8")))) + + def data_process(raw_text_iter): + data = [torch.tensor([vocab[token] for token in tokenizer(item)], + dtype=torch.long) for item in raw_text_iter] + return torch.cat(tuple(filter(lambda t: t.numel() > 0, data))) + + train_data = data_process(iter(io.open(train_filepath, encoding="utf8"))) + val_data = data_process(iter(io.open(valid_filepath, encoding="utf8"))) + test_data = data_process(iter(io.open(test_filepath, encoding="utf8"))) + device = torch.device(2 * rank) + + def batchify(data, bsz, rank, world_size, is_train=False): + # Divide the dataset into bsz parts. + nbatch = data.size(0) // bsz + # Trim off any extra elements that wouldn't cleanly fit (remainders). + data = data.narrow(0, 0, nbatch * bsz) + # Evenly divide the data across the bsz batches. + data = data.view(bsz, -1).t().contiguous() + # Divide the data across the ranks only for training data. + if is_train: + data_per_rank = data.size(0) // world_size + data = data[rank * data_per_rank : (rank + 1) * data_per_rank] + return data.to(device) + + batch_size = 20 + eval_batch_size = 10 + train_data = batchify(train_data, batch_size, rank, world_size, True) + val_data = batchify(val_data, eval_batch_size, rank, world_size) + test_data = batchify(test_data, eval_batch_size, rank, world_size) + + +###################################################################### +# Functions to generate input and target sequence +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# + + +###################################################################### +# ``get_batch()`` function generates the input and target sequence for +# the transformer model. It subdivides the source data into chunks of +# length ``bptt``. For the language modeling task, the model needs the +# following words as ``Target``. For example, with a ``bptt`` value of 2, +# we’d get the following two Variables for ``i`` = 0: +# +# .. image:: ../_static/img/transformer_input_target.png +# +# It should be noted that the chunks are along dimension 0, consistent +# with the ``S`` dimension in the Transformer model. The batch dimension +# ``N`` is along dimension 1. +# + +# In 'run_worker' + bptt = 35 + def get_batch(source, i): + seq_len = min(bptt, len(source) - 1 - i) + data = source[i:i+seq_len] + target = source[i+1:i+1+seq_len].view(-1) + return data, target + +###################################################################### +# Model scale and Pipe initialization +# ----------------------------------- +# + + +###################################################################### +# To demonstrate training large Transformer models using pipeline parallelism, +# we scale up the Transformer layers appropriately. We use an embedding +# dimension of 4096, hidden size of 4096, 16 attention heads and 8 total +# transformer layers (``nn.TransformerEncoderLayer``). This creates a model with +# **~1 billion** parameters. 
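A rough back-of-the-envelope check of the "~1 billion" figure (a hypothetical estimate, not part of the original tutorial; the vocabulary size below is approximate and biases/LayerNorms are ignored):

.. code-block:: python

    emsize, nhid, nlayers = 4096, 4096, 8
    ntokens = 28783                                      # approximate WikiText-2 vocabulary size
    # each nn.TransformerEncoderLayer is dominated by the four attention projections
    # and the two feed-forward linears
    per_layer = 4 * emsize * emsize + 2 * emsize * nhid
    total = nlayers * per_layer + 2 * ntokens * emsize   # plus embedding and decoder weights
    print(f"{total:,}")                                  # ~1.04 billion, in line with the sample output below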
+# +# We need to initialize the `RPC Framework `__ +# since Pipe depends on the RPC framework via `RRef `__ +# which allows for future expansion to cross host pipelining. We need to +# initialize the RPC framework with only a single worker since we're using a +# single process to drive multiple GPUs. +# +# The pipeline is then initialized with 8 transformer layers on one GPU and 8 +# transformer layers on the other GPU. One pipe is setup across GPUs 0 and 1 and +# another across GPUs 2 and 3. Both pipes are then replicated using DistributedDataParallel. + +# In 'run_worker' + ntokens = len(vocab.stoi) # the size of vocabulary + emsize = 4096 # embedding dimension + nhid = 4096 # the dimension of the feedforward network model in nn.TransformerEncoder + nlayers = 8 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder + nhead = 16 # the number of heads in the multiheadattention models + dropout = 0.2 # the dropout value + + from torch.distributed import rpc + tmpfile = tempfile.NamedTemporaryFile() + rpc.init_rpc( + name="worker", + rank=0, + world_size=1, + rpc_backend_options=rpc.TensorPipeRpcBackendOptions( + init_method="file://{}".format(tmpfile.name), + # Specifying _transports and _channels is a workaround and we no longer + # will have to specify _transports and _channels for PyTorch + # versions >= 1.8.1 + _transports=["ibv", "uv"], + _channels=["cuda_ipc", "cuda_basic"], + ) + ) + + # Num gpus for model parallelism. + num_gpus = 2 + partition_len = ((nlayers - 1) // num_gpus) + 1 + + # Add encoder in the beginning. + tmp_list = [Encoder(ntokens, emsize, dropout).cuda(2 * rank)] + module_list = [] + + # Add all the necessary transformer blocks. + for i in range(nlayers): + transformer_block = TransformerEncoderLayer(emsize, nhead, nhid, dropout) + if i != 0 and i % (partition_len) == 0: + module_list.append(nn.Sequential(*tmp_list)) + tmp_list = [] + device = i // (partition_len) + tmp_list.append(transformer_block.to(2 * rank + device)) + + # Add decoder in the end. + tmp_list.append(Decoder(ntokens, emsize).cuda(2 * rank + num_gpus - 1)) + module_list.append(nn.Sequential(*tmp_list)) + + # Need to use 'checkpoint=never' since as of PyTorch 1.8, Pipe checkpointing + # doesn't work with DDP. + from torch.distributed.pipeline.sync import Pipe + model = Pipe(torch.nn.Sequential( + *module_list), chunks = 8, checkpoint="never") + + # Initialize process group and wrap model in DDP. + from torch.nn.parallel import DistributedDataParallel + import torch.distributed as dist + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '29500' + dist.init_process_group( + backend="nccl", rank=rank, world_size=world_size) + model = DistributedDataParallel(model) + + def get_total_params(module: torch.nn.Module): + total_params = 0 + for param in module.parameters(): + total_params += param.numel() + return total_params + + print_with_rank('Total parameters in model: {:,}'.format(get_total_params(model))) + +###################################################################### +# Run the model +# ------------- +# + + +###################################################################### +# `CrossEntropyLoss `__ +# is applied to track the loss and +# `SGD `__ +# implements stochastic gradient descent method as the optimizer. The initial +# learning rate is set to 5.0. `StepLR `__ is +# applied to adjust the learn rate through epochs. During the +# training, we use +# `nn.utils.clip_grad_norm\_ `__ +# function to scale all the gradient together to prevent exploding. 
+# + +# In 'run_worker' + criterion = nn.CrossEntropyLoss() + lr = 5.0 # learning rate + optimizer = torch.optim.SGD(model.parameters(), lr=lr) + scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95) + + import time + def train(): + model.train() # Turn on the train mode + total_loss = 0. + start_time = time.time() + ntokens = len(vocab.stoi) + + # Train only for 50 batches to keep script execution time low. + nbatches = min(50 * bptt, train_data.size(0) - 1) + + for batch, i in enumerate(range(0, nbatches, bptt)): + data, targets = get_batch(train_data, i) + optimizer.zero_grad() + # Since the Pipe is only within a single host and process the ``RRef`` + # returned by forward method is local to this node and can simply + # retrieved via ``RRef.local_value()``. + output = model(data).local_value() + # Need to move targets to the device where the output of the + # pipeline resides. + loss = criterion(output.view(-1, ntokens), targets.cuda(2 * rank + 1)) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5) + optimizer.step() + + total_loss += loss.item() + log_interval = 10 + if batch % log_interval == 0 and batch > 0: + cur_loss = total_loss / log_interval + elapsed = time.time() - start_time + print_with_rank('| epoch {:3d} | {:5d}/{:5d} batches | ' + 'lr {:02.2f} | ms/batch {:5.2f} | ' + 'loss {:5.2f} | ppl {:8.2f}'.format( + epoch, batch, nbatches // bptt, scheduler.get_lr()[0], + elapsed * 1000 / log_interval, + cur_loss, math.exp(cur_loss))) + total_loss = 0 + start_time = time.time() + + def evaluate(eval_model, data_source): + eval_model.eval() # Turn on the evaluation mode + total_loss = 0. + ntokens = len(vocab.stoi) + # Evaluate only for 50 batches to keep script execution time low. + nbatches = min(50 * bptt, data_source.size(0) - 1) + with torch.no_grad(): + for i in range(0, nbatches, bptt): + data, targets = get_batch(data_source, i) + output = eval_model(data).local_value() + output_flat = output.view(-1, ntokens) + # Need to move targets to the device where the output of the + # pipeline resides. + total_loss += len(data) * criterion(output_flat, targets.cuda(2 * rank + 1)).item() + return total_loss / (len(data_source) - 1) + +###################################################################### +# Loop over epochs. Save the model if the validation loss is the best +# we've seen so far. Adjust the learning rate after each epoch. + +# In 'run_worker' + best_val_loss = float("inf") + epochs = 3 # The number of epochs + best_model = None + + for epoch in range(1, epochs + 1): + epoch_start_time = time.time() + train() + val_loss = evaluate(model, val_data) + print_with_rank('-' * 89) + print_with_rank('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' + 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time), + val_loss, math.exp(val_loss))) + print_with_rank('-' * 89) + + if val_loss < best_val_loss: + best_val_loss = val_loss + best_model = model + + scheduler.step() + + +###################################################################### +# Evaluate the model with the test dataset +# ------------------------------------- +# +# Apply the best model to check the result with the test dataset. 
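Note that the ``ppl`` values printed by the training and evaluation loops above are just ``math.exp(loss)``, i.e. the perplexity corresponding to the cross-entropy loss; for example:

.. code-block:: python

    import math
    print(math.exp(0.60))   # ~1.82, consistent with the "test ppl 1.83" line in the sample output below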
+ +# In 'run_worker' + test_loss = evaluate(best_model, test_data) + print_with_rank('=' * 89) + print_with_rank('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format( + test_loss, math.exp(test_loss))) + print_with_rank('=' * 89) + +# Main execution +import torch.multiprocessing as mp + +if __name__=="__main__": + world_size = 2 + mp.spawn(run_worker, args=(world_size, ), nprocs=world_size, join=True) + + +###################################################################### +# Output +# ------ +# + + +###################################################################### +#.. code-block:: py +# +# [RANK 1]: Total parameters in model: 1,041,453,167 +# [RANK 0]: Total parameters in model: 1,041,453,167 +# [RANK 0]: | epoch 1 | 10/ 50 batches | lr 5.00 | ms/batch 1414.18 | loss 48.70 | ppl 1406154472673147092992.00 +# [RANK 1]: | epoch 1 | 10/ 50 batches | lr 5.00 | ms/batch 1414.42 | loss 48.49 | ppl 1146707511057334927360.00 +# [RANK 0]: | epoch 1 | 20/ 50 batches | lr 5.00 | ms/batch 1260.76 | loss 42.74 | ppl 3648812398518492672.00 +# [RANK 1]: | epoch 1 | 20/ 50 batches | lr 5.00 | ms/batch 1260.76 | loss 41.51 | ppl 1064844757565813248.00 +# [RANK 0]: | epoch 1 | 30/ 50 batches | lr 5.00 | ms/batch 1246.80 | loss 41.85 | ppl 1497706388552644096.00 +# [RANK 1]: | epoch 1 | 30/ 50 batches | lr 5.00 | ms/batch 1246.80 | loss 40.46 | ppl 373830103285747072.00 +# [RANK 0]: | epoch 1 | 40/ 50 batches | lr 5.00 | ms/batch 1246.69 | loss 39.76 | ppl 185159839078666368.00 +# [RANK 1]: | epoch 1 | 40/ 50 batches | lr 5.00 | ms/batch 1246.69 | loss 39.89 | ppl 211756997625874912.00 +# [RANK 0]: ----------------------------------------------------------------------------------------- +# [RANK 0]: | end of epoch 1 | time: 69.37s | valid loss 2.92 | valid ppl 18.46 +# [RANK 0]: ----------------------------------------------------------------------------------------- +# [RANK 1]: ----------------------------------------------------------------------------------------- +# [RANK 1]: | end of epoch 1 | time: 69.39s | valid loss 2.92 | valid ppl 18.46 +# [RANK 1]: ----------------------------------------------------------------------------------------- +# [RANK 1]: | epoch 2 | 10/ 50 batches | lr 4.51 | ms/batch 1373.91 | loss 39.77 | ppl 187532281612905856.00 +# [RANK 0]: | epoch 2 | 10/ 50 batches | lr 4.51 | ms/batch 1375.62 | loss 39.05 | ppl 91344349371016336.00 +# [RANK 0]: | epoch 2 | 20/ 50 batches | lr 4.51 | ms/batch 1250.33 | loss 30.62 | ppl 19917977906884.78 +# [RANK 1]: | epoch 2 | 20/ 50 batches | lr 4.51 | ms/batch 1250.33 | loss 30.48 | ppl 17250186491252.32 +# [RANK 1]: | epoch 2 | 30/ 50 batches | lr 4.51 | ms/batch 1250.73 | loss 29.14 | ppl 4534527326854.47 +# [RANK 0]: | epoch 2 | 30/ 50 batches | lr 4.51 | ms/batch 1250.73 | loss 29.43 | ppl 6035762659681.65 +# [RANK 0]: | epoch 2 | 40/ 50 batches | lr 4.51 | ms/batch 1249.54 | loss 23.11 | ppl 10869828323.89 +# [RANK 1]: | epoch 2 | 40/ 50 batches | lr 4.51 | ms/batch 1249.55 | loss 22.90 | ppl 8785318464.24 +# [RANK 0]: ----------------------------------------------------------------------------------------- +# [RANK 0]: | end of epoch 2 | time: 69.02s | valid loss 0.94 | valid ppl 2.55 +# [RANK 0]: ----------------------------------------------------------------------------------------- +# [RANK 1]: ----------------------------------------------------------------------------------------- +# [RANK 1]: | end of epoch 2 | time: 69.05s | valid loss 0.94 | valid ppl 2.55 +# [RANK 1]: 
----------------------------------------------------------------------------------------- +# [RANK 0]: | epoch 3 | 10/ 50 batches | lr 4.29 | ms/batch 1380.66 | loss 12.98 | ppl 434052.59 +# [RANK 1]: | epoch 3 | 10/ 50 batches | lr 4.29 | ms/batch 1376.47 | loss 12.92 | ppl 410203.33 +# [RANK 1]: | epoch 3 | 20/ 50 batches | lr 4.29 | ms/batch 1250.88 | loss 9.80 | ppl 18034.58 +# [RANK 0]: | epoch 3 | 20/ 50 batches | lr 4.29 | ms/batch 1250.88 | loss 9.78 | ppl 17741.88 +# [RANK 0]: | epoch 3 | 30/ 50 batches | lr 4.29 | ms/batch 1251.89 | loss 10.37 | ppl 32016.45 +# [RANK 1]: | epoch 3 | 30/ 50 batches | lr 4.29 | ms/batch 1251.90 | loss 10.46 | ppl 34735.08 +# [RANK 0]: | epoch 3 | 40/ 50 batches | lr 4.29 | ms/batch 1250.70 | loss 10.09 | ppl 24147.61 +# [RANK 1]: | epoch 3 | 40/ 50 batches | lr 4.29 | ms/batch 1250.71 | loss 10.08 | ppl 23748.31 +# [RANK 0]: ----------------------------------------------------------------------------------------- +# [RANK 0]: | end of epoch 3 | time: 69.12s | valid loss 0.69 | valid ppl 2.00 +# [RANK 0]: ----------------------------------------------------------------------------------------- +# [RANK 1]: ----------------------------------------------------------------------------------------- +# [RANK 1]: | end of epoch 3 | time: 69.12s | valid loss 0.69 | valid ppl 2.00 +# [RANK 1]: ----------------------------------------------------------------------------------------- +# [RANK 0]: ========================================================================================= +# [RANK 0]: | End of training | test loss 0.60 | test ppl 1.83 +# [RANK 0]: ========================================================================================= +# [RANK 1]: ========================================================================================= +# [RANK 1]: | End of training | test loss 0.60 | test ppl 1.83 diff --git a/advanced_source/dispatcher.rst b/advanced_source/dispatcher.rst index 7a7d806c3..868407e89 100644 --- a/advanced_source/dispatcher.rst +++ b/advanced_source/dispatcher.rst @@ -1,5 +1,5 @@ -Dispatcher in C++ -================= +Registering a Dispatched Operator in C++ +======================================== The dispatcher is an internal component of PyTorch which is responsible for figuring out what code should actually get run when you call a function like @@ -32,12 +32,12 @@ Defining schema and backend implementations The general principle behind the dispatcher is that it divides the implementation of an operator into multiple kernels, each of which implements -functionality for a specific *dispatch key*; for example, CPU, CUDA or Autograd. -The dispatcher determines what the highest priority dispatch key is at the time +functionality for a specific *dispatch key*, e.g. CPU, CUDA. The dispatcher +determines what the highest priority dispatch key is at the time you call an operator (this is done by looking at both the tensor arguments as well as some thread local state), and transfers control to the kernel for that dispatch key. The end effect is that when you call an operator, we first -execute the Autograd kernel, and then we redispatch to the CPU or CUDA kernel +execute the Autograd kernel, and then we redispatch to the backend kernel depending on the device types of the passed in tensors. Let's take a look at the various parts involved in making this @@ -105,6 +105,8 @@ speaking, the structure of your registrations will look like this: that provides implementations for all basic operators on the XLA dispatch key. +.. 
_autograd-support: + Adding autograd support ----------------------- @@ -113,7 +115,7 @@ can we add autograd support to it? As you might guess, we will register an autograd kernel (similar to what's described in the `custom autograd function `_ tutorial)! However, there is a twist: unlike the CPU and CUDA kernels, the autograd kernel needs to *redispatch*: it needs to call back into the dispatcher to get to -the final CPU and CUDA implementations. +the inference kernels, e.g. CPU or CUDA implementations. Thus, before we write the autograd kernel, let's write a *dispatching function* which calls into the dispatcher to find the right kernel for your operator. @@ -177,6 +179,17 @@ functions: :start-after: BEGIN TORCH_LIBRARY_IMPL Autograd :end-before: END TORCH_LIBRARY_IMPL Autograd + +.. note:: + + In this example we register the kernel to ``Autograd``, which installs it as the + autograd kernel for all backends. You can also register optimized kernels for specific + backends by using the corresponding backend-specific dispatch key - for example, + ``AutogradCPU`` or ``AutogradCUDA``. To explore these and other dispatch key + options in more detail, check out the ``PythonDispatcher`` tool provided in + `torch/_python_dispatcher.py `_. + + Going beyond autograd --------------------- @@ -207,7 +220,8 @@ So why use the dispatcher? There are a few reasons: (CPU, CUDA, Autograd) without having to write a single, centralized if statement that refers to all of them. Importantly, third parties can register extra implementations for other aspects without having to patch the - original definition of an operator. + original definition of an operator. We'll talk more about extending the + dispatcher in `extending dispatcher for a new backend `_. 2. It supports more dispatch keys than CPU, CUDA and Autograd. You can see a full list of dispatch keys that are currently implemented @@ -229,38 +243,97 @@ Autocast ^^^^^^^^ The Autocast dispatch key implements support for -`automatic mixed precision `_ -(AMP). An autocast kernel typically modifies the operation of an operator by casting the -input arguments to some precision before carrying out the operation. For some -operations, it is numerically safe to cast to lower precision, which is how AMP -can achieve speed ups and reduced memory usage without sacrificing much -accuracy. A nontrivial autocast kernel looks something like this: +`automatic mixed precision (AMP) `_. +An autocast wrapper kernel typically casts incoming ``float16`` or ``float32`` CUDA tensors +to some preferred precision before running the op. +For example, matmuls and convolutions on floating-point CUDA tensors usually run faster +and use less memory in ``float16`` without impairing convergence. +Autocast wrappers only have an effect in +`autocast-enabled contexts `_. + +Here's an autocast wrapper for a hypothetical custom matmul, along with its registration: .. 
code-block:: cpp + // Autocast-specific helper functions + #include + Tensor mymatmul_autocast(const Tensor& self, const Tensor& other) { c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast); - return mymatmul(autocast::_cast(at::kHalf, self), autocast::_cast(at::kHalf, other)); + return mymatmul(at::autocast::cached_cast(at::kHalf, self), + at::autocast::cached_cast(at::kHalf, other)); } + TORCH_LIBRARY_IMPL(myops, Autocast, m) { + m.impl("mymatmul", mymatmul_autocast); + } + +``cached_cast(kHalf, tensor)`` casts ``tensor`` to ``float16`` if ``tensor`` is CUDA and ``float32``, +otherwise, it leaves ``tensor`` unchanged (c.f. the +`eligibility policy `_ for natively autocasted ops). +This ensures if the network calls ``mymatmul`` on any mixture of ``float16`` and ``float32`` CUDA tensors, +``mymatmul`` runs in ``float16``. Meanwhile, calls to ``mymatmul`` with non-CUDA, integer-type, or ``float64`` +inputs are unaffected. Using ``cached_cast`` to follow the native eligibility policy in your own autocast wrapper +is recommended, but not required. For example, if you wanted to force ``float16`` execution for all input types, +you could ``return mymatmul(self.half(), other.half());`` instead of using ``cached_cast``. + Notice that, like our autograd kernels, we exclude the ``Autocast`` key from -dispatch before redispatching. By default, if no autocast kernel is provided, -we simply fallthrough directly to the regular operator implementation (no -autocasting occurs.) (We didn't use ``myadd`` for this example, since pointwise -addition doesn't do autocasting and should just fall through). - -When should an autocast kernel be registered? Unfortunately, there aren't -cut-and-dry rules for when you should cast to a lower precision. You can -get a sense for what operators have autocasting behavior by looking at -the `AMP documentation -`_. Some other -general rules: - -* Operations that do reductions should be carried out in float32, -* Any operation with multiple float tensor inputs has to standardize them - to a common precision, and -* Any operation that does a convolution or gemm under the hood should - probably be float16 +dispatch before redispatching. + +By default, if no autocast wrapper is provided, +we fallthrough directly to the regular operator implementation (no +autocasting occurs). (We didn't use ``myadd`` for this example, since pointwise +addition doesn't need autocasting and should just fall through.) + +When should an autocast wrapper be registered? Unfortunately, there aren't +cut-and-dried rules for an op's preferred precision. You can +get a sense for some native ops' preferred precisions by looking at the +`cast lists `_. +General guidance: + +* Ops that do reductions should probably execute in ``float32``, +* Any op that does a convolution or gemm under the hood should + probably execute in ``float16``, and +* Other ops with multiple floating-point tensor inputs should standardize + them to a common precision (unless the implementation supports inputs with different precisions). + +If your custom op falls into the third category, the ``promote_type`` template +helps figure out the widest floating-point type present among input tensors, which is +the safest choice for the execution type: + +.. code-block:: cpp + + #include + + Tensor my_multiple_input_op_autocast(const Tensor& t0, const Tensor& t1) { + c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast); + // The required at::kHalf argument is an optimistic initial guess. 
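+      // As described in the surrounding text, promote_type compares this guess with the
+      // dtypes of t0 and t1 and returns the widest floating-point type it finds, which
+      // then becomes the execution dtype used on the next lines.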
+ auto exec_type = at::autocast::promote_type(at::kHalf, t0, t1); + return my_multiple_input_op(at::autocast::cached_cast(exec_type, t0), + at::autocast::cached_cast(exec_type, t1)); + } + +If your custom op is :ref:`autograd-enabled`, you only need to write and register +an autocast wrapper for the same name onto which the autograd wrapper is registered. +For example, if you wanted an autocast wrapper for the ``myadd`` function shown +in the autograd section, all you'd need is + +.. code-block:: cpp + + Tensor myadd_autocast(const Tensor& self, const Tensor& other) { + c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast); + return myadd(at::autocast::cached_cast(, self), + at::autocast::cached_cast(, other)); + } + + TORCH_LIBRARY_IMPL(myops, Autocast, m) { + m.impl("myadd", myadd_autocast); + } + +There are no separate gymnastics to make the backward method autocast compatible. +However, the backward method defined in your custom autograd function will run in the same +dtype as autocast sets for the forward method, so you should choose a ```` +suitable for both your forward and backward methods. Batched ^^^^^^^ diff --git a/advanced_source/dynamic_quantization_tutorial.py b/advanced_source/dynamic_quantization_tutorial.py index 94555b115..be0182a5d 100644 --- a/advanced_source/dynamic_quantization_tutorial.py +++ b/advanced_source/dynamic_quantization_tutorial.py @@ -7,7 +7,7 @@ **Edited by**: `Seth Weidman `_ -**번역**: `박경림 `_ `Myungha Kwon `_ +**번역**: `박경림 `_ `Myungha Kwon `_ 시작하기 ------------ @@ -132,9 +132,9 @@ def tokenize(self, path): # ----------------------------- # # 이 튜토리얼은 모델이 학습된 후 적용되는 양자화 기술인 동적 양자화에 대한 튜토리얼입니다. -# 따라서 우리는 미리 학습된 가중치를 모델 아키텍처에 로드할 것 입니다. 이 가중치는 word +# 따라서 우리는 미리 학습된 가중치를 모델 아키텍처에 로드할 것 입니다. 이 가중치는 word # language 모델 예제의 기본 설정을 사용하여 5개의 epoch 동안 학습하여 얻은 것입니다. -# +# ntokens = len(corpus.dictionary) @@ -208,7 +208,7 @@ def batchify(data, bsz): def get_batch(source, i): seq_len = min(bptt, len(source) - 1 - i) data = source[i:i+seq_len] - target = source[i+1:i+1+seq_len].view(-1) + target = source[i+1:i+1+seq_len].reshape(-1) return data, target def repackage_hidden(h): @@ -280,7 +280,7 @@ def time_model_evaluation(model, test_data): time_model_evaluation(quantized_model, test_data) ###################################################################### -# MacBook Pro에서 로컬로 실행하는 경우, 양자화 없이는 추론(inference)에 약 200초가 걸리고 +# MacBook Pro에서 로컬로 실행하는 경우, 양자화 없이는 추론(inference)에 약 200초가 걸리고 # 양자화를 사용하면 약 100초가 걸립니다. # # 마치며 diff --git a/advanced_source/extend_dispatcher.rst b/advanced_source/extend_dispatcher.rst new file mode 100644 index 000000000..7cdf49e61 --- /dev/null +++ b/advanced_source/extend_dispatcher.rst @@ -0,0 +1,379 @@ +Extending dispatcher for a new backend in C++ +============================================= + +In this tutorial we will walk through all necessary steps to extend the dispatcher to +add a new device living outside ``pytorch/pytorch`` repo and maintain it to keep in +sync with native PyTorch devices. Here we'll assume that you're familiar with how +to `register a dispatched operator in C++ `_ and how to write a +`custom autograd function `_. + + +.. note:: + + This tutorial touches a lot of internal components inside PyTorch which are being actively improved, + please expect changes to APIs if you decide to follow this tutorial. We'll keep this tutorial + up to date with the latest APIs. + +What's a new backend? 
+---------------------
+
+Adding a new backend to PyTorch requires a lot of development and maintenance from backend extenders.
+Before adding a new backend, let's first consider a few common use cases and recommended solutions for them:
+
+* If you have new algorithms for an existing PyTorch operator, send a PR to PyTorch.
+* If you want to propose a new operator, send a feature request/PR to PyTorch.
+* If you want to add support for a new device/hardware like Google TPU and customized chips, which often requires using
+  hardware-specific APIs to write kernels, follow this tutorial and add an out-of-tree backend to PyTorch.
+* If you want to add support for existing operators but with a different Tensor layout/representation
+  like sparse and quantized, which forces your kernels to be written in a way that's more efficient
+  given the layout/representation limitation, follow this tutorial and add an out-of-tree backend to PyTorch.
+
+In this tutorial we'll mainly focus on adding a new out-of-tree device below. Adding out-of-tree support
+for a different tensor layout might share many common steps with devices, but we haven't seen an example of
+such an integration yet, so it might require additional work from PyTorch to support it.
+
+Get a dispatch key for your backend
+-----------------------------------
+
+PyTorch operators are implemented in C++ and made available in the Python frontend through Python bindings.
+The PyTorch dispatcher divides the implementation of an operator into multiple kernels, each of which is
+associated with a specific dispatch key. Supporting a new backend in PyTorch essentially means writing
+a kernel for each PyTorch operator in C++ and then registering them to a dispatch key representing your
+customized backend in the dispatcher.
+
+A dispatch key is your identifier in the dispatcher system. The dispatcher looks at the dispatch keys carried on
+input tensors and calls the right kernel accordingly. PyTorch provides three reserved dispatch keys
+(and their corresponding Autograd keys) for prototyping out-of-tree backend extensions:
+
+* PrivateUse1/AutogradPrivateUse1
+* PrivateUse2/AutogradPrivateUse2
+* PrivateUse3/AutogradPrivateUse3
+
+You can choose any of the keys above to prototype your customized backend.
+To create a Tensor on the ``PrivateUse1`` backend, you need to set the dispatch key in the ``TensorImpl`` constructor.
+
+.. code-block:: cpp
+
+  /* Example TensorImpl constructor */
+  TensorImpl(
+      Storage&& storage,
+      DispatchKeySet ks,
+      const caffe2::TypeMeta data_type);
+
+  // To create a TensorImpl on PrivateUse1 backend, pass in the following ks to TensorImpl creation.
+  DispatchKeySet ks = c10::DispatchKeySet{c10::DispatchKey::PrivateUse1, c10::DispatchKey::AutogradPrivateUse1};
+
+
+Note that the ``TensorImpl`` class above assumes your Tensor is backed by a storage like CPU/CUDA. We also
+provide ``OpaqueTensorImpl`` for backends without a storage, and you might need to tweak/override certain
+methods to fit your customized hardware.
+One example in the pytorch repo is `Vulkan TensorImpl `_.
+
+
+.. note::
+  Once the prototype is done and you plan to do regular releases for your backend extension, please feel free to
+  submit a PR to ``pytorch/pytorch`` to reserve a dedicated dispatch key for your backend.
+
+
+Get the full list of PyTorch operators
+--------------------------------------
+
+PyTorch provides a full list of extensible C++ operators in the generated file
+``build/aten/src/ATen/RegistrationDeclarations.h``.
+This file is only available after building PyTorch from source.
+Here's a snippet of the file:
+
+.. code-block:: cpp
+
+  Tensor abs(const Tensor & self); // {"schema": "aten::abs(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+  Tensor & abs_(Tensor & self); // {"schema": "aten::abs_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+  Tensor & abs_out(Tensor & out, const Tensor & self); // {"schema": "aten::abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+  Tensor absolute(const Tensor & self); // {"schema": "aten::absolute(Tensor self) -> Tensor", "dispatch": "False", "default": "False"}
+  Tensor & absolute_(Tensor & self); // {"schema": "aten::absolute_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "False", "default": "False"}
+  Tensor & absolute_out(Tensor & out, const Tensor & self); // {"schema": "aten::absolute.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "False"}
+  Tensor angle(const Tensor & self); // {"schema": "aten::angle(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+  Tensor & angle_out(Tensor & out, const Tensor & self); // {"schema": "aten::angle.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+  Tensor sgn(const Tensor & self); // {"schema": "aten::sgn(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+
+There are multiple fields associated with a single operator. Let's break it down using ``abs_out`` as an example:
+
+* ``Tensor & abs_out(Tensor & out, const Tensor & self);`` is the C++ signature of the operator; your C++
+  kernel should match this signature exactly.
+* ``aten::abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)`` is the unique schema representing the operator,
+  which also contains aliasing and mutation annotations compared to the C++ signature. This is the unique identifier
+  the dispatcher uses to find an operator.
+* ``dispatch`` and ``default`` are boolean fields that provide information about what native PyTorch kernels
+  can do, and thus imply whether backend extenders are required to implement the kernel.
+  More details can be found in :ref:`register kernels for the new backend <register-kernel>`.
+
+
+.. _register-kernel:
+
+Register kernels for the new backend
+------------------------------------
+
+To register your kernels to the PyTorch dispatcher, you can use the
+``TORCH_LIBRARY_IMPL`` API described in
+`Registering a Dispatched Operator in C++ `_:
+
+.. code-block:: cpp
+
+  TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
+    m.impl(<schema_my_op1>, &my_op1);
+    m.impl(<schema_my_op2>, &my_op2);
+    m.impl(<schema_my_op2_backward>, &my_op2_backward);
+  }
+
+Now let's zoom in on which operators require a kernel from a customized backend and what goes
+inside those kernels exactly.
+
+PyTorch currently has more than 1600 operators and it's still growing. It's unrealistic
+for backend extensions to keep up with this speed. Even for native backends like CPU
+or CUDA, it often requires a lot of work to write dedicated kernels for every new op.
+
+Fortunately, some native PyTorch kernels are written in a way that they decompose into a
+combination of several known operators. In other words, you only need to implement
+a set of known operators (the ops that require registration below) instead of all PyTorch operators.
+
+PyTorch operators can be classified into two categories (a short helper sketch for finding
+the first category programmatically follows the list):
+
+* Ops that require registration: the PyTorch native implementation for these ops is backend specific,
+  so a kernel must be provided for a customized backend. Otherwise calling such an op
+  on the customized backend will error out.
+
+  * In ``RegistrationDeclarations.h`` these operators have ``dispatch`` set to True *and* ``default`` set to False
+    in the metadata found in their accompanying comments.
+
+* Registration is optional: backend extenders can skip registering these ops without sacrificing any support.
+  However, if a backend extender wants to override the default kernel provided by PyTorch, they can still
+  register their customized kernel to their backend and the dispatcher will use it for their backend only.
+  For example, the current implementation of PyTorch's ``max_pool2d`` returns ``indices`` as part of forward outputs, which
+  creates overhead in torch_xla, so torch_xla registers its own kernel for ``max_pool2d`` instead.
+
+  * In ``RegistrationDeclarations.h`` these operators have ``dispatch`` set to False *or* ``default`` set to True
+    in the metadata found in their accompanying comments.
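+
+The helper below is such a sketch; it is not part of PyTorch, and it simply relies on the path and the
+JSON-style comment metadata shown in the snippet above:
+
+.. code-block:: python
+
+  import json
+
+  def required_ops(path="build/aten/src/ATen/RegistrationDeclarations.h"):
+      """Return the schemas of ops a new backend must implement
+      (``dispatch`` is "True" and ``default`` is "False" in the metadata)."""
+      ops = []
+      with open(path) as f:
+          for line in f:
+              # Each declaration carries its metadata as a JSON object in a trailing comment.
+              _, _, comment = line.partition("//")
+              comment = comment.strip()
+              if not comment.startswith("{"):
+                  continue
+              meta = json.loads(comment)
+              if meta.get("dispatch") == "True" and meta.get("default") == "False":
+                  ops.append(meta["schema"])
+      return ops
+
+  # Example (after building PyTorch from source): print(len(required_ops()))
+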
+
+Autograd support for the new backend
+------------------------------------
+
+Gradient formulas are mostly purely mathematical and thus general for all backends.
+PyTorch often registers a kernel to the alias dispatch key Autograd, which means it can be used by all backends.
+
+For these operators you don't have to worry about their derivative formulas;
+you can just write forward definitions for the operators in ``RegistrationDeclarations.h`` and PyTorch handles
+backward for you automatically.
+
+.. code-block:: cpp
+
+  Tensor my_op1(const Tensor& self, const Tensor& other) {
+    // call your backend-specific APIs to implement my_op1 so that
+    // it matches PyTorch's native behavior
+  }
+
+  TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
+    m.impl(<schema_my_op1>, &my_op1);
+  }
+
+
+In some cases, PyTorch backward kernel implementations are also device specific so that they can squeeze
+maximum performance out of each backend. For those operators you'll see ``op_backward`` showing up in
+``RegistrationDeclarations.h`` as *required registration* as well.
+
+.. code-block:: cpp
+
+  Tensor my_op2_backward(const Tensor& self, const Tensor& other) {
+    // call your backend-specific APIs to implement my_op2_backward so that
+    // it matches PyTorch's native behavior
+  }
+
+  // Note that the backward kernel is still registered to PrivateUse1 instead of AutogradPrivateUse1.
+  // PyTorch will wrap your backward kernel with the proper autograd setup and then link to it in
+  // my_op2's AutogradPrivateUse1 kernel.
+  TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
+    m.impl(<schema_my_op2>, &my_op2);
+    m.impl(<schema_my_op2_backward>, &my_op2_backward);
+  }
+
+
+In a few *rare* cases, PyTorch's gradient formula for certain operators may have assumptions that don't generalize
+to all backends. In those cases backend extenders can optionally override the PyTorch Autograd layer by registering
+a kernel from torch::autograd::Function to the corresponding dispatch key (for example, AutogradPrivateUse1 if
+you're using PrivateUse1 for your backend):
+
+
+.. code-block:: cpp
+
+  class MyAddFunction : public torch::autograd::Function<MyAddFunction> {
+   public:
+    static Tensor forward(AutogradContext *ctx, torch::Tensor self, torch::Tensor other) {
+      at::AutoNonVariableTypeMode g;
+      return myadd(self, other);
+    }
+
+    static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) {
+      auto grad_output = grad_outputs[0];
+      return {grad_output, grad_output};
+    }
+  };
+
+  Tensor myadd_autograd(const Tensor& self, const Tensor& other) {
+    return MyAddFunction::apply(self, other)[0];
+  }
+
+  // Register the autograd kernel to AutogradPrivateUse1
+  TORCH_LIBRARY_IMPL(aten, AutogradPrivateUse1, m) {
+    m.impl(<myadd_schema>, &myadd_autograd);
+  }
+
+  // Register the inference kernel to PrivateUse1
+  TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
+    m.impl(<myadd_schema>, &myadd);
+  }
+
+
+With this trick you have full control over both the training and inference behavior of the ``myadd`` operator in your backend.
+Here's `an example `_ in the ``pytorch/xla`` repository.
+
+
+Build an extension
+------------------
+
+An out-of-tree backend is supported by adding a C++ extension to PyTorch.
+Once you have kernels and registrations ready, you can build a C++ extension by
+writing a ``setup.py`` script that uses ``setuptools`` to compile the C++ code. Here's a simplified example from the
+`pytorch/xla repo `_::
+
+    from setuptools import setup
+    from torch.utils.cpp_extension import BuildExtension, CppExtension
+
+    setup(
+        name='torch_xla',
+        ext_modules=[
+            CppExtension(
+                '_XLAC',
+                torch_xla_sources,
+                include_dirs=include_dirs,
+                extra_compile_args=extra_compile_args,
+                library_dirs=library_dirs,
+                extra_link_args=extra_link_args + \
+                    [make_relative_rpath('torch_xla/lib')],
+            ),
+        ],
+        cmdclass={
+            'build_ext': Build,  # Build is a derived class of BuildExtension
+        }
+        # more configs...
+    )
+
+
+See `our C++ extension tutorial `_
+for more details.
+
+
+Custom operator support
+-----------------------
+
+Your new backend should work seamlessly with
+`customized operators extended in python `_
+without writing any new kernels, as long as the customized operator is composed of existing
+PyTorch operators (which are already supported by your backend).
+
+`Custom operators extended in C++ `_ often come with a
+`backend specific C++ kernel implementation e.g. nms kernel in torchvision `_
+as well as `a customized Python API e.g. torch.ops.torchvision.nms `_.
+To support these operators, you will need to write a C++ kernel for your backend and properly
+register it to the corresponding namespace in the dispatcher, similar to supporting PyTorch native operators.
+Alternatively you could also add a customized API in your extension, e.g. ``torch_xla.core.functions.nms``, for
+these ad hoc requests.
+
+JIT support
+-----------
+
+As we mentioned in `Registering a Dispatched Operator in C++ `_, kernels registered through the ``m.impl()`` API
+support being called in both unboxed and boxed ways. In other words, your customized backend can also work with our
+JIT tracing/scripting frontend just like in-tree backends such as CPU or CUDA do. You could potentially also write specialized optimization
+passes for your backend on a JIT graph, but we will not discuss that here since we haven't finalized the integration point
+in JIT, so the current backend support will focus on the eager frontend for now.
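+
+As a small illustration of that last point, here is a sketch in plain Python (the out-of-tree device itself is
+hypothetical; only the CPU path below actually runs without an extension) showing that scripting does not change
+how kernels are selected: the dispatcher still picks the kernel from the dispatch keys of the input tensors at call time.
+
+.. code-block:: python
+
+  import torch
+
+  @torch.jit.script
+  def scaled_add(x: torch.Tensor, y: torch.Tensor, alpha: float) -> torch.Tensor:
+      # The scripted graph only records aten ops; which kernel runs is decided
+      # by the dispatcher when the function is called.
+      return (x + y) * alpha
+
+  # On CPU tensors this executes the CPU kernels.
+  print(scaled_add(torch.ones(2), torch.ones(2), 0.5))
+
+  # With tensors created on your out-of-tree device (for example via your
+  # extension's factory functions), the same scripted function would redispatch
+  # to the PrivateUse1 kernels you registered above.
+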
+
+
+Testing your backend against native PyTorch backends
+----------------------------------------------------
+
+PyTorch lets tests run on multiple device types using its `generic device type testing framework `_.
+You can find details about `how tests use it `_
+and information about `how to add a new device type `_.
+Once added, PyTorch tests using the generic device type testing framework will be run using your device type, too.
+See `this Wiki page `_ for an example of how tests are instantiated.
+
+Running PyTorch's existing test suites with your device type is important to ensure correctness,
+but not all PyTorch features are supported by every device type. The generic device type testing
+framework allows for considerable customization so that device types can select which tests to run,
+which dtypes they support, and even which precisions to use when comparing tensors for equality.
+
+An example device type that uses the generic device type testing framework and doesn't ship with
+PyTorch is XLA. See `its extension of the generic device type testing framework `_,
+which contains examples of block listing tests, block listing dtypes, and overriding test precision.
+
+The generic device type testing framework is actively developed. To request a feature please file an
+issue on PyTorch's Github.
+
+
+Backward Compatibility
+----------------------
+
+Currently PyTorch can't guarantee backward compatibility for registered operators.
+Operators, as well as their schemas, might be added/modified/deleted as needed. Registered
+kernels must match the PyTorch version *exactly*. If PyTorch adds more parameters (even
+with defaults) for an operator, your old registration won't work until it's updated
+to match PyTorch's new signature.
+
+As a result, we *highly recommend* that out-of-tree backend extenders only sync with major PyTorch
+releases to minimize interruptions in development. PyTorch is on a quarterly release cadence.
+Backend extenders should join the *#announcement* channel at `pytorch.slack.com `_
+to get the latest updates on releases.
+
+Known issues & additional notes
+-------------------------------
+
+* Not all test suites are device generic yet. Extensible test classes can be found by searching for
+  ``instantiate_device_type_tests`` in the PyTorch codebase, e.g.
+  ``TestTorchDeviceType, TestViewOps, TestTensorDeviceOps, TestTypePromotion`` etc.
+* There's no extension point in C++ for serializing a python Tensor object on a customized backend. Currently
+  you can only extend it by modifying the `PyTorch Tensor __reduce_ex__ method `_
+  or by monkey patching in an out-of-tree repository.
+* If your backend doesn't allow direct memory access, you should pay additional attention to supporting
+  view ops since they're supposed to share storage. Changes to a view tensor need to be propagated to its
+  base tensor and vice versa.
+* There's no extension point in C++ for Optimizer if your backend doesn't work with the native PyTorch
+  Optimizers, e.g. if it needs to carry the states to be updated in backward like torch-xla. Such use cases
+  currently can only be handled by adding a customized API or by monkey patching in an out-of-tree repository.
+
+Future Work
+-----------
+
+Making every component in PyTorch seamlessly extensible for an out-of-tree backend
+requires a lot of changes to PyTorch internals. Here are a few items that we're
+actively working on that might improve the experience in the future:
+
+* Improve test coverage of the generic testing framework.
+* Improve ``Math`` kernel coverage and more comprehensive tests to make sure ``Math`` + kernel bahavior matches other backends like ``CPU/CUDA``. +* Refactor ``RegistrationDeclarations.h`` to carry the minimal information and reuse + PyTorch's codegen as much as possible. +* Support a backend fallback kernel to automatic convert inputs to CPU and convert the + result back to the customized backend. This will allow "full" operator coverage even + though you don't have kernels written for every operator. + + +Stay in touch +------------- + +Please use `PyTorch dev discussions `_ for questions and discussions. If you have +any feature requests or bug reports, please `file an issue on github `_. + +If you're interested in helping in any of the future work items above (e.g adding more ``Math`` +kernels for PyTorch operators in C++), please reach out to us through Github or Slack! + diff --git a/advanced_source/rpc_ddp_tutorial/main.py b/advanced_source/rpc_ddp_tutorial/main.py index f83384d0a..3d0d6ba22 100644 --- a/advanced_source/rpc_ddp_tutorial/main.py +++ b/advanced_source/rpc_ddp_tutorial/main.py @@ -6,7 +6,7 @@ import torch.distributed as dist import torch.distributed.autograd as dist_autograd import torch.distributed.rpc as rpc -from torch.distributed.rpc import ProcessGroupRpcBackendOptions +from torch.distributed.rpc import TensorPipeRpcBackendOptions import torch.multiprocessing as mp import torch.optim as optim from torch.distributed.optim import DistributedOptimizer @@ -128,7 +128,7 @@ def run_worker(rank, world_size): os.environ['MASTER_PORT'] = '29500' - rpc_backend_options = ProcessGroupRpcBackendOptions() + rpc_backend_options = TensorPipeRpcBackendOptions() rpc_backend_options.init_method='tcp://localhost:29501' # Rank 2 is master, 3 is ps and 0 and 1 are trainers. diff --git a/advanced_source/static_quantization_tutorial.py b/advanced_source/static_quantization_tutorial.py deleted file mode 100644 index 72ea3cc70..000000000 --- a/advanced_source/static_quantization_tutorial.py +++ /dev/null @@ -1,678 +0,0 @@ -""" -(beta) Static Quantization with Eager Mode in PyTorch -========================================================= - -**Author**: `Raghuraman Krishnamoorthi `_ - -**Edited by**: `Seth Weidman `_ - -This tutorial shows how to do post-training static quantization, as well as illustrating -two more advanced techniques - per-channel quantization and quantization-aware training - -to further improve the model's accuracy. Note that quantization is currently only supported -for CPUs, so we will not be utilizing GPUs / CUDA in this tutorial. - -By the end of this tutorial, you will see how quantization in PyTorch can result in -significant decreases in model size while increasing speed. Furthermore, you'll see how -to easily apply some advanced quantization techniques shown -`here `_ so that your quantized models take much less -of an accuracy hit than they would otherwise. - -Warning: we use a lot of boilerplate code from other PyTorch repos to, for example, -define the ``MobileNetV2`` model archtecture, define data loaders, and so on. We of course -encourage you to read it; but if you want to get to the quantization features, feel free -to skip to the "4. Post-training static quantization" section. 
- -We'll start by doing the necessary imports: -""" -import numpy as np -import torch -import torch.nn as nn -import torchvision -from torch.utils.data import DataLoader -from torchvision import datasets -import torchvision.transforms as transforms -import os -import time -import sys -import torch.quantization - -# # Setup warnings -import warnings -warnings.filterwarnings( - action='ignore', - category=DeprecationWarning, - module=r'.*' -) -warnings.filterwarnings( - action='default', - module=r'torch.quantization' -) - -# Specify random seed for repeatable results -torch.manual_seed(191009) - -###################################################################### -# 1. Model architecture -# --------------------- -# -# We first define the MobileNetV2 model architecture, with several notable modifications -# to enable quantization: -# -# - Replacing addition with ``nn.quantized.FloatFunctional`` -# - Insert ``QuantStub`` and ``DeQuantStub`` at the beginning and end of the network. -# - Replace ReLU6 with ReLU -# -# Note: this code is taken from -# `here `_. - -from torch.quantization import QuantStub, DeQuantStub - -def _make_divisible(v, divisor, min_value=None): - """ - This function is taken from the original tf repo. - It ensures that all layers have a channel number that is divisible by 8 - It can be seen here: - https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py - :param v: - :param divisor: - :param min_value: - :return: - """ - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%. - if new_v < 0.9 * v: - new_v += divisor - return new_v - - -class ConvBNReLU(nn.Sequential): - def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1): - padding = (kernel_size - 1) // 2 - super(ConvBNReLU, self).__init__( - nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False), - nn.BatchNorm2d(out_planes, momentum=0.1), - # Replace with ReLU - nn.ReLU(inplace=False) - ) - - -class InvertedResidual(nn.Module): - def __init__(self, inp, oup, stride, expand_ratio): - super(InvertedResidual, self).__init__() - self.stride = stride - assert stride in [1, 2] - - hidden_dim = int(round(inp * expand_ratio)) - self.use_res_connect = self.stride == 1 and inp == oup - - layers = [] - if expand_ratio != 1: - # pw - layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1)) - layers.extend([ - # dw - ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim), - # pw-linear - nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), - nn.BatchNorm2d(oup, momentum=0.1), - ]) - self.conv = nn.Sequential(*layers) - # Replace torch.add with floatfunctional - self.skip_add = nn.quantized.FloatFunctional() - - def forward(self, x): - if self.use_res_connect: - return self.skip_add.add(x, self.conv(x)) - else: - return self.conv(x) - - -class MobileNetV2(nn.Module): - def __init__(self, num_classes=1000, width_mult=1.0, inverted_residual_setting=None, round_nearest=8): - """ - MobileNet V2 main class - - Args: - num_classes (int): Number of classes - width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount - inverted_residual_setting: Network structure - round_nearest (int): Round the number of channels in each layer to be a multiple of this number - Set to 1 to turn off rounding - """ - super(MobileNetV2, self).__init__() - block = InvertedResidual - input_channel = 32 
- last_channel = 1280 - - if inverted_residual_setting is None: - inverted_residual_setting = [ - # t, c, n, s - [1, 16, 1, 1], - [6, 24, 2, 2], - [6, 32, 3, 2], - [6, 64, 4, 2], - [6, 96, 3, 1], - [6, 160, 3, 2], - [6, 320, 1, 1], - ] - - # only check the first element, assuming user knows t,c,n,s are required - if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4: - raise ValueError("inverted_residual_setting should be non-empty " - "or a 4-element list, got {}".format(inverted_residual_setting)) - - # building first layer - input_channel = _make_divisible(input_channel * width_mult, round_nearest) - self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest) - features = [ConvBNReLU(3, input_channel, stride=2)] - # building inverted residual blocks - for t, c, n, s in inverted_residual_setting: - output_channel = _make_divisible(c * width_mult, round_nearest) - for i in range(n): - stride = s if i == 0 else 1 - features.append(block(input_channel, output_channel, stride, expand_ratio=t)) - input_channel = output_channel - # building last several layers - features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1)) - # make it nn.Sequential - self.features = nn.Sequential(*features) - self.quant = QuantStub() - self.dequant = DeQuantStub() - # building classifier - self.classifier = nn.Sequential( - nn.Dropout(0.2), - nn.Linear(self.last_channel, num_classes), - ) - - # weight initialization - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out') - if m.bias is not None: - nn.init.zeros_(m.bias) - elif isinstance(m, nn.BatchNorm2d): - nn.init.ones_(m.weight) - nn.init.zeros_(m.bias) - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, 0, 0.01) - nn.init.zeros_(m.bias) - - def forward(self, x): - - x = self.quant(x) - - x = self.features(x) - x = x.mean([2, 3]) - x = self.classifier(x) - x = self.dequant(x) - return x - - # Fuse Conv+BN and Conv+BN+Relu modules prior to quantization - # This operation does not change the numerics - def fuse_model(self): - for m in self.modules(): - if type(m) == ConvBNReLU: - torch.quantization.fuse_modules(m, ['0', '1', '2'], inplace=True) - if type(m) == InvertedResidual: - for idx in range(len(m.conv)): - if type(m.conv[idx]) == nn.Conv2d: - torch.quantization.fuse_modules(m.conv, [str(idx), str(idx + 1)], inplace=True) - -###################################################################### -# 2. Helper functions -# ------------------- -# -# We next define several helper functions to help with model evaluation. These mostly come from -# `here `_. 
- -class AverageMeter(object): - """Computes and stores the average and current value""" - def __init__(self, name, fmt=':f'): - self.name = name - self.fmt = fmt - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - def __str__(self): - fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' - return fmtstr.format(**self.__dict__) - - -def accuracy(output, target, topk=(1,)): - """Computes the accuracy over the k top predictions for the specified values of k""" - with torch.no_grad(): - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) - res.append(correct_k.mul_(100.0 / batch_size)) - return res - - -def evaluate(model, criterion, data_loader, neval_batches): - model.eval() - top1 = AverageMeter('Acc@1', ':6.2f') - top5 = AverageMeter('Acc@5', ':6.2f') - cnt = 0 - with torch.no_grad(): - for image, target in data_loader: - output = model(image) - loss = criterion(output, target) - cnt += 1 - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - print('.', end = '') - top1.update(acc1[0], image.size(0)) - top5.update(acc5[0], image.size(0)) - if cnt >= neval_batches: - return top1, top5 - - return top1, top5 - -def load_model(model_file): - model = MobileNetV2() - state_dict = torch.load(model_file) - model.load_state_dict(state_dict) - model.to('cpu') - return model - -def print_size_of_model(model): - torch.save(model.state_dict(), "temp.p") - print('Size (MB):', os.path.getsize("temp.p")/1e6) - os.remove('temp.p') - -###################################################################### -# 3. Define dataset and data loaders -# ---------------------------------- -# -# As our last major setup step, we define our dataloaders for our training and testing set. -# -# ImageNet Data -# ^^^^^^^^^^^^^ -# -# The specific dataset we've created for this tutorial contains just 1000 images from the ImageNet data, one from -# each class (this dataset, at just over 250 MB, is small enough that it can be downloaded -# relatively easily). The URL for this custom dataset is: -# -# .. code:: -# -# https://s3.amazonaws.com/pytorch-tutorial-assets/imagenet_1k.zip -# -# To download this data locally using Python, you could use: -# -# .. code:: python -# -# import requests -# -# url = 'https://s3.amazonaws.com/pytorch-tutorial-assets/imagenet_1k.zip` -# filename = '~/Downloads/imagenet_1k_data.zip' -# -# r = requests.get(url) -# -# with open(filename, 'wb') as f: -# f.write(r.content) -# -# For this tutorial to run, we download this data and move it to the right place using -# `these lines `_ -# from the `Makefile `_. -# -# To run the code in this tutorial using the entire ImageNet dataset, on the other hand, you could download -# the data using ``torchvision`` following -# `here `_. For example, -# to download the training set and apply some standard transformations to it, you could use: -# -# .. 
code:: python -# -# import torchvision -# import torchvision.transforms as transforms -# -# imagenet_dataset = torchvision.datasets.ImageNet( -# '~/.data/imagenet', -# split='train', -# download=True, -# transforms.Compose([ -# transforms.RandomResizedCrop(224), -# transforms.RandomHorizontalFlip(), -# transforms.ToTensor(), -# transforms.Normalize(mean=[0.485, 0.456, 0.406], -# std=[0.229, 0.224, 0.225]), -# ]) -# -# With the data downloaded, we show functions below that define dataloaders we'll use to read -# in this data. These functions mostly come from -# `here `_. - -def prepare_data_loaders(data_path): - - traindir = os.path.join(data_path, 'train') - valdir = os.path.join(data_path, 'val') - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - - dataset = torchvision.datasets.ImageFolder( - traindir, - transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - - dataset_test = torchvision.datasets.ImageFolder( - valdir, - transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - normalize, - ])) - - train_sampler = torch.utils.data.RandomSampler(dataset) - test_sampler = torch.utils.data.SequentialSampler(dataset_test) - - data_loader = torch.utils.data.DataLoader( - dataset, batch_size=train_batch_size, - sampler=train_sampler) - - data_loader_test = torch.utils.data.DataLoader( - dataset_test, batch_size=eval_batch_size, - sampler=test_sampler) - - return data_loader, data_loader_test - -###################################################################### -# Next, we'll load in the pre-trained MobileNetV2 model. We provide the URL to download the data from in ``torchvision`` -# `here `_. - -data_path = 'data/imagenet_1k' -saved_model_dir = 'data/' -float_model_file = 'mobilenet_pretrained_float.pth' -scripted_float_model_file = 'mobilenet_quantization_scripted.pth' -scripted_quantized_model_file = 'mobilenet_quantization_scripted_quantized.pth' - -train_batch_size = 30 -eval_batch_size = 30 - -data_loader, data_loader_test = prepare_data_loaders(data_path) -criterion = nn.CrossEntropyLoss() -float_model = load_model(saved_model_dir + float_model_file).to('cpu') - -###################################################################### -# Next, we'll "fuse modules"; this can both make the model faster by saving on memory access -# while also improving numerical accuracy. While this can be used with any model, this is -# especially common with quantized models. 
- -print('\n Inverted Residual Block: Before fusion \n\n', float_model.features[1].conv) -float_model.eval() - -# Fuses modules -float_model.fuse_model() - -# Note fusion of Conv+BN+Relu and Conv+Relu -print('\n Inverted Residual Block: After fusion\n\n',float_model.features[1].conv) - -###################################################################### -# Finally to get a "baseline" accuracy, let's see the accuracy of our un-quantized model -# with fused modules - -num_eval_batches = 10 - -print("Size of baseline model") -print_size_of_model(float_model) - -top1, top5 = evaluate(float_model, criterion, data_loader_test, neval_batches=num_eval_batches) -print('Evaluation accuracy on %d images, %2.2f'%(num_eval_batches * eval_batch_size, top1.avg)) -torch.jit.save(torch.jit.script(float_model), saved_model_dir + scripted_float_model_file) - -###################################################################### -# We see 78% accuracy on 300 images, a solid baseline for ImageNet, -# especially considering our model is just 14.0 MB. -# -# This will be our baseline to compare to. Next, let's try different quantization methods -# -# 4. Post-training static quantization -# ------------------------------------ -# -# Post-training static quantization involves not just converting the weights from float to int, -# as in dynamic quantization, but also performing the additional step of first feeding batches -# of data through the network and computing the resulting distributions of the different activations -# (specifically, this is done by inserting `observer` modules at different points that record this -# data). These distributions are then used to determine how the specifically the different activations -# should be quantized at inference time (a simple technique would be to simply divide the entire range -# of activations into 256 levels, but we support more sophisticated methods as well). Importantly, -# this additional step allows us to pass quantized values between operations instead of converting these -# values to floats - and then back to ints - between every operation, resulting in a significant speed-up. 
- -num_calibration_batches = 10 - -myModel = load_model(saved_model_dir + float_model_file).to('cpu') -myModel.eval() - -# Fuse Conv, bn and relu -myModel.fuse_model() - -# Specify quantization configuration -# Start with simple min/max range estimation and per-tensor quantization of weights -myModel.qconfig = torch.quantization.default_qconfig -print(myModel.qconfig) -torch.quantization.prepare(myModel, inplace=True) - -# Calibrate first -print('Post Training Quantization Prepare: Inserting Observers') -print('\n Inverted Residual Block:After observer insertion \n\n', myModel.features[1].conv) - -# Calibrate with the training set -evaluate(myModel, criterion, data_loader, neval_batches=num_calibration_batches) -print('Post Training Quantization: Calibration done') - -# Convert to quantized model -torch.quantization.convert(myModel, inplace=True) -print('Post Training Quantization: Convert done') -print('\n Inverted Residual Block: After fusion and quantization, note fused modules: \n\n',myModel.features[1].conv) - -print("Size of model after quantization") -print_size_of_model(myModel) - -top1, top5 = evaluate(myModel, criterion, data_loader_test, neval_batches=num_eval_batches) -print('Evaluation accuracy on %d images, %2.2f'%(num_eval_batches * eval_batch_size, top1.avg)) - -###################################################################### -# For this quantized model, we see a significantly lower accuracy of just ~62% on these same 300 -# images. Nevertheless, we did reduce the size of our model down to just under 3.6 MB, almost a 4x -# decrease. -# -# In addition, we can significantly improve on the accuracy simply by using a different -# quantization configuration. We repeat the same exercise with the recommended configuration for -# quantizing for x86 architectures. This configuration does the following: -# -# - Quantizes weights on a per-channel basis -# - Uses a histogram observer that collects a histogram of activations and then picks -# quantization parameters in an optimal manner. -# - -per_channel_quantized_model = load_model(saved_model_dir + float_model_file) -per_channel_quantized_model.eval() -per_channel_quantized_model.fuse_model() -per_channel_quantized_model.qconfig = torch.quantization.get_default_qconfig('fbgemm') -print(per_channel_quantized_model.qconfig) - -torch.quantization.prepare(per_channel_quantized_model, inplace=True) -evaluate(per_channel_quantized_model,criterion, data_loader, num_calibration_batches) -torch.quantization.convert(per_channel_quantized_model, inplace=True) -top1, top5 = evaluate(per_channel_quantized_model, criterion, data_loader_test, neval_batches=num_eval_batches) -print('Evaluation accuracy on %d images, %2.2f'%(num_eval_batches * eval_batch_size, top1.avg)) -torch.jit.save(torch.jit.script(per_channel_quantized_model), saved_model_dir + scripted_quantized_model_file) - -###################################################################### -# Changing just this quantization configuration method resulted in an increase -# of the accuracy to over 76%! Still, this is 1-2% worse than the baseline of 78% achieved above. -# So lets try quantization aware training. -# -# 5. Quantization-aware training -# ------------------------------ -# -# Quantization-aware training (QAT) is the quantization method that typically results in the highest accuracy. 
-# With QAT, all weights and activations are “fake quantized” during both the forward and backward passes of -# training: that is, float values are rounded to mimic int8 values, but all computations are still done with -# floating point numbers. Thus, all the weight adjustments during training are made while “aware” of the fact -# that the model will ultimately be quantized; after quantizing, therefore, this method will usually yield -# higher accuracy than either dynamic quantization or post-training static quantization. -# -# The overall workflow for actually performing QAT is very similar to before: -# -# - We can use the same model as before: there is no additional preparation needed for quantization-aware -# training. -# - We need to use a ``qconfig`` specifying what kind of fake-quantization is to be inserted after weights -# and activations, instead of specifying observers -# -# We first define a training function: - -def train_one_epoch(model, criterion, optimizer, data_loader, device, ntrain_batches): - model.train() - top1 = AverageMeter('Acc@1', ':6.2f') - top5 = AverageMeter('Acc@5', ':6.2f') - avgloss = AverageMeter('Loss', '1.5f') - - cnt = 0 - for image, target in data_loader: - start_time = time.time() - print('.', end = '') - cnt += 1 - image, target = image.to(device), target.to(device) - output = model(image) - loss = criterion(output, target) - optimizer.zero_grad() - loss.backward() - optimizer.step() - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - top1.update(acc1[0], image.size(0)) - top5.update(acc5[0], image.size(0)) - avgloss.update(loss, image.size(0)) - if cnt >= ntrain_batches: - print('Loss', avgloss.avg) - - print('Training: * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' - .format(top1=top1, top5=top5)) - return - - print('Full imagenet train set: * Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f}' - .format(top1=top1, top5=top5)) - return - -###################################################################### -# We fuse modules as before - -qat_model = load_model(saved_model_dir + float_model_file) -qat_model.fuse_model() - -optimizer = torch.optim.SGD(qat_model.parameters(), lr = 0.0001) -qat_model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm') - -###################################################################### -# Finally, ``prepare_qat`` performs the "fake quantization", preparing the model for quantization-aware -# training - -torch.quantization.prepare_qat(qat_model, inplace=True) -print('Inverted Residual Block: After preparation for QAT, note fake-quantization modules \n',qat_model.features[1].conv) - -###################################################################### -# Training a quantized model with high accuracy requires accurate modeling of numerics at -# inference. For quantization aware training, therefore, we modify the training loop by: -# -# - Switch batch norm to use running mean and variance towards the end of training to better -# match inference numerics. -# - We also freeze the quantizer parameters (scale and zero-point) and fine tune the weights. 
- -num_train_batches = 20 - -# Train and check accuracy after each epoch -for nepoch in range(8): - train_one_epoch(qat_model, criterion, optimizer, data_loader, torch.device('cpu'), num_train_batches) - if nepoch > 3: - # Freeze quantizer parameters - qat_model.apply(torch.quantization.disable_observer) - if nepoch > 2: - # Freeze batch norm mean and variance estimates - qat_model.apply(torch.nn.intrinsic.qat.freeze_bn_stats) - - # Check the accuracy after each epoch - quantized_model = torch.quantization.convert(qat_model.eval(), inplace=False) - quantized_model.eval() - top1, top5 = evaluate(quantized_model,criterion, data_loader_test, neval_batches=num_eval_batches) - print('Epoch %d :Evaluation accuracy on %d images, %2.2f'%(nepoch, num_eval_batches * eval_batch_size, top1.avg)) - -##################################################################### -# Here, we just perform quantization-aware training for a small number of epochs. Nevertheless, -# quantization-aware training yields an accuracy of over 71% on the entire imagenet dataset, -# which is close to the floating point accuracy of 71.9%. -# -# More on quantization-aware training: -# -# - QAT is a super-set of post training quant techniques that allows for more debugging. -# For example, we can analyze if the accuracy of the model is limited by weight or activation -# quantization. -# - We can also simulate the accuracy of a quantized model in floating point since -# we are using fake-quantization to model the numerics of actual quantized arithmetic. -# - We can mimic post training quantization easily too. -# -# Speedup from quantization -# ^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# Finally, let's confirm something we alluded to above: do our quantized models actually perform inference -# faster? Let's test: - -def run_benchmark(model_file, img_loader): - elapsed = 0 - model = torch.jit.load(model_file) - model.eval() - num_batches = 5 - # Run the scripted model on a few batches of images - for i, (images, target) in enumerate(img_loader): - if i < num_batches: - start = time.time() - output = model(images) - end = time.time() - elapsed = elapsed + (end-start) - else: - break - num_images = images.size()[0] * num_batches - - print('Elapsed time: %3.0f ms' % (elapsed/num_images*1000)) - return elapsed - -run_benchmark(saved_model_dir + scripted_float_model_file, data_loader_test) - -run_benchmark(saved_model_dir + scripted_quantized_model_file, data_loader_test) - -###################################################################### -# Running this locally on a MacBook pro yielded 61 ms for the regular model, and -# just 20 ms for the quantized model, illustrating the typical 2-4x speedup -# we see for quantized models compared to floating point ones. -# -# Conclusion -# ---------- -# -# In this tutorial, we showed two quantization methods - post-training static quantization, -# and quantization-aware training - describing what they do "under the hood" and how to use -# them in PyTorch. -# -# Thanks for reading! As always, we welcome any feedback, so please create an issue -# `here `_ if you have any. 
diff --git a/advanced_source/static_quantization_tutorial.rst b/advanced_source/static_quantization_tutorial.rst new file mode 100644 index 000000000..79f76b805 --- /dev/null +++ b/advanced_source/static_quantization_tutorial.rst @@ -0,0 +1,633 @@ +(beta) Static Quantization with Eager Mode in PyTorch +========================================================= +**Author**: `Raghuraman Krishnamoorthi `_ +**Edited by**: `Seth Weidman `_, `Jerry Zhang `_ + +This tutorial shows how to do post-training static quantization, as well as illustrating +two more advanced techniques - per-channel quantization and quantization-aware training - +to further improve the model's accuracy. Note that quantization is currently only supported +for CPUs, so we will not be utilizing GPUs / CUDA in this tutorial. +By the end of this tutorial, you will see how quantization in PyTorch can result in +significant decreases in model size while increasing speed. Furthermore, you'll see how +to easily apply some advanced quantization techniques shown +`here `_ so that your quantized models take much less +of an accuracy hit than they would otherwise. +Warning: we use a lot of boilerplate code from other PyTorch repos to, for example, +define the ``MobileNetV2`` model archtecture, define data loaders, and so on. We of course +encourage you to read it; but if you want to get to the quantization features, feel free +to skip to the "4. Post-training static quantization" section. +We'll start by doing the necessary imports: + +.. code:: python + + import numpy as np + import torch + import torch.nn as nn + import torchvision + from torch.utils.data import DataLoader + from torchvision import datasets + import torchvision.transforms as transforms + import os + import time + import sys + import torch.quantization + + # # Setup warnings + import warnings + warnings.filterwarnings( + action='ignore', + category=DeprecationWarning, + module=r'.*' + ) + warnings.filterwarnings( + action='default', + module=r'torch.quantization' + ) + + # Specify random seed for repeatable results + torch.manual_seed(191009) + +1. Model architecture +--------------------- + +We first define the MobileNetV2 model architecture, with several notable modifications +to enable quantization: + +- Replacing addition with ``nn.quantized.FloatFunctional`` +- Insert ``QuantStub`` and ``DeQuantStub`` at the beginning and end of the network. +- Replace ReLU6 with ReLU + +Note: this code is taken from +`here `_. + +.. code:: python + + from torch.quantization import QuantStub, DeQuantStub + + def _make_divisible(v, divisor, min_value=None): + """ + This function is taken from the original tf repo. + It ensures that all layers have a channel number that is divisible by 8 + It can be seen here: + https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py + :param v: + :param divisor: + :param min_value: + :return: + """ + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. 
+ if new_v < 0.9 * v: + new_v += divisor + return new_v + + + class ConvBNReLU(nn.Sequential): + def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1): + padding = (kernel_size - 1) // 2 + super(ConvBNReLU, self).__init__( + nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False), + nn.BatchNorm2d(out_planes, momentum=0.1), + # Replace with ReLU + nn.ReLU(inplace=False) + ) + + + class InvertedResidual(nn.Module): + def __init__(self, inp, oup, stride, expand_ratio): + super(InvertedResidual, self).__init__() + self.stride = stride + assert stride in [1, 2] + + hidden_dim = int(round(inp * expand_ratio)) + self.use_res_connect = self.stride == 1 and inp == oup + + layers = [] + if expand_ratio != 1: + # pw + layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1)) + layers.extend([ + # dw + ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim), + # pw-linear + nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup, momentum=0.1), + ]) + self.conv = nn.Sequential(*layers) + # Replace torch.add with floatfunctional + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, x): + if self.use_res_connect: + return self.skip_add.add(x, self.conv(x)) + else: + return self.conv(x) + + + class MobileNetV2(nn.Module): + def __init__(self, num_classes=1000, width_mult=1.0, inverted_residual_setting=None, round_nearest=8): + """ + MobileNet V2 main class + Args: + num_classes (int): Number of classes + width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount + inverted_residual_setting: Network structure + round_nearest (int): Round the number of channels in each layer to be a multiple of this number + Set to 1 to turn off rounding + """ + super(MobileNetV2, self).__init__() + block = InvertedResidual + input_channel = 32 + last_channel = 1280 + + if inverted_residual_setting is None: + inverted_residual_setting = [ + # t, c, n, s + [1, 16, 1, 1], + [6, 24, 2, 2], + [6, 32, 3, 2], + [6, 64, 4, 2], + [6, 96, 3, 1], + [6, 160, 3, 2], + [6, 320, 1, 1], + ] + + # only check the first element, assuming user knows t,c,n,s are required + if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4: + raise ValueError("inverted_residual_setting should be non-empty " + "or a 4-element list, got {}".format(inverted_residual_setting)) + + # building first layer + input_channel = _make_divisible(input_channel * width_mult, round_nearest) + self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest) + features = [ConvBNReLU(3, input_channel, stride=2)] + # building inverted residual blocks + for t, c, n, s in inverted_residual_setting: + output_channel = _make_divisible(c * width_mult, round_nearest) + for i in range(n): + stride = s if i == 0 else 1 + features.append(block(input_channel, output_channel, stride, expand_ratio=t)) + input_channel = output_channel + # building last several layers + features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1)) + # make it nn.Sequential + self.features = nn.Sequential(*features) + self.quant = QuantStub() + self.dequant = DeQuantStub() + # building classifier + self.classifier = nn.Sequential( + nn.Dropout(0.2), + nn.Linear(self.last_channel, num_classes), + ) + + # weight initialization + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out') + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, 
nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + nn.init.zeros_(m.bias) + + def forward(self, x): + + x = self.quant(x) + + x = self.features(x) + x = x.mean([2, 3]) + x = self.classifier(x) + x = self.dequant(x) + return x + + # Fuse Conv+BN and Conv+BN+Relu modules prior to quantization + # This operation does not change the numerics + def fuse_model(self): + for m in self.modules(): + if type(m) == ConvBNReLU: + torch.quantization.fuse_modules(m, ['0', '1', '2'], inplace=True) + if type(m) == InvertedResidual: + for idx in range(len(m.conv)): + if type(m.conv[idx]) == nn.Conv2d: + torch.quantization.fuse_modules(m.conv, [str(idx), str(idx + 1)], inplace=True) + +2. Helper functions +------------------- + +We next define several helper functions to help with model evaluation. These mostly come from +`here `_. + +.. code:: python + + class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self, name, fmt=':f'): + self.name = name + self.fmt = fmt + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + return fmtstr.format(**self.__dict__) + + + def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + + def evaluate(model, criterion, data_loader, neval_batches): + model.eval() + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + cnt = 0 + with torch.no_grad(): + for image, target in data_loader: + output = model(image) + loss = criterion(output, target) + cnt += 1 + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + print('.', end = '') + top1.update(acc1[0], image.size(0)) + top5.update(acc5[0], image.size(0)) + if cnt >= neval_batches: + return top1, top5 + + return top1, top5 + + def load_model(model_file): + model = MobileNetV2() + state_dict = torch.load(model_file) + model.load_state_dict(state_dict) + model.to('cpu') + return model + + def print_size_of_model(model): + torch.save(model.state_dict(), "temp.p") + print('Size (MB):', os.path.getsize("temp.p")/1e6) + os.remove('temp.p') + +3. Define dataset and data loaders +---------------------------------- + +As our last major setup step, we define our dataloaders for our training and testing set. + +ImageNet Data +^^^^^^^^^^^^^ + +To run the code in this tutorial using the entire ImageNet dataset, first download imagenet by following the instructions at here `ImageNet Data `_. Unzip the downloaded file into the 'data_path' folder. + +With the data downloaded, we show functions below that define dataloaders we'll use to read +in this data. These functions mostly come from +`here `_. + + +.. 
code:: python + + def prepare_data_loaders(data_path): + + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + dataset = torchvision.datasets.ImageNet( + data_path, split="train", + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + dataset_test = torchvision.datasets.ImageNet( + data_path, split="val", + transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])) + + train_sampler = torch.utils.data.RandomSampler(dataset) + test_sampler = torch.utils.data.SequentialSampler(dataset_test) + + data_loader = torch.utils.data.DataLoader( + dataset, batch_size=train_batch_size, + sampler=train_sampler) + + data_loader_test = torch.utils.data.DataLoader( + dataset_test, batch_size=eval_batch_size, + sampler=test_sampler) + + return data_loader, data_loader_test + + +Next, we'll load in the pre-trained MobileNetV2 model. We provide the URL to download the data from in ``torchvision`` +`here `_. + +.. code:: python + + data_path = '~/.data/imagenet' + saved_model_dir = 'data/' + float_model_file = 'mobilenet_pretrained_float.pth' + scripted_float_model_file = 'mobilenet_quantization_scripted.pth' + scripted_quantized_model_file = 'mobilenet_quantization_scripted_quantized.pth' + + train_batch_size = 30 + eval_batch_size = 50 + + data_loader, data_loader_test = prepare_data_loaders(data_path) + criterion = nn.CrossEntropyLoss() + float_model = load_model(saved_model_dir + float_model_file).to('cpu') + + # Next, we'll "fuse modules"; this can both make the model faster by saving on memory access + # while also improving numerical accuracy. While this can be used with any model, this is + # especially common with quantized models. + + print('\n Inverted Residual Block: Before fusion \n\n', float_model.features[1].conv) + float_model.eval() + + # Fuses modules + float_model.fuse_model() + + # Note fusion of Conv+BN+Relu and Conv+Relu + print('\n Inverted Residual Block: After fusion\n\n',float_model.features[1].conv) + + +Finally to get a "baseline" accuracy, let's see the accuracy of our un-quantized model +with fused modules + +.. code:: python + + num_eval_batches = 1000 + + print("Size of baseline model") + print_size_of_model(float_model) + + top1, top5 = evaluate(float_model, criterion, data_loader_test, neval_batches=num_eval_batches) + print('Evaluation accuracy on %d images, %2.2f'%(num_eval_batches * eval_batch_size, top1.avg)) + torch.jit.save(torch.jit.script(float_model), saved_model_dir + scripted_float_model_file) + + +On the entire model, we get an accuracy of 71.9% on the eval dataset of 50,000 images. + +This will be our baseline to compare to. Next, let's try different quantization methods + +4. Post-training static quantization +------------------------------------ + +Post-training static quantization involves not just converting the weights from float to int, +as in dynamic quantization, but also performing the additional step of first feeding batches +of data through the network and computing the resulting distributions of the different activations +(specifically, this is done by inserting `observer` modules at different points that record this +data). 
These distributions are then used to determine how the specifically the different activations +should be quantized at inference time (a simple technique would be to simply divide the entire range +of activations into 256 levels, but we support more sophisticated methods as well). Importantly, +this additional step allows us to pass quantized values between operations instead of converting these +values to floats - and then back to ints - between every operation, resulting in a significant speed-up. + +.. code:: python + + num_calibration_batches = 32 + + myModel = load_model(saved_model_dir + float_model_file).to('cpu') + myModel.eval() + + # Fuse Conv, bn and relu + myModel.fuse_model() + + # Specify quantization configuration + # Start with simple min/max range estimation and per-tensor quantization of weights + myModel.qconfig = torch.quantization.default_qconfig + print(myModel.qconfig) + torch.quantization.prepare(myModel, inplace=True) + + # Calibrate first + print('Post Training Quantization Prepare: Inserting Observers') + print('\n Inverted Residual Block:After observer insertion \n\n', myModel.features[1].conv) + + # Calibrate with the training set + evaluate(myModel, criterion, data_loader, neval_batches=num_calibration_batches) + print('Post Training Quantization: Calibration done') + + # Convert to quantized model + torch.quantization.convert(myModel, inplace=True) + print('Post Training Quantization: Convert done') + print('\n Inverted Residual Block: After fusion and quantization, note fused modules: \n\n',myModel.features[1].conv) + + print("Size of model after quantization") + print_size_of_model(myModel) + + top1, top5 = evaluate(myModel, criterion, data_loader_test, neval_batches=num_eval_batches) + print('Evaluation accuracy on %d images, %2.2f'%(num_eval_batches * eval_batch_size, top1.avg)) + +For this quantized model, we see an accuracy of 56.7% on the eval dataset. This is because we used a simple min/max observer to determine quantization parameters. Nevertheless, we did reduce the size of our model down to just under 3.6 MB, almost a 4x decrease. + +In addition, we can significantly improve on the accuracy simply by using a different +quantization configuration. We repeat the same exercise with the recommended configuration for +quantizing for x86 architectures. This configuration does the following: + +- Quantizes weights on a per-channel basis +- Uses a histogram observer that collects a histogram of activations and then picks + quantization parameters in an optimal manner. + +.. code:: python + + per_channel_quantized_model = load_model(saved_model_dir + float_model_file) + per_channel_quantized_model.eval() + per_channel_quantized_model.fuse_model() + per_channel_quantized_model.qconfig = torch.quantization.get_default_qconfig('fbgemm') + print(per_channel_quantized_model.qconfig) + + torch.quantization.prepare(per_channel_quantized_model, inplace=True) + evaluate(per_channel_quantized_model,criterion, data_loader, num_calibration_batches) + torch.quantization.convert(per_channel_quantized_model, inplace=True) + top1, top5 = evaluate(per_channel_quantized_model, criterion, data_loader_test, neval_batches=num_eval_batches) + print('Evaluation accuracy on %d images, %2.2f'%(num_eval_batches * eval_batch_size, top1.avg)) + torch.jit.save(torch.jit.script(per_channel_quantized_model), saved_model_dir + scripted_quantized_model_file) + + +Changing just this quantization configuration method resulted in an increase +of the accuracy to over 67.3%! 
Still, this is 4% worse than the baseline of 71.9% achieved above. +So lets try quantization aware training. + +5. Quantization-aware training +------------------------------ + +Quantization-aware training (QAT) is the quantization method that typically results in the highest accuracy. +With QAT, all weights and activations are “fake quantized” during both the forward and backward passes of +training: that is, float values are rounded to mimic int8 values, but all computations are still done with +floating point numbers. Thus, all the weight adjustments during training are made while “aware” of the fact +that the model will ultimately be quantized; after quantizing, therefore, this method will usually yield +higher accuracy than either dynamic quantization or post-training static quantization. + +The overall workflow for actually performing QAT is very similar to before: + +- We can use the same model as before: there is no additional preparation needed for quantization-aware + training. +- We need to use a ``qconfig`` specifying what kind of fake-quantization is to be inserted after weights + and activations, instead of specifying observers + +We first define a training function: + +.. code:: python + + def train_one_epoch(model, criterion, optimizer, data_loader, device, ntrain_batches): + model.train() + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + avgloss = AverageMeter('Loss', '1.5f') + + cnt = 0 + for image, target in data_loader: + start_time = time.time() + print('.', end = '') + cnt += 1 + image, target = image.to(device), target.to(device) + output = model(image) + loss = criterion(output, target) + optimizer.zero_grad() + loss.backward() + optimizer.step() + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + top1.update(acc1[0], image.size(0)) + top5.update(acc5[0], image.size(0)) + avgloss.update(loss, image.size(0)) + if cnt >= ntrain_batches: + print('Loss', avgloss.avg) + + print('Training: * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' + .format(top1=top1, top5=top5)) + return + + print('Full imagenet train set: * Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f}' + .format(top1=top1, top5=top5)) + return + + +We fuse modules as before + +.. code:: python + + qat_model = load_model(saved_model_dir + float_model_file) + qat_model.fuse_model() + + optimizer = torch.optim.SGD(qat_model.parameters(), lr = 0.0001) + qat_model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm') + +Finally, ``prepare_qat`` performs the "fake quantization", preparing the model for quantization-aware training + +.. code:: python + + torch.quantization.prepare_qat(qat_model, inplace=True) + print('Inverted Residual Block: After preparation for QAT, note fake-quantization modules \n',qat_model.features[1].conv) + +Training a quantized model with high accuracy requires accurate modeling of numerics at +inference. For quantization aware training, therefore, we modify the training loop by: + +- Switch batch norm to use running mean and variance towards the end of training to better + match inference numerics. +- We also freeze the quantizer parameters (scale and zero-point) and fine tune the weights. + +.. code:: python + + num_train_batches = 20 + + # QAT takes time and one needs to train over a few epochs. 
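+    # Each epoch below is capped at num_train_batches batches (see
+    # train_one_epoch above) so that this tutorial runs in a reasonable time.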
+ # Train and check accuracy after each epoch + for nepoch in range(8): + train_one_epoch(qat_model, criterion, optimizer, data_loader, torch.device('cpu'), num_train_batches) + if nepoch > 3: + # Freeze quantizer parameters + qat_model.apply(torch.quantization.disable_observer) + if nepoch > 2: + # Freeze batch norm mean and variance estimates + qat_model.apply(torch.nn.intrinsic.qat.freeze_bn_stats) + + # Check the accuracy after each epoch + quantized_model = torch.quantization.convert(qat_model.eval(), inplace=False) + quantized_model.eval() + top1, top5 = evaluate(quantized_model,criterion, data_loader_test, neval_batches=num_eval_batches) + print('Epoch %d :Evaluation accuracy on %d images, %2.2f'%(nepoch, num_eval_batches * eval_batch_size, top1.avg)) + +Quantization-aware training yields an accuracy of over 71.5% on the entire imagenet dataset, which is close to the floating point accuracy of 71.9%. + +More on quantization-aware training: + +- QAT is a super-set of post training quant techniques that allows for more debugging. + For example, we can analyze if the accuracy of the model is limited by weight or activation + quantization. +- We can also simulate the accuracy of a quantized model in floating point since + we are using fake-quantization to model the numerics of actual quantized arithmetic. +- We can mimic post training quantization easily too. + +Speedup from quantization +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Finally, let's confirm something we alluded to above: do our quantized models actually perform inference +faster? Let's test: + +.. code:: python + + def run_benchmark(model_file, img_loader): + elapsed = 0 + model = torch.jit.load(model_file) + model.eval() + num_batches = 5 + # Run the scripted model on a few batches of images + for i, (images, target) in enumerate(img_loader): + if i < num_batches: + start = time.time() + output = model(images) + end = time.time() + elapsed = elapsed + (end-start) + else: + break + num_images = images.size()[0] * num_batches + + print('Elapsed time: %3.0f ms' % (elapsed/num_images*1000)) + return elapsed + + run_benchmark(saved_model_dir + scripted_float_model_file, data_loader_test) + + run_benchmark(saved_model_dir + scripted_quantized_model_file, data_loader_test) + +Running this locally on a MacBook pro yielded 61 ms for the regular model, and +just 20 ms for the quantized model, illustrating the typical 2-4x speedup +we see for quantized models compared to floating point ones. + +Conclusion +---------- + +In this tutorial, we showed two quantization methods - post-training static quantization, +and quantization-aware training - describing what they do "under the hood" and how to use +them in PyTorch. + +Thanks for reading! As always, we welcome any feedback, so please create an issue +`here `_ if you have any. 
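+
+As a quick reference, here is a minimal sketch of the eager-mode post-training
+static quantization workflow used above. It assumes ``model`` is a float module
+that defines ``fuse_model()`` and wraps its ``forward`` with ``QuantStub`` /
+``DeQuantStub`` (as ``MobileNetV2`` above does), and that ``calibration_loader``
+yields representative input batches.
+
+.. code:: python
+
+    model.eval()
+    model.fuse_model()                                                 # fuse Conv+BN(+ReLU)
+    model.qconfig = torch.quantization.get_default_qconfig('fbgemm')   # per-channel weights, histogram observers
+    torch.quantization.prepare(model, inplace=True)                    # insert observers
+    with torch.no_grad():
+        for images, _ in calibration_loader:                           # calibrate
+            model(images)
+    torch.quantization.convert(model, inplace=True)                    # swap in quantized modules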
\ No newline at end of file diff --git a/advanced_source/transformer__timeseries_cpp_tutorial/CMakeLists.txt b/advanced_source/transformer__timeseries_cpp_tutorial/CMakeLists.txt new file mode 100644 index 000000000..a8246f688 --- /dev/null +++ b/advanced_source/transformer__timeseries_cpp_tutorial/CMakeLists.txt @@ -0,0 +1,8 @@ +cmake_minimum_required(VERSION 3.0 FATAL_ERROR) +project(custom_ops) + +find_package(Torch REQUIRED) + +add_executable(transformer_ts transformer_timeseries.cpp) +target_link_libraries(transformer_ts "${TORCH_LIBRARIES}") +set_property(TARGET transformer_ts PROPERTY CXX_STANDARD 14) diff --git a/advanced_source/transformer__timeseries_cpp_tutorial/scheduler.h b/advanced_source/transformer__timeseries_cpp_tutorial/scheduler.h new file mode 100644 index 000000000..9daabd519 --- /dev/null +++ b/advanced_source/transformer__timeseries_cpp_tutorial/scheduler.h @@ -0,0 +1,112 @@ +// Copyright 2020-present pytorch-cpp Authors +#pragma once + +#include +#include +#include + +namespace scheduler { +template +struct OptimizerOptionsMap { +}; + +template<> +struct OptimizerOptionsMap { + using type = torch::optim::AdamOptions; +}; + +template<> +struct OptimizerOptionsMap { + using type = torch::optim::AdagradOptions; +}; + +template<> +struct OptimizerOptionsMap { + using type = torch::optim::LBFGSOptions; +}; + +template<> +struct OptimizerOptionsMap { + using type = torch::optim::RMSpropOptions; +}; + +template<> +struct OptimizerOptionsMap { + using type = torch::optim::SGDOptions; +}; + +/** + * Learning rate scheduler base. + * + * Based on the Python implementation at + * https://github.com/pytorch/pytorch/blob/master/torch/optim/lr_scheduler.py. + * @tparam TOptimizer Optimizer type + */ +template +class LRScheduler { + public: + explicit LRScheduler(TOptimizer& optimizer, int64_t last_epoch = -1) + : optimizer_(optimizer), last_epoch_(last_epoch), base_lrs(get_current_lr()) {} + + virtual std::vector get_lr() = 0; + + void step() { + ++last_epoch_; + + const auto values = get_lr(); + auto ¶m_groups = optimizer_.param_groups(); + + for (decltype(param_groups.size()) i = 0; i != param_groups.size(); ++i) { + dynamic_cast::type &>(param_groups[i].options()).lr(values[i]); + } + } + + virtual ~LRScheduler() = default; + + protected: + TOptimizer& optimizer_; + int64_t last_epoch_; + std::vector base_lrs; + + std::vector get_current_lr() { + std::vector lrs; + lrs.reserve(optimizer_.param_groups().size()); + + for (auto ¶m_group : optimizer_.param_groups()) { + lrs.push_back(dynamic_cast::type &>(param_group.options()).lr()); + } + + return lrs; + } +}; + +/** + * Step learning rate scheduler. + * + * Based on the python implementation at + * https://github.com/pytorch/pytorch/blob/master/torch/optim/lr_scheduler.py. 
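+ *
+ * Multiplies the learning rate of every parameter group by gamma once every
+ * step_size calls to step() (typically called once per epoch), e.g.
+ *   scheduler::StepLR<torch::optim::SGD> lr_sched(optimizer, /*step_size=*/1, /*gamma=*/0.95);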
+ * @tparam TOptimizer Optimizer type + */ +template +class StepLR : public LRScheduler { + public: + StepLR(TOptimizer& optimizer, int64_t step_size, double gamma = 0.1, int64_t last_epoch = -1) + : LRScheduler(optimizer, last_epoch), step_size_(step_size), gamma_(gamma) {} + + std::vector get_lr() override { + auto new_lr = this->get_current_lr(); + + if (this->last_epoch_ != 0 && (this->last_epoch_ % step_size_ == 0)) { + std::transform(new_lr.cbegin(), new_lr.cend(), new_lr.begin(), + [gamma_ = gamma_](auto value) { return value * gamma_; }); + } + + return new_lr; + } + + private: + int64_t step_size_; + double gamma_; +}; +} // namespace scheduler \ No newline at end of file diff --git a/advanced_source/transformer__timeseries_cpp_tutorial/transformer_timeseries.cpp b/advanced_source/transformer__timeseries_cpp_tutorial/transformer_timeseries.cpp new file mode 100644 index 000000000..8bd9ebe8c --- /dev/null +++ b/advanced_source/transformer__timeseries_cpp_tutorial/transformer_timeseries.cpp @@ -0,0 +1,240 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "scheduler.h" + +using namespace torch::indexing; + +struct PositionalEncodingImpl : torch::nn::Module{ + PositionalEncodingImpl(){ + + } + PositionalEncodingImpl(int64_t d_model, int64_t max_len=5000){ + pe = torch::zeros({max_len, d_model}); + position = torch::arange(0, max_len, + torch::TensorOptions(torch::kFloat32).requires_grad(false)); + position = position.unsqueeze(1); + torch::Tensor temp = torch::arange(0, d_model, 2, torch::TensorOptions(torch::kFloat32).requires_grad(false)); + div_term = torch::exp(temp * (std::log(10000.0) / d_model)); + + + pe.index_put_({Slice(), Slice(0, None, 2)}, torch::sin(position * div_term)); + pe.index_put_({Slice(), Slice(1, None, 2)}, torch::cos(position * div_term)); + + + + pe = pe.unsqueeze(0).transpose(0, 1); + register_parameter("pe", pe); + register_parameter("position", position); + register_parameter("div_term", div_term); + register_buffer("pe", pe); + } + + torch::Tensor forward(torch::Tensor x){ + x = x + pe.index({Slice(0, x.size(0)), Slice()}); + return x; + } + + torch::Tensor pe; + torch::Tensor position; + torch::Tensor div_term; +}; + +TORCH_MODULE(PositionalEncoding); + +struct TransformerModel : torch::nn::Module{ + TransformerModel(int64_t feature_size = 250, int64_t nlayers = 1, float dropout_p=0.1){ + pos_encoder = PositionalEncoding(feature_size); + torch::nn::TransformerEncoderLayerOptions elOptions = + torch::nn::TransformerEncoderLayerOptions(feature_size, 10); + torch::nn::TransformerEncoderLayer encoder_layers = torch::nn::TransformerEncoderLayer( + elOptions.dropout(dropout_p)); + torch::nn::TransformerEncoderOptions enOptions = torch::nn::TransformerEncoderOptions(encoder_layers, nlayers); + transformer_encoder = torch::nn::TransformerEncoder(enOptions); + decoder = torch::nn::Linear(feature_size, 1); + register_module("pos_encoder", pos_encoder); + register_module("transformer_encoder", transformer_encoder); + register_module("decoder", decoder); + } + + void init_weights(){ + float initrange = 0.1; + decoder->bias.data().zero_(); + decoder->weight.data().uniform_(-initrange, initrange); + } + + torch::Tensor _generate_square_subsequent_mask(int sz){ + torch::Tensor mask = (torch::triu(torch::ones({sz, sz})) == 1).transpose(0, 1).to(torch::kFloat32); + mask = mask.masked_fill(mask == 0, -std::numeric_limits::infinity()).masked_fill(mask == 1, 0.f); + return mask; + } + + torch::Tensor forward(torch::Tensor src){ + 
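+        // The causal ("square subsequent") mask is built lazily on the first
+        // forward pass and cached in src_mask, so each position can only
+        // attend to itself and earlier time steps.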
if (false == is_mask_generated){ + torch::Tensor mask = _generate_square_subsequent_mask(src.size(0)).to(src.device()); + src_mask = mask; + is_mask_generated = true; + } + + src = pos_encoder(src); + torch::Tensor output = transformer_encoder(src, src_mask); + output = decoder(output); + return output; + } + + torch::Tensor src_mask; + bool is_mask_generated = false; + PositionalEncoding pos_encoder; + torch::nn::TransformerEncoder transformer_encoder = nullptr; + torch::nn::Linear decoder = nullptr; + int64_t ninp; +}; + +torch::Tensor create_inout_sequences(torch::Tensor input_data, int64_t tw, int64_t output_window = 1){ + torch::Tensor temp = torch::empty({input_data.size(0) - tw, 2, tw}, torch::TensorOptions(torch::kFloat32)); + auto len = input_data.numel(); + auto max_counter = len - tw; + int64_t k = 0; + for (auto i = 0; i < max_counter; i++){ + torch::Tensor train_seq = input_data.index({Slice(i, i + tw)}); + temp[i][0] = input_data.index({Slice(i, i + tw)}); + temp[i][1] = input_data.index({Slice(i + output_window, i + tw + output_window)}); + + } + + return temp; +} + +std::tuple get_data(int64_t output_window = 1){ + //construct a little toy dataset + auto time = torch::arange(0, 400, 0.1); + auto amplitude = torch::sin(time) + torch::sin(time * 0.05) + torch::sin(time * 0.12);// + dist(mt); + + + //from sklearn.preprocessing import MinMaxScaler + + + //looks like normalizing input values curtial for the model + //scaler = MinMaxScaler(feature_range=(-1, 1)) + //amplitude = scaler.fit_transform(series.to_numpy().reshape(-1, 1)).reshape(-1) + //amplitude = scaler.fit_transform(amplitude.reshape(-1, 1)).reshape(-1) + + + auto samples = 2600; + + auto train_data = amplitude.index({Slice(None, samples)}); + auto test_data = amplitude.index({Slice(samples, None)}); + + //convert our train data into a pytorch train tensor + auto input_window = 100; + + auto train_sequence = create_inout_sequences(train_data,input_window); + train_sequence = train_sequence.index({Slice(None,-output_window)}); + + auto test_sequence = create_inout_sequences(test_data,input_window); + test_sequence = test_sequence.index({Slice(None,-output_window)}); + + auto cuda_available = torch::cuda::is_available(); + torch::Device device(cuda_available ? 
torch::kCUDA : torch::kCPU); + + return std::make_tuple(train_sequence.to(device),test_sequence.to(device)); +} + +std::tuple get_batch(torch::Tensor source, int64_t i, int64_t batch_size, int64_t input_window = 100){ + auto seq_len = std::min(batch_size, source.size(0) - i); + + auto data = source.index({Slice(i, i + seq_len)}); + auto input = data.index({Slice(), 0, Slice()}); + auto target = data.index({Slice(), 1, Slice()}); + auto temp = input.numel()/100; + if (temp > 10) + temp = 10; + input = torch::reshape(input, {100, temp, 1}); + target = torch::reshape(target, {100, temp, 1}); + return std::make_tuple(input, target); +} + + +void train(TransformerModel model, torch::Tensor train_data, int64_t num_epochs = 100){ + model.train(); + auto total_loss = 0.0; + auto start_time = std::chrono::system_clock::now(); + auto batch_size = 10; + auto batch = 0; + + torch::nn::MSELoss criterion; + + + auto learning_rate = 0.005; + torch::optim::SGD optimizer(model.parameters(), torch::optim::SGDOptions(learning_rate)); + scheduler::StepLR scheduler(optimizer, 1.0, 0.95); + + for(int64_t i = 0; i <= num_epochs; i++){ + auto start_time = std::chrono::system_clock::now(); + std::cout<<"Epoch "<(data)); + + auto loss = criterion(output, std::get<1>(data)); + loss.backward(); + torch::nn::utils::clip_grad_norm_(model.parameters(), 0.7); + optimizer.step(); + total_loss += loss.item(); + auto log_interval = int(train_data.size(0)) / (batch_size * 5); + if (batch != 0 && 0 == batch % log_interval){ + auto curr_loss = total_loss / log_interval; + auto elapsed = std::chrono::system_clock::now() - start_time; + std::cout<<"|epoch "<(data)); + auto loss = criterion(output, std::get<1>(data)); + total_loss += loss.item(); + } + + std::cout<<"Evaluation Loss: "<(data)); + evaluate(model, std::get<1>(data)); + + return 0; + +} + diff --git a/beginner_source/Intro_to_TorchScript_tutorial.py b/beginner_source/Intro_to_TorchScript_tutorial.py index 0835d9bc1..71c7eb72f 100644 --- a/beginner_source/Intro_to_TorchScript_tutorial.py +++ b/beginner_source/Intro_to_TorchScript_tutorial.py @@ -75,7 +75,7 @@ def forward(self, x, h): # cell `__ 의 # 일종입니다. 즉, 반복(loop)에 적용되는 함수입니다. # -# 모듈을 인스턴스화하고, 3x4 크기의 무작위 값들로 이루어진 행렬 ``x`` 와 ``y`` 를 +# 모듈을 인스턴스화하고, 3x4 크기의 무작위 값들로 이루어진 행렬 ``x`` 와 ``h`` 를 # 만들었습니다. # 그런 다음, ``my_cell(x, h)`` 를 이용해 cell을 호출했습니다. 이것은 ``forward`` # 함수를 호출합니다. @@ -187,6 +187,8 @@ def forward(self, x, h): my_cell = MyCell() x, h = torch.rand(3, 4), torch.rand(3, 4) traced_cell = torch.jit.trace(my_cell, (x, h)) + +print(traced_cell.dg.code) print(traced_cell) traced_cell(x, h) @@ -280,8 +282,10 @@ def forward(self, x, h): scripted_gate = torch.jit.script(MyDecisionGate()) my_cell = MyCell(scripted_gate) -traced_cell = torch.jit.script(my_cell) -print(traced_cell.code) +scripted_cell = torch.jit.script(my_cell) + +print(scripted_gate.code) +print(scripted_cell.code) ###################################################################### @@ -353,9 +357,9 @@ def forward(self, xs): # 랩핑 된 RNN 모듈을 저장하고 로드해 봅시다: # -traced.save('wrapped_rnn.zip') +traced.save('wrapped_rnn.pt') -loaded = torch.jit.load('wrapped_rnn.zip') +loaded = torch.jit.load('wrapped_rnn.pt') print(loaded) print(loaded.code) diff --git a/beginner_source/README.txt b/beginner_source/README.txt index 8c4cde443..7d2b1f37c 100644 --- a/beginner_source/README.txt +++ b/beginner_source/README.txt @@ -23,4 +23,4 @@ Beginner Tutorials 6. 
transformer_translation.py Language Translation with Transformers - https://pytorch.org/tutorials/beginner/transformer_translation.html \ No newline at end of file + https://pytorch.org/tutorials/beginner/transformer_tutorial.html \ No newline at end of file diff --git a/beginner_source/audio_preprocessing_tutorial.py b/beginner_source/audio_preprocessing_tutorial.py index 01c5a4a74..8b29b07bb 100644 --- a/beginner_source/audio_preprocessing_tutorial.py +++ b/beginner_source/audio_preprocessing_tutorial.py @@ -1,375 +1,1235 @@ """ -torchaudio Tutorial -=================== +Audio manipulation with torchaudio +================================== -PyTorch is an open source deep learning platform that provides a -seamless path from research prototyping to production deployment with -GPU support. +``torchaudio`` provides powerful audio I/O functions, preprocessing +transforms and dataset. -Significant effort in solving machine learning problems goes into data -preparation. ``torchaudio`` leverages PyTorch’s GPU support, and provides -many tools to make data loading easy and more readable. In this -tutorial, we will see how to load and preprocess data from a simple -dataset. - -For this tutorial, please make sure the ``matplotlib`` package is -installed for easier visualization. +In this tutorial, we will look into how to prepare audio data and +extract features that can be fed to NN models. """ +# When running this tutorial in Google Colab, install the required packages +# with the following. +# !pip install torchaudio librosa boto3 + import torch import torchaudio +import torchaudio.functional as F +import torchaudio.transforms as T + +print(torch.__version__) +print(torchaudio.__version__) + + +###################################################################### +# Preparing data and utility functions (skip this section) +# -------------------------------------------------------- +# + +#@title Prepare data and utility functions. {display-mode: "form"} +#@markdown +#@markdown You do not need to look into this cell. +#@markdown Just execute once and you are good to go. +#@markdown +#@markdown In this tutorial, we will use a speech data from [VOiCES dataset](https://iqtlabs.github.io/voices/), which is licensed under Creative Commos BY 4.0. + +#------------------------------------------------------------------------------- +# Preparation of data and helper functions. 
+#------------------------------------------------------------------------------- +import io +import os +import math +import tarfile +import multiprocessing + +import scipy +import librosa +import boto3 +from botocore import UNSIGNED +from botocore.config import Config +import requests +import matplotlib import matplotlib.pyplot as plt +from IPython.display import Audio, display + +[width, height] = matplotlib.rcParams['figure.figsize'] +if width < 10: + matplotlib.rcParams['figure.figsize'] = [width * 2.5, height] + +_SAMPLE_DIR = "_sample_data" +SAMPLE_WAV_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/steam-train-whistle-daniel_simon.wav" +SAMPLE_WAV_PATH = os.path.join(_SAMPLE_DIR, "steam.wav") + +SAMPLE_WAV_SPEECH_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" +SAMPLE_WAV_SPEECH_PATH = os.path.join(_SAMPLE_DIR, "speech.wav") + +SAMPLE_RIR_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/distant-16k/room-response/rm1/impulse/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo.wav" +SAMPLE_RIR_PATH = os.path.join(_SAMPLE_DIR, "rir.wav") + +SAMPLE_NOISE_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/distant-16k/distractors/rm1/babb/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo.wav" +SAMPLE_NOISE_PATH = os.path.join(_SAMPLE_DIR, "bg.wav") + +SAMPLE_MP3_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/steam-train-whistle-daniel_simon.mp3" +SAMPLE_MP3_PATH = os.path.join(_SAMPLE_DIR, "steam.mp3") + +SAMPLE_GSM_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/steam-train-whistle-daniel_simon.gsm" +SAMPLE_GSM_PATH = os.path.join(_SAMPLE_DIR, "steam.gsm") + +SAMPLE_TAR_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit.tar.gz" +SAMPLE_TAR_PATH = os.path.join(_SAMPLE_DIR, "sample.tar.gz") +SAMPLE_TAR_ITEM = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" + +S3_BUCKET = "pytorch-tutorial-assets" +S3_KEY = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" + +YESNO_DATASET_PATH = os.path.join(_SAMPLE_DIR, "yes_no") +os.makedirs(YESNO_DATASET_PATH, exist_ok=True) +os.makedirs(_SAMPLE_DIR, exist_ok=True) + +def _fetch_data(): + uri = [ + (SAMPLE_WAV_URL, SAMPLE_WAV_PATH), + (SAMPLE_WAV_SPEECH_URL, SAMPLE_WAV_SPEECH_PATH), + (SAMPLE_RIR_URL, SAMPLE_RIR_PATH), + (SAMPLE_NOISE_URL, SAMPLE_NOISE_PATH), + (SAMPLE_MP3_URL, SAMPLE_MP3_PATH), + (SAMPLE_GSM_URL, SAMPLE_GSM_PATH), + (SAMPLE_TAR_URL, SAMPLE_TAR_PATH), + ] + for url, path in uri: + with open(path, 'wb') as file_: + file_.write(requests.get(url).content) + +_fetch_data() + +def _download_yesno(): + if os.path.exists(os.path.join(YESNO_DATASET_PATH, "waves_yesno.tar.gz")): + return + torchaudio.datasets.YESNO(root=YESNO_DATASET_PATH, download=True) + +YESNO_DOWNLOAD_PROCESS = multiprocessing.Process(target=_download_yesno) +YESNO_DOWNLOAD_PROCESS.start() + +def _get_sample(path, resample=None): + effects = [ + ["remix", "1"] + ] + if resample: + effects.append(["rate", f'{resample}']) + return torchaudio.sox_effects.apply_effects_file(path, effects=effects) + +def get_speech_sample(*, resample=None): + return _get_sample(SAMPLE_WAV_SPEECH_PATH, resample=resample) + +def get_sample(*, resample=None): + return _get_sample(SAMPLE_WAV_PATH, resample=resample) + +def get_rir_sample(*, resample=None, processed=False): + rir_raw, sample_rate = _get_sample(SAMPLE_RIR_PATH, resample=resample) + if not processed: + 
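+        # Return the raw recording as-is; otherwise extract the main impulse,
+        # normalize its power and flip the time axis below.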
return rir_raw, sample_rate + rir = rir_raw[:, int(sample_rate*1.01):int(sample_rate*1.3)] + rir = rir / torch.norm(rir, p=2) + rir = torch.flip(rir, [1]) + return rir, sample_rate + +def get_noise_sample(*, resample=None): + return _get_sample(SAMPLE_NOISE_PATH, resample=resample) + +def print_metadata(metadata, src=None): + if src: + print("-" * 10) + print("Source:", src) + print("-" * 10) + print(" - sample_rate:", metadata.sample_rate) + print(" - num_channels:", metadata.num_channels) + print(" - num_frames:", metadata.num_frames) + print(" - bits_per_sample:", metadata.bits_per_sample) + print(" - encoding:", metadata.encoding) + print() + +def print_stats(waveform, sample_rate=None, src=None): + if src: + print("-" * 10) + print("Source:", src) + print("-" * 10) + if sample_rate: + print("Sample Rate:", sample_rate) + print("Shape:", tuple(waveform.shape)) + print("Dtype:", waveform.dtype) + print(f" - Max: {waveform.max().item():6.3f}") + print(f" - Min: {waveform.min().item():6.3f}") + print(f" - Mean: {waveform.mean().item():6.3f}") + print(f" - Std Dev: {waveform.std().item():6.3f}") + print() + print(waveform) + print() + +def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None, ylim=None): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + time_axis = torch.arange(0, num_frames) / sample_rate + + figure, axes = plt.subplots(num_channels, 1) + if num_channels == 1: + axes = [axes] + for c in range(num_channels): + axes[c].plot(time_axis, waveform[c], linewidth=1) + axes[c].grid(True) + if num_channels > 1: + axes[c].set_ylabel(f'Channel {c+1}') + if xlim: + axes[c].set_xlim(xlim) + if ylim: + axes[c].set_ylim(ylim) + figure.suptitle(title) + plt.show(block=False) + +def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + time_axis = torch.arange(0, num_frames) / sample_rate + + figure, axes = plt.subplots(num_channels, 1) + if num_channels == 1: + axes = [axes] + for c in range(num_channels): + axes[c].specgram(waveform[c], Fs=sample_rate) + if num_channels > 1: + axes[c].set_ylabel(f'Channel {c+1}') + if xlim: + axes[c].set_xlim(xlim) + figure.suptitle(title) + plt.show(block=False) + +def play_audio(waveform, sample_rate): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + if num_channels == 1: + display(Audio(waveform[0], rate=sample_rate)) + elif num_channels == 2: + display(Audio((waveform[0], waveform[1]), rate=sample_rate)) + else: + raise ValueError("Waveform with more than 2 channels are not supported.") + +def inspect_file(path): + print("-" * 10) + print("Source:", path) + print("-" * 10) + print(f" - File size: {os.path.getsize(path)} bytes") + print_metadata(torchaudio.info(path)) + +def plot_spectrogram(spec, title=None, ylabel='freq_bin', aspect='auto', xmax=None): + fig, axs = plt.subplots(1, 1) + axs.set_title(title or 'Spectrogram (db)') + axs.set_ylabel(ylabel) + axs.set_xlabel('frame') + im = axs.imshow(librosa.power_to_db(spec), origin='lower', aspect=aspect) + if xmax: + axs.set_xlim((0, xmax)) + fig.colorbar(im, ax=axs) + plt.show(block=False) + +def plot_mel_fbank(fbank, title=None): + fig, axs = plt.subplots(1, 1) + axs.set_title(title or 'Filter bank') + axs.imshow(fbank, aspect='auto') + axs.set_ylabel('frequency bin') + axs.set_xlabel('mel bin') + plt.show(block=False) + +def get_spectrogram( + n_fft = 400, + win_len = None, + hop_len = None, + power = 2.0, +): + waveform, _ = 
get_speech_sample() + spectrogram = T.Spectrogram( + n_fft=n_fft, + win_length=win_len, + hop_length=hop_len, + center=True, + pad_mode="reflect", + power=power, + ) + return spectrogram(waveform) + +def plot_pitch(waveform, sample_rate, pitch): + figure, axis = plt.subplots(1, 1) + axis.set_title("Pitch Feature") + axis.grid(True) + + end_time = waveform.shape[1] / sample_rate + time_axis = torch.linspace(0, end_time, waveform.shape[1]) + axis.plot(time_axis, waveform[0], linewidth=1, color='gray', alpha=0.3) + + axis2 = axis.twinx() + time_axis = torch.linspace(0, end_time, pitch.shape[1]) + ln2 = axis2.plot( + time_axis, pitch[0], linewidth=2, label='Pitch', color='green') + + axis2.legend(loc=0) + plt.show(block=False) + +def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): + figure, axis = plt.subplots(1, 1) + axis.set_title("Kaldi Pitch Feature") + axis.grid(True) + + end_time = waveform.shape[1] / sample_rate + time_axis = torch.linspace(0, end_time, waveform.shape[1]) + axis.plot(time_axis, waveform[0], linewidth=1, color='gray', alpha=0.3) + + time_axis = torch.linspace(0, end_time, pitch.shape[1]) + ln1 = axis.plot(time_axis, pitch[0], linewidth=2, label='Pitch', color='green') + axis.set_ylim((-1.3, 1.3)) + + axis2 = axis.twinx() + time_axis = torch.linspace(0, end_time, nfcc.shape[1]) + ln2 = axis2.plot( + time_axis, nfcc[0], linewidth=2, label='NFCC', color='blue', linestyle='--') + + lns = ln1 + ln2 + labels = [l.get_label() for l in lns] + axis.legend(lns, labels, loc=0) + plt.show(block=False) + ###################################################################### -# Opening a file -# ----------------- +# Audio I/O +# ========= # -# ``torchaudio`` also supports loading sound files in the wav and mp3 format. We -# call waveform the resulting raw audio signal. +# torchaudio integrates ``libsox`` and provides a rich set of audio I/O. # -filename = "../_static/img/steam-train-whistle-daniel_simon-converted-from-mp3.wav" -waveform, sample_rate = torchaudio.load(filename) -print("Shape of waveform: {}".format(waveform.size())) -print("Sample rate of waveform: {}".format(sample_rate)) +###################################################################### +# Quering audio metadata +# ---------------------- +# +# ``torchaudio.info`` function fetches metadata of audio. You can provide +# a path-like object or file-like object. +# + +metadata = torchaudio.info(SAMPLE_WAV_PATH) +print_metadata(metadata, src=SAMPLE_WAV_PATH) -plt.figure() -plt.plot(waveform.t().numpy()) ###################################################################### -# When you load a file in ``torchaudio``, you can optionally specify the backend to use either -# `SoX `_ or `SoundFile `_ -# via ``torchaudio.set_audio_backend``. These backends are loaded lazily when needed. 
+# Where +# +# - ``sample_rate`` is the sampling rate of the audio +# - ``num_channels`` is the number of channels +# - ``num_frames`` is the number of frames per channel +# - ``bits_per_sample`` is bit depth +# - ``encoding`` is the sample coding format +# +# The values ``encoding`` can take are one of the following +# +# - ``"PCM_S"``: Signed integer linear PCM +# - ``"PCM_U"``: Unsigned integer linear PCM +# - ``"PCM_F"``: Floating point linear PCM +# - ``"FLAC"``: Flac, `Free Lossless Audio +# Codec `__ +# - ``"ULAW"``: Mu-law, +# [`wikipedia `__] +# - ``"ALAW"``: A-law +# [`wikipedia `__] +# - ``"MP3"`` : MP3, MPEG-1 Audio Layer III +# - ``"VORBIS"``: OGG Vorbis [`xiph.org `__] +# - ``"AMR_NB"``: Adaptive Multi-Rate +# [`wikipedia `__] +# - ``"AMR_WB"``: Adaptive Multi-Rate Wideband +# [`wikipedia `__] +# - ``"OPUS"``: Opus [`opus-codec.org `__] +# - ``"GSM"``: GSM-FR +# [`wikipedia `__] +# - ``"UNKNOWN"`` None of avobe # -# ``torchaudio`` also makes JIT compilation optional for functions, and uses ``nn.Module`` where possible. + ###################################################################### -# Transformations -# --------------- +# **Note** # -# ``torchaudio`` supports a growing list of -# `transformations `_. +# - ``bits_per_sample`` can be ``0`` for formats with compression and/or +# variable bit rate. (such as mp3) +# - ``num_frames`` can be ``0`` for GSM-FR format. # -# - **Resample**: Resample waveform to a different sample rate. -# - **Spectrogram**: Create a spectrogram from a waveform. -# - **GriffinLim**: Compute waveform from a linear scale magnitude spectrogram using -# the Griffin-Lim transformation. -# - **ComputeDeltas**: Compute delta coefficients of a tensor, usually a spectrogram. -# - **ComplexNorm**: Compute the norm of a complex tensor. -# - **MelScale**: This turns a normal STFT into a Mel-frequency STFT, -# using a conversion matrix. -# - **AmplitudeToDB**: This turns a spectrogram from the -# power/amplitude scale to the decibel scale. -# - **MFCC**: Create the Mel-frequency cepstrum coefficients from a -# waveform. -# - **MelSpectrogram**: Create MEL Spectrograms from a waveform using the -# STFT function in PyTorch. -# - **MuLawEncoding**: Encode waveform based on mu-law companding. -# - **MuLawDecoding**: Decode mu-law encoded waveform. -# - **TimeStretch**: Stretch a spectrogram in time without modifying pitch for a given rate. -# - **FrequencyMasking**: Apply masking to a spectrogram in the frequency domain. -# - **TimeMasking**: Apply masking to a spectrogram in the time domain. -# -# Each transform supports batching: you can perform a transform on a single raw -# audio signal or spectrogram, or many of the same shape. + +metadata = torchaudio.info(SAMPLE_MP3_PATH) +print_metadata(metadata, src=SAMPLE_MP3_PATH) + +metadata = torchaudio.info(SAMPLE_GSM_PATH) +print_metadata(metadata, src=SAMPLE_GSM_PATH) + + +###################################################################### +# Querying file-like object +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# ``info`` function works on file-like object as well. +# + +with requests.get(SAMPLE_WAV_URL, stream=True) as response: + metadata = torchaudio.info(response.raw) +print_metadata(metadata, src=SAMPLE_WAV_URL) + + +###################################################################### +# **Note** When passing file-like object, ``info`` function does not read +# all the data, instead it only reads the beginning portion of data. 
+# Therefore, depending on the audio format, it cannot get the correct +# metadata, including the format itself. The following example illustrates +# this. # -# Since all transforms are ``nn.Modules`` or ``jit.ScriptModules``, they can be -# used as part of a neural network at any point. +# - Use ``format`` argument to tell what audio format it is. +# - The returned metadata has ``num_frames = 0`` # +with requests.get(SAMPLE_MP3_URL, stream=True) as response: + metadata = torchaudio.info(response.raw, format="mp3") + + print(f"Fetched {response.raw.tell()} bytes.") +print_metadata(metadata, src=SAMPLE_MP3_URL) + ###################################################################### -# To start, we can look at the log of the spectrogram on a log scale. +# Loading audio data into Tensor +# ------------------------------ +# +# To load audio data, you can use ``torchaudio.load``. +# +# This function accepts path-like object and file-like object. +# +# The returned value is a tuple of waveform (``Tensor``) and sample rate +# (``int``). +# +# By default, the resulting tensor object has ``dtype=torch.float32`` and +# its value range is normalized within ``[-1.0, 1.0]``. +# +# For the list of supported format, please refer to `the torchaudio +# documentation `__. # -specgram = torchaudio.transforms.Spectrogram()(waveform) +waveform, sample_rate = torchaudio.load(SAMPLE_WAV_SPEECH_PATH) -print("Shape of spectrogram: {}".format(specgram.size())) +print_stats(waveform, sample_rate=sample_rate) +plot_waveform(waveform, sample_rate) +plot_specgram(waveform, sample_rate) +play_audio(waveform, sample_rate) -plt.figure() -plt.imshow(specgram.log2()[0,:,:].numpy(), cmap='gray') ###################################################################### -# Or we can look at the Mel Spectrogram on a log scale. +# Loading from file-like object +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# ``torchaudio``\ ’s I/O functions now support file-like object. This +# allows to fetch audio data and decode at the same time from the location +# other than local file system. The following examples illustrates this. +# + +# Load audio data as HTTP request +with requests.get(SAMPLE_WAV_SPEECH_URL, stream=True) as response: + waveform, sample_rate = torchaudio.load(response.raw) +plot_specgram(waveform, sample_rate, title="HTTP datasource") -specgram = torchaudio.transforms.MelSpectrogram()(waveform) +# Load audio from tar file +with tarfile.open(SAMPLE_TAR_PATH, mode='r') as tarfile_: + fileobj = tarfile_.extractfile(SAMPLE_TAR_ITEM) + waveform, sample_rate = torchaudio.load(fileobj) +plot_specgram(waveform, sample_rate, title="TAR file") -print("Shape of spectrogram: {}".format(specgram.size())) +# Load audio from S3 +client = boto3.client('s3', config=Config(signature_version=UNSIGNED)) +response = client.get_object(Bucket=S3_BUCKET, Key=S3_KEY) +waveform, sample_rate = torchaudio.load(response['Body']) +plot_specgram(waveform, sample_rate, title="From S3") -plt.figure() -p = plt.imshow(specgram.log2()[0,:,:].detach().numpy(), cmap='gray') ###################################################################### -# We can resample the waveform, one channel at a time. +# Tips on slicing +# ~~~~~~~~~~~~~~~ +# +# Providing ``num_frames`` and ``frame_offset`` arguments will slice the +# resulting Tensor object while decoding. +# +# The same result can be achieved using the regular Tensor slicing, +# (i.e. 
``waveform[:, frame_offset:frame_offset+num_frames]``) however, +# providing ``num_frames`` and ``frame_offset`` arguments is more +# efficient. +# +# This is because the function will stop data acquisition and decoding +# once it finishes decoding the requested frames. This is advantageous +# when the audio data are transfered via network as the data transfer will +# stop as soon as the necessary amount of data is fetched. # +# The following example illustrates this; +# + +# Illustration of two different decoding methods. +# The first one will fetch all the data and decode them, while +# the second one will stop fetching data once it completes decoding. +# The resulting waveforms are identical. -new_sample_rate = sample_rate/10 +frame_offset, num_frames = 16000, 16000 # Fetch and decode the 1 - 2 seconds -# Since Resample applies to a single channel, we resample first channel here -channel = 0 -transformed = torchaudio.transforms.Resample(sample_rate, new_sample_rate)(waveform[channel,:].view(1,-1)) +print("Fetching all the data...") +with requests.get(SAMPLE_WAV_SPEECH_URL, stream=True) as response: + waveform1, sample_rate1 = torchaudio.load(response.raw) + waveform1 = waveform1[:, frame_offset:frame_offset+num_frames] + print(f" - Fetched {response.raw.tell()} bytes") -print("Shape of transformed waveform: {}".format(transformed.size())) +print("Fetching until the requested frames are available...") +with requests.get(SAMPLE_WAV_SPEECH_URL, stream=True) as response: + waveform2, sample_rate2 = torchaudio.load( + response.raw, frame_offset=frame_offset, num_frames=num_frames) + print(f" - Fetched {response.raw.tell()} bytes") + +print("Checking the resulting waveform ... ", end="") +assert (waveform1 == waveform2).all() +print("matched!") -plt.figure() -plt.plot(transformed[0,:].numpy()) ###################################################################### -# As another example of transformations, we can encode the signal based on -# Mu-Law enconding. But to do so, we need the signal to be between -1 and -# 1. Since the tensor is just a regular PyTorch tensor, we can apply -# standard operators on it. +# Saving audio to file +# -------------------- +# +# To save audio data in the formats intepretable by common applications, +# you can use ``torchaudio.save``. +# +# This function accepts path-like object and file-like object. +# +# When passing file-like object, you also need to provide ``format`` +# argument so that the function knows which format it should be using. In +# case of path-like object, the function will detemine the format based on +# the extension. If you are saving to a file without extension, you need +# to provide ``format`` argument. +# +# When saving as WAV format, the default encoding for ``float32`` Tensor +# is 32-bit floating-point PCM. You can provide ``encoding`` and +# ``bits_per_sample`` argument to change this. For example, to save data +# in 16 bit signed integer PCM, you can do the following. # +# **Note** Saving data in encodings with lower bit depth reduces the +# resulting file size but loses precision. +# + +waveform, sample_rate = get_sample() +print_stats(waveform, sample_rate=sample_rate) + +# Save without any encoding option. 
+# The function will pick up the encoding which +# the provided data fit +path = "save_example_default.wav" +torchaudio.save(path, waveform, sample_rate) +inspect_file(path) + +# Save as 16-bit signed integer Linear PCM +# The resulting file occupies half the storage but loses precision +path = "save_example_PCM_S16.wav" +torchaudio.save( + path, waveform, sample_rate, + encoding="PCM_S", bits_per_sample=16) +inspect_file(path) -# Let's check if the tensor is in the interval [-1,1] -print("Min of waveform: {}\nMax of waveform: {}\nMean of waveform: {}".format(waveform.min(), waveform.max(), waveform.mean())) ###################################################################### -# Since the waveform is already between -1 and 1, we do not need to -# normalize it. +# ``torchaudio.save`` can also handle other formats. To name a few; # -def normalize(tensor): - # Subtract the mean, and scale to the interval [-1,1] - tensor_minusmean = tensor - tensor.mean() - return tensor_minusmean/tensor_minusmean.abs().max() +waveform, sample_rate = get_sample() + +formats = [ + "mp3", + "flac", + "vorbis", + "sph", + "amb", + "amr-nb", + "gsm", +] + +for format in formats: + path = f"save_example.{format}" + torchaudio.save(path, waveform, sample_rate, format=format) + inspect_file(path) -# Let's normalize to the full interval [-1,1] -# waveform = normalize(waveform) ###################################################################### -# Let’s apply encode the waveform. +# Saving to file-like object +# ~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Similar to the other I/O functions, you can save audio into file-like +# object. When saving to file-like object, ``format`` argument is +# required. # -transformed = torchaudio.transforms.MuLawEncoding()(waveform) +waveform, sample_rate = get_sample() -print("Shape of transformed waveform: {}".format(transformed.size())) +# Saving to Bytes buffer +buffer_ = io.BytesIO() +torchaudio.save(buffer_, waveform, sample_rate, format="wav") -plt.figure() -plt.plot(transformed[0,:].numpy()) +buffer_.seek(0) +print(buffer_.read(16)) ###################################################################### -# And now decode. +# Data Augmentation +# ================= +# +# ``torchaudio`` provides a variety of ways to augment audio data. +# + + +###################################################################### +# Applying effects and filtering +# ------------------------------ +# +# ``torchaudio.sox_effects`` module provides ways to apply filiters like +# ``sox`` command on Tensor objects and file-object audio sources +# directly. +# +# There are two functions for this; +# +# - ``torchaudio.sox_effects.apply_effects_tensor`` for applying effects +# on Tensor +# - ``torchaudio.sox_effects.apply_effects_file`` for applying effects on +# other audio source +# +# Both function takes effects in the form of ``List[List[str]]``. This +# mostly corresponds to how ``sox`` command works, but one caveat is that +# ``sox`` command adds some effects automatically, but torchaudio’s +# implementation does not do that. +# +# For the list of available effects, please refer to `the sox +# documentation `__. +# +# **Tip** If you need to load and resample your audio data on-the-fly, +# then you can use ``torchaudio.sox_effects.apply_effects_file`` with +# ``"rate"`` effect. +# +# **Note** ``apply_effects_file`` accepts file-like object or path-like +# object. 
Similar to ``torchaudio.load``, when the audio format cannot be +# detected from either file extension or header, you can provide +# ``format`` argument to tell what format the audio source is. +# +# **Note** This process is not differentiable. # -reconstructed = torchaudio.transforms.MuLawDecoding()(transformed) +# Load the data +waveform1, sample_rate1 = get_sample(resample=16000) -print("Shape of recovered waveform: {}".format(reconstructed.size())) +# Define effects +effects = [ + ["lowpass", "-1", "300"], # apply single-pole lowpass filter + ["speed", "0.8"], # reduce the speed + # This only changes sample rate, so it is necessary to + # add `rate` effect with original sample rate after this. + ["rate", f"{sample_rate1}"], + ["reverb", "-w"], # Reverbration gives some dramatic feeling +] + +# Apply effects +waveform2, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor( + waveform1, sample_rate1, effects) + +plot_waveform(waveform1, sample_rate1, title="Original", xlim=(-.1, 3.2)) +plot_waveform(waveform2, sample_rate2, title="Effects Applied", xlim=(-.1, 3.2)) +print_stats(waveform1, sample_rate=sample_rate1, src="Original") +print_stats(waveform2, sample_rate=sample_rate2, src="Effects Applied") -plt.figure() -plt.plot(reconstructed[0,:].numpy()) ###################################################################### -# We can finally compare the original waveform with its reconstructed -# version. +# Note that the number of frames and number of channels are different from +# the original after the effects. Let’s listen to the audio. Doesn’t it +# sound more dramatic? # -# Compute median relative difference -err = ((waveform-reconstructed).abs() / waveform.abs()).median() +plot_specgram(waveform1, sample_rate1, title="Original", xlim=(0, 3.04)) +play_audio(waveform1, sample_rate1) +plot_specgram(waveform2, sample_rate2, title="Effects Applied", xlim=(0, 3.04)) +play_audio(waveform2, sample_rate2) -print("Median relative difference between original and MuLaw reconstucted signals: {:.2%}".format(err)) + +###################################################################### +# Simulating room reverbration +# ---------------------------- +# +# `Convolution +# reverb `__ is a +# technique used to make a clean audio data sound like in a different +# environment. +# +# Using Room Impulse Response (RIR), we can make a clean speech sound like +# uttered in a conference room. +# +# For this process, we need RIR data. The following data are from VOiCES +# dataset, but you can record one by your self. Just turn on microphone +# and clap you hands. +# + +sample_rate = 8000 + +rir_raw, _ = get_rir_sample(resample=sample_rate) + +plot_waveform(rir_raw, sample_rate, title="Room Impulse Response (raw)", ylim=None) +plot_specgram(rir_raw, sample_rate, title="Room Impulse Response (raw)") +play_audio(rir_raw, sample_rate) ###################################################################### -# Functional -# --------------- +# First, we need to clean up the RIR. We extract the main impulse, +# normalize the signal power, then flip the time axis. # -# The transformations seen above rely on lower level stateless functions for their computations. -# These functions are available under ``torchaudio.functional``. The complete list is available -# `here `_ and includes: -# -# - **istft**: Inverse short time Fourier Transform. -# - **gain**: Applies amplification or attenuation to the whole waveform. -# - **dither**: Increases the perceived dynamic range of audio stored at a -# particular bit-depth. 
-# - **compute_deltas**: Compute delta coefficients of a tensor.
-# - **equalizer_biquad**: Design biquad peaking equalizer filter and perform filtering.
-# - **lowpass_biquad**: Design biquad lowpass filter and perform filtering.
-# - **highpass_biquad**:Design biquad highpass filter and perform filtering.
+
+rir = rir_raw[:, int(sample_rate*1.01):int(sample_rate*1.3)]
+rir = rir / torch.norm(rir, p=2)
+rir = torch.flip(rir, [1])
+
+print_stats(rir)
+plot_waveform(rir, sample_rate, title="Room Impulse Response", ylim=None)
+
+
+
######################################################################
+# Then we convolve the speech signal with the RIR filter.
#
-# For example, let's try the `mu_law_encoding` functional:
-mu_law_encoding_waveform = torchaudio.functional.mu_law_encoding(waveform, quantization_channels=256)
+speech, _ = get_speech_sample(resample=sample_rate)
+
+speech_ = torch.nn.functional.pad(speech, (rir.shape[1]-1, 0))
+augmented = torch.nn.functional.conv1d(speech_[None, ...], rir[None, ...])[0]
+
+plot_waveform(speech, sample_rate, title="Original", ylim=None)
+plot_waveform(augmented, sample_rate, title="RIR Applied", ylim=None)
-print("Shape of transformed waveform: {}".format(mu_law_encoding_waveform.size()))
+plot_specgram(speech, sample_rate, title="Original")
+play_audio(speech, sample_rate)
+
+plot_specgram(augmented, sample_rate, title="RIR Applied")
+play_audio(augmented, sample_rate)
-plt.figure()
-plt.plot(mu_law_encoding_waveform[0,:].numpy())
######################################################################
-# You can see how the output fron ``torchaudio.functional.mu_law_encoding`` is the same as
-# the output from ``torchaudio.transforms.MuLawEncoding``.
-#
-# Now let's experiment with a few of the other functionals and visualize their output. Taking our
-# spectogram, we can compute it's deltas:
+# Adding background noise
+# -----------------------
+#
+# To add background noise to audio data, you can simply add the audio Tensor
+# and the noise Tensor. A common way to adjust the intensity of the noise is
+# to change the Signal-to-Noise Ratio (SNR).
+# [`wikipedia `__]
+#
+# .. math::
+#
+#
+#    \mathrm{SNR} = \frac{P_\mathrm{signal}}{P_\mathrm{noise}}
+#
+# .. math::
+#
+#
+#    \mathrm{SNR_{dB}} = 10 \log_{10} \left( \mathrm{SNR} \right)
+#
+
+sample_rate = 8000
+speech, _ = get_speech_sample(resample=sample_rate)
+noise, _ = get_noise_sample(resample=sample_rate)
+noise = noise[:, :speech.shape[1]]
+
+plot_waveform(noise, sample_rate, title="Background noise")
+plot_specgram(noise, sample_rate, title="Background noise")
+play_audio(noise, sample_rate)
+
+speech_power = speech.norm(p=2)
+noise_power = noise.norm(p=2)
+
+for snr_db in [20, 10, 3]:
+    snr = 10 ** (snr_db / 10)  # from SNR_dB = 10 log10(SNR) above
+    scale = snr * noise_power / speech_power
+    noisy_speech = (scale * speech + noise) / 2
+
+    plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
+    plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
+    play_audio(noisy_speech, sample_rate)
-computed = torchaudio.functional.compute_deltas(specgram.contiguous(), win_length=3)
-print("Shape of computed deltas: {}".format(computed.shape))
-plt.figure()
-plt.imshow(computed.log2()[0,:,:].detach().numpy(), cmap='gray')
######################################################################
-# We can take the original waveform and apply different effects to it.
-#
+# Applying codec to Tensor object
+# -------------------------------
+#
+# ``torchaudio.functional.apply_codec`` can apply codecs to Tensor objects.
+#
+# **Note** This process is not differentiable.
+#
+
+waveform, sample_rate = get_speech_sample(resample=8000)
+
+plot_specgram(waveform, sample_rate, title="Original")
+play_audio(waveform, sample_rate)
+
+configs = [
+    ({"format": "wav", "encoding": 'ULAW', "bits_per_sample": 8}, "8 bit mu-law"),
+    ({"format": "gsm"}, "GSM-FR"),
+    ({"format": "mp3", "compression": -9}, "MP3"),
+    ({"format": "vorbis", "compression": -1}, "Vorbis"),
+]
+for param, title in configs:
+    augmented = F.apply_codec(waveform, sample_rate, **param)
+    plot_specgram(augmented, sample_rate, title=title)
+    play_audio(augmented, sample_rate)
-gain_waveform = torchaudio.functional.gain(waveform, gain_db=5.0)
-print("Min of gain_waveform: {}\nMax of gain_waveform: {}\nMean of gain_waveform: {}".format(gain_waveform.min(), gain_waveform.max(), gain_waveform.mean()))
-dither_waveform = torchaudio.functional.dither(waveform)
-print("Min of dither_waveform: {}\nMax of dither_waveform: {}\nMean of dither_waveform: {}".format(dither_waveform.min(), dither_waveform.max(), dither_waveform.mean()))
######################################################################
-# Another example of the capabilities in ``torchaudio.functional`` are applying filters to our
-# waveform. Applying the lowpass biquad filter to our waveform will output a new waveform with
-# the signal of the frequency modified.
+# Simulating a phone recording
+# ----------------------------
+#
+# Combining the previous techniques, we can simulate audio that sounds
+# like a person talking over a phone in an echoey room with people talking
+# in the background.
+#
+
+sample_rate = 16000
+speech, _ = get_speech_sample(resample=sample_rate)
+
+plot_specgram(speech, sample_rate, title="Original")
+play_audio(speech, sample_rate)
+
+# Apply RIR
+rir, _ = get_rir_sample(resample=sample_rate, processed=True)
+speech_ = torch.nn.functional.pad(speech, (rir.shape[1]-1, 0))
+speech = torch.nn.functional.conv1d(speech_[None, ...], rir[None, ...])[0]
+
+plot_specgram(speech, sample_rate, title="RIR Applied")
+play_audio(speech, sample_rate)
+
+# Add background noise
+# Because the noise is recorded in the actual environment, we consider that
+# the noise contains the acoustic features of the environment. Therefore, we add
+# the noise after RIR application.
+noise, _ = get_noise_sample(resample=sample_rate)
+noise = noise[:, :speech.shape[1]]
-lowpass_waveform = torchaudio.functional.lowpass_biquad(waveform, sample_rate, cutoff_freq=3000)
+snr_db = 8
+scale = 10 ** (snr_db / 10) * noise.norm(p=2) / speech.norm(p=2)  # from SNR_dB = 10 log10(SNR)
+speech = (scale * speech + noise) / 2
+
+plot_specgram(speech, sample_rate, title="BG noise added")
+play_audio(speech, sample_rate)
+
+# Apply filtering and change sample rate
+speech, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
+    speech,
+    sample_rate,
+    effects=[
+        ["lowpass", "4000"],
+        ["compand", "0.02,0.05", "-60,-60,-30,-10,-20,-8,-5,-8,-2,-8", "-8", "-7", "0.05"],
+        ["rate", "8000"],
+    ],
+)
+
+plot_specgram(speech, sample_rate, title="Filtered")
+play_audio(speech, sample_rate)
+
+# Apply telephony codec
+speech = F.apply_codec(speech, sample_rate, format="gsm")
+
+plot_specgram(speech, sample_rate, title="GSM Codec Applied")
+play_audio(speech, sample_rate)
-print("Min of lowpass_waveform: {}\nMax of lowpass_waveform: {}\nMean of lowpass_waveform: {}".format(lowpass_waveform.min(), lowpass_waveform.max(), lowpass_waveform.mean()))
-plt.figure()
-plt.plot(lowpass_waveform.t().numpy())
######################################################################
-# We can also visualize a waveform with the highpass biquad filter.
+# Feature Extractions
+# ===================
+#
+# ``torchaudio`` implements feature extractions commonly used in the audio
+# domain. They are available in ``torchaudio.functional`` and
+# ``torchaudio.transforms``.
+#
+# The ``functional`` module implements features as standalone functions.
+# They are stateless.
#
+# The ``transforms`` module implements features in an object-oriented manner,
+# using implementations from ``functional`` and ``torch.nn.Module``.
+#
+# Because all the transforms are subclasses of ``torch.nn.Module``, they can
+# be serialized using TorchScript.
+#
+# For the complete list of available features, please refer to the
+# documentation. In this tutorial, we will look into conversion between the
+# time domain and the frequency domain (``Spectrogram``, ``GriffinLim``,
+# ``MelSpectrogram``) and an augmentation technique called SpecAugment.
+#
+
-highpass_waveform = torchaudio.functional.highpass_biquad(waveform, sample_rate, cutoff_freq=2000)
+######################################################################
+# Spectrogram
+# -----------
+#
+# To get the frequency representation of an audio signal, you can use
+# the ``Spectrogram`` transform.
+#
-print("Min of highpass_waveform: {}\nMax of highpass_waveform: {}\nMean of highpass_waveform: {}".format(highpass_waveform.min(), highpass_waveform.max(), highpass_waveform.mean()))
+waveform, sample_rate = get_speech_sample()
+
+n_fft = 1024
+win_length = None
+hop_length = 512
+
+# define transformation
+spectrogram = T.Spectrogram(
+    n_fft=n_fft,
+    win_length=win_length,
+    hop_length=hop_length,
+    center=True,
+    pad_mode="reflect",
+    power=2.0,
+)
+# Perform transformation
+spec = spectrogram(waveform)
+
+print_stats(spec)
+plot_spectrogram(spec[0], title='torchaudio')
-plt.figure()
-plt.plot(highpass_waveform.t().numpy())
######################################################################
-# Migrating to torchaudio from Kaldi
-# ----------------------------------
+# GriffinLim
+# ----------
+#
+# To recover a waveform from a spectrogram, you can use ``GriffinLim``.
#
-# Users may be familiar with
-# `Kaldi `_, a toolkit for speech
-# recognition. ``torchaudio`` offers compatibility with it in
-# ``torchaudio.kaldi_io``.
It can indeed read from kaldi scp, or ark file -# or streams with: + +torch.random.manual_seed(0) +waveform, sample_rate = get_speech_sample() +plot_waveform(waveform, sample_rate, title="Original") +play_audio(waveform, sample_rate) + +n_fft = 1024 +win_length = None +hop_length = 512 + +spec = T.Spectrogram( + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, +)(waveform) + +griffin_lim = T.GriffinLim( + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, +) +waveform = griffin_lim(spec) + +plot_waveform(waveform, sample_rate, title="Reconstructed") +play_audio(waveform, sample_rate) + + + +###################################################################### +# Mel Filter Bank +# --------------- # -# - read_vec_int_ark -# - read_vec_flt_scp -# - read_vec_flt_arkfile/stream -# - read_mat_scp -# - read_mat_ark +# ``torchaudio.functional.create_fb_matrix`` can generate the filter bank +# to convert frequency bins to Mel-scale bins. # -# ``torchaudio`` provides Kaldi-compatible transforms for ``spectrogram``, -# ``fbank``, ``mfcc``, and ``resample_waveform with the benefit of GPU support, see -# `here `__ for more information. +# Since this function does not require input audio/features, there is no +# equivalent transform in ``torchaudio.transforms``. # -n_fft = 400.0 -frame_length = n_fft / sample_rate * 1000.0 -frame_shift = frame_length / 2.0 +n_fft = 256 +n_mels = 64 +sample_rate = 6000 -params = { - "channel": 0, - "dither": 0.0, - "window_type": "hanning", - "frame_length": frame_length, - "frame_shift": frame_shift, - "remove_dc_offset": False, - "round_to_power_of_two": False, - "sample_frequency": sample_rate, -} +mel_filters = F.create_fb_matrix( + int(n_fft // 2 + 1), + n_mels=n_mels, + f_min=0., + f_max=sample_rate/2., + sample_rate=sample_rate, + norm='slaney' +) +plot_mel_fbank(mel_filters, "Mel Filter Bank - torchaudio") -specgram = torchaudio.compliance.kaldi.spectrogram(waveform, **params) -print("Shape of spectrogram: {}".format(specgram.size())) -plt.figure() -plt.imshow(specgram.t().numpy(), cmap='gray') +###################################################################### +# Comparison against librosa +# ~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# As a comparison, here is the equivalent way to get the mel filter bank +# with ``librosa``. +# +# **Note** Currently, the result matches only when ``htk=True``. +# ``torchaudio`` does not support the equivalent of ``htk=False`` option. +# + +mel_filters_librosa = librosa.filters.mel( + sample_rate, + n_fft, + n_mels=n_mels, + fmin=0., + fmax=sample_rate/2., + norm='slaney', + htk=True, +).T + +plot_mel_fbank(mel_filters_librosa, "Mel Filter Bank - librosa") + +mse = torch.square(mel_filters - mel_filters_librosa).mean().item() +print('Mean Square Difference: ', mse) ###################################################################### -# We also support computing the filterbank features from waveforms, -# matching Kaldi’s implementation. +# MelSpectrogram +# -------------- # +# Mel-scale spectrogram is a combination of Spectrogram and mel scale +# conversion. In ``torchaudio``, there is a transform ``MelSpectrogram`` +# which is composed of ``Spectrogram`` and ``MelScale``. 
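+#
+# As a rough sketch (reusing the ``get_speech_sample`` helper from this
+# tutorial; the exact keyword arguments accepted by ``MelScale`` may vary
+# across torchaudio versions), chaining the two transforms by hand mirrors
+# what ``MelSpectrogram`` does internally:
+
+waveform, sample_rate = get_speech_sample()
+spec = T.Spectrogram(n_fft=1024, hop_length=512, power=2.0)(waveform)
+manual_melspec = T.MelScale(
+    n_mels=128, sample_rate=sample_rate, n_stft=1024 // 2 + 1)(spec)
+print(manual_melspec.shape)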
+# + +waveform, sample_rate = get_speech_sample() -fbank = torchaudio.compliance.kaldi.fbank(waveform, **params) +n_fft = 1024 +win_length = None +hop_length = 512 +n_mels = 128 -print("Shape of fbank: {}".format(fbank.size())) +mel_spectrogram = T.MelSpectrogram( + sample_rate=sample_rate, + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, + center=True, + pad_mode="reflect", + power=2.0, + norm='slaney', + onesided=True, + n_mels=n_mels, +) + +melspec = mel_spectrogram(waveform) +plot_spectrogram( + melspec[0], title="MelSpectrogram - torchaudio", ylabel='mel freq') -plt.figure() -plt.imshow(fbank.t().numpy(), cmap='gray') ###################################################################### -# You can create mel frequency cepstral coefficients from a raw audio signal -# This matches the input/output of Kaldi’s compute-mfcc-feats. +# Comparison against librosa +# ~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# As a comparison, here is the equivalent way to get Mel-scale spectrogram +# with ``librosa``. +# +# **Note** Currently, the result matches only when ``htk=True``. +# ``torchaudio`` does not support the equivalent of ``htk=False`` option. # -mfcc = torchaudio.compliance.kaldi.mfcc(waveform, **params) +melspec_librosa = librosa.feature.melspectrogram( + waveform.numpy()[0], + sr=sample_rate, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + center=True, + pad_mode="reflect", + power=2.0, + n_mels=n_mels, + norm='slaney', + htk=True, +) +plot_spectrogram( + melspec_librosa, title="MelSpectrogram - librosa", ylabel='mel freq') + +mse = torch.square(melspec - melspec_librosa).mean().item() +print('Mean Square Difference: ', mse) + + +###################################################################### +# MFCC +# ---- +# + +waveform, sample_rate = get_speech_sample() + +n_fft = 2048 +win_length = None +hop_length = 512 +n_mels = 256 +n_mfcc = 256 -print("Shape of mfcc: {}".format(mfcc.size())) +mfcc_transform = T.MFCC( + sample_rate=sample_rate, + n_mfcc=n_mfcc, melkwargs={'n_fft': n_fft, 'n_mels': n_mels, 'hop_length': hop_length}) + +mfcc = mfcc_transform(waveform) + +plot_spectrogram(mfcc[0]) -plt.figure() -plt.imshow(mfcc.t().numpy(), cmap='gray') ###################################################################### -# Available Datasets -# ----------------- +# Comparing against librosa +# ~~~~~~~~~~~~~~~~~~~~~~~~~ # -# If you do not want to create your own dataset to train your model, ``torchaudio`` offers a -# unified dataset interface. This interface supports lazy-loading of files to memory, download -# and extract functions, and datasets to build models. + +melspec = librosa.feature.melspectrogram( + y=waveform.numpy()[0], sr=sample_rate, n_fft=n_fft, + win_length=win_length, hop_length=hop_length, + n_mels=n_mels, htk=True, norm=None) + +mfcc_librosa = librosa.feature.mfcc( + S=librosa.core.spectrum.power_to_db(melspec), + n_mfcc=n_mfcc, dct_type=2, norm='ortho') + +plot_spectrogram(mfcc_librosa) + +mse = torch.square(mfcc - mfcc_librosa).mean().item() +print('Mean Square Difference: ', mse) + + +###################################################################### +# Pitch +# ----- # -# The datasets ``torchaudio`` currently supports are: -# -# - **VCTK**: Speech data uttered by 109 native speakers of English with various accents -# (`Read more here `_). -# - **Yesno**: Sixty recordings of one individual saying yes or no in Hebrew; each -# recording is eight words long (`Read more here `_). 
-# - **Common Voice**: An open source, multi-language dataset of voices that anyone can use
-#   to train speech-enabled applications (`Read more here `_).
-# - **LibriSpeech**: Large-scale (1000 hours) corpus of read English speech (`Read more here `_).
+
+waveform, sample_rate = get_speech_sample()
+
+pitch = F.detect_pitch_frequency(waveform, sample_rate)
+plot_pitch(waveform, sample_rate, pitch)
+play_audio(waveform, sample_rate)
+
+
+######################################################################
+# Kaldi Pitch (beta)
+# ------------------
+#
+# Kaldi Pitch feature [1] is a pitch detection mechanism tuned for ASR
+# applications. This is a beta feature in torchaudio, and only the
+# ``functional`` form is available.
+#
+# 1. A pitch extraction algorithm tuned for automatic speech recognition
+#
+#    Ghahremani, B. BabaAli, D. Povey, K. Riedhammer, J. Trmal and S.
+#    Khudanpur
+#
+#    2014 IEEE International Conference on Acoustics, Speech and Signal
+#    Processing (ICASSP), Florence, 2014, pp. 2494-2498, doi:
+#    10.1109/ICASSP.2014.6854049.
+#    [`abstract `__],
+#    [`paper `__]
#
-yesno_data = torchaudio.datasets.YESNO('./', download=True)
+waveform, sample_rate = get_speech_sample(resample=16000)
-# A data point in Yesno is a tuple (waveform, sample_rate, labels) where labels is a list of integers with 1 for yes and 0 for no.
+pitch_feature = F.compute_kaldi_pitch(waveform, sample_rate)
+pitch, nfcc = pitch_feature[..., 0], pitch_feature[..., 1]
-# Pick data point number 3 to see an example of the the yesno_data:
-n = 3
-waveform, sample_rate, labels = yesno_data[n]
+plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc)
+play_audio(waveform, sample_rate)
-print("Waveform: {}\nSample rate: {}\nLabels: {}".format(waveform, sample_rate, labels))
-plt.figure()
-plt.plot(waveform.t().numpy())
+######################################################################
+# Feature Augmentation
+# ====================
+#
######################################################################
-# Now, whenever you ask for a sound file from the dataset, it is loaded in memory only when you ask for it.
-# Meaning, the dataset only loads and keeps in memory the items that you want and use, saving on memory.
-#
+# SpecAugment
+# -----------
+#
+# `SpecAugment `__
+# is a popular augmentation technique applied to spectrograms.
+#
+# ``torchaudio`` implements ``TimeStretch``, ``TimeMasking`` and
+# ``FrequencyMasking``.
+#
+
######################################################################
-# Conclusion
-# ----------
+# TimeStretch
+# ~~~~~~~~~~~
#
-# We used an example raw audio signal, or waveform, to illustrate how to
-# open an audio file using ``torchaudio``, and how to pre-process,
-# transform, and apply functions to such waveform. We also demonstrated how
-# to use familiar Kaldi functions, as well as utilize built-in datasets to
-# construct our models. Given that ``torchaudio`` is built on PyTorch,
-# these techniques can be used as building blocks for more advanced audio
-# applications, such as speech recognition, while leveraging GPUs.
+ +spec = get_spectrogram(power=None) +strech = T.TimeStretch() + +rate = 1.2 +spec_ = strech(spec, rate) +plot_spectrogram(F.complex_norm(spec_[0]), title=f"Stretched x{rate}", aspect='equal', xmax=304) + +plot_spectrogram(F.complex_norm(spec[0]), title="Original", aspect='equal', xmax=304) + +rate = 0.9 +spec_ = strech(spec, rate) +plot_spectrogram(F.complex_norm(spec_[0]), title=f"Stretched x{rate}", aspect='equal', xmax=304) + + +###################################################################### +# TimeMasking +# ~~~~~~~~~~~ +# + +torch.random.manual_seed(4) + +spec = get_spectrogram() +plot_spectrogram(spec[0], title="Original") + +masking = T.TimeMasking(time_mask_param=80) +spec = masking(spec) + +plot_spectrogram(spec[0], title="Masked along time axis") + + +###################################################################### +# FrequencyMasking +# ~~~~~~~~~~~~~~~~ # + +torch.random.manual_seed(4) + +spec = get_spectrogram() +plot_spectrogram(spec[0], title="Original") + +masking = T.FrequencyMasking(freq_mask_param=80) +spec = masking(spec) + +plot_spectrogram(spec[0], title="Masked along frequency axis") + + +###################################################################### +# Datasets +# ======== +# +# ``torchaudio`` provides easy access to common, publicly accessible +# datasets. Please checkout the official documentation for the list of +# available datasets. +# +# Here, we take ``YESNO`` dataset and look into how to use it. +# + +YESNO_DOWNLOAD_PROCESS.join() + +dataset = torchaudio.datasets.YESNO(YESNO_DATASET_PATH, download=True) + +for i in [1, 3, 5]: + waveform, sample_rate, label = dataset[i] + plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}") + play_audio(waveform, sample_rate) diff --git a/beginner_source/aws_distributed_training_tutorial.py b/beginner_source/aws_distributed_training_tutorial.py deleted file mode 100644 index 1789516c2..000000000 --- a/beginner_source/aws_distributed_training_tutorial.py +++ /dev/null @@ -1,691 +0,0 @@ -""" -(advanced) PyTorch 1.0 Distributed Trainer with Amazon AWS -============================================================= - -**Author**: `Nathan Inkawhich `_ - -**Edited by**: `Teng Li `_ - -""" - - -###################################################################### -# In this tutorial we will show how to setup, code, and run a PyTorch 1.0 -# distributed trainer across two multi-gpu Amazon AWS nodes. We will start -# with describing the AWS setup, then the PyTorch environment -# configuration, and finally the code for the distributed trainer. -# Hopefully you will find that there is actually very little code change -# required to extend your current training code to a distributed -# application, and most of the work is in the one-time environment setup. -# - - -###################################################################### -# Amazon AWS Setup -# ---------------- -# -# In this tutorial we will run distributed training across two multi-gpu -# nodes. In this section we will first cover how to create the nodes, then -# how to setup the security group so the nodes can communicate with -# eachother. -# -# Creating the Nodes -# ~~~~~~~~~~~~~~~~~~ -# -# In Amazon AWS, there are seven steps to creating an instance. To get -# started, login and select **Launch Instance**. -# -# **Step 1: Choose an Amazon Machine Image (AMI)** - Here we will select -# the ``Deep Learning AMI (Ubuntu) Version 14.0``. 
As described, this -# instance comes with many of the most popular deep learning frameworks -# installed and is preconfigured with CUDA, cuDNN, and NCCL. It is a very -# good starting point for this tutorial. -# -# **Step 2: Choose an Instance Type** - Now, select the GPU compute unit -# called ``p2.8xlarge``. Notice, each of these instances has a different -# cost but this instance provides 8 NVIDIA Tesla K80 GPUs per node, and -# provides a good architecture for multi-gpu distributed training. -# -# **Step 3: Configure Instance Details** - The only setting to change here -# is increasing the *Number of instances* to 2. All other configurations -# may be left at default. -# -# **Step 4: Add Storage** - Notice, by default these nodes do not come -# with a lot of storage (only 75 GB). For this tutorial, since we are only -# using the STL-10 dataset, this is plenty of storage. But, if you want to -# train on a larger dataset such as ImageNet, you will have to add much -# more storage just to fit the dataset and any trained models you wish to -# save. -# -# **Step 5: Add Tags** - Nothing to be done here, just move on. -# -# **Step 6: Configure Security Group** - This is a critical step in the -# configuration process. By default two nodes in the same security group -# would not be able to communicate in the distributed training setting. -# Here, we want to create a **new** security group for the two nodes to be -# in. However, we cannot finish configuring in this step. For now, just -# remember your new security group name (e.g. launch-wizard-12) then move -# on to Step 7. -# -# **Step 7: Review Instance Launch** - Here, review the instance then -# launch it. By default, this will automatically start initializing the -# two instances. You can monitor the initialization progress from the -# dashboard. -# -# Configure Security Group -# ~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Recall that we were not able to properly configure the security group -# when creating the instances. Once you have launched the instance, select -# the *Network & Security > Security Groups* tab in the EC2 dashboard. -# This will bring up a list of security groups you have access to. Select -# the new security group you created in Step 6 (i.e. launch-wizard-12), -# which will bring up tabs called *Description, Inbound, Outbound, and -# Tags*. First, select the *Inbound* tab and *Edit* to add a rule to allow -# "All Traffic" from "Sources" in the launch-wizard-12 security group. -# Then select the *Outbound* tab and do the exact same thing. Now, we have -# effectively allowed all Inbound and Outbound traffic of all types -# between nodes in the launch-wizard-12 security group. -# -# Necessary Information -# ~~~~~~~~~~~~~~~~~~~~~ -# -# Before continuing, we must find and remember the IP addresses of both -# nodes. In the EC2 dashboard find your running instances. For both -# instances, write down the *IPv4 Public IP* and the *Private IPs*. For -# the remainder of the document, we will refer to these as the -# **node0-publicIP**, **node0-privateIP**, **node1-publicIP**, and -# **node1-privateIP**. The public IPs are the addresses we will use to SSH -# in, and the private IPs will be used for inter-node communication. -# - - -###################################################################### -# Environment Setup -# ----------------- -# -# The next critical step is the setup of each node. Unfortunately, we -# cannot configure both nodes at the same time, so this process must be -# done on each node separately. 
However, this is a one time setup, so once -# you have the nodes configured properly you will not have to reconfigure -# for future distributed training projects. -# -# The first step, once logged onto the node, is to create a new conda -# environment with python 3.6 and numpy. Once created activate the -# environment. -# -# :: -# -# $ conda create -n nightly_pt python=3.6 numpy -# $ source activate nightly_pt -# -# Next, we will install a nightly build of Cuda 9.0 enabled PyTorch with -# pip in the conda environment. -# -# :: -# -# $ pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu90/torch_nightly.html -# -# We must also install torchvision so we can use the torchvision model and -# dataset. At this time, we must build torchvision from source as the pip -# installation will by default install an old version of PyTorch on top of -# the nightly build we just installed. -# -# :: -# -# $ cd -# $ git clone https://github.com/pytorch/vision.git -# $ cd vision -# $ python setup.py install -# -# And finally, **VERY IMPORTANT** step is to set the network interface -# name for the NCCL socket. This is set with the environment variable -# ``NCCL_SOCKET_IFNAME``. To get the correct name, run the ``ifconfig`` -# command on the node and look at the interface name that corresponds to -# the node's *privateIP* (e.g. ens3). Then set the environment variable as -# -# :: -# -# $ export NCCL_SOCKET_IFNAME=ens3 -# -# Remember, do this on both nodes. You may also consider adding the -# NCCL\_SOCKET\_IFNAME setting to your *.bashrc*. An important observation -# is that we did not setup a shared filesystem between the nodes. -# Therefore, each node will have to have a copy of the code and a copy of -# the datasets. For more information about setting up a shared network -# filesystem between nodes, see -# `here `__. -# - - -###################################################################### -# Distributed Training Code -# ------------------------- -# -# With the instances running and the environments setup we can now get -# into the training code. Most of the code here has been taken from the -# `PyTorch ImageNet -# Example `__ -# which also supports distributed training. This code provides a good -# starting point for a custom trainer as it has much of the boilerplate -# training loop, validation loop, and accuracy tracking functionality. -# However, you will notice that the argument parsing and other -# non-essential functions have been stripped out for simplicity. -# -# In this example we will use -# `torchvision.models.resnet18 `__ -# model and will train it on the -# `torchvision.datasets.STL10 `__ -# dataset. To accomodate for the dimensionality mismatch of STL-10 with -# Resnet18, we will resize each image to 224x224 with a transform. Notice, -# the choice of model and dataset are orthogonal to the distributed -# training code, you may use any dataset and model you wish and the -# process is the same. Lets get started by first handling the imports and -# talking about some helper functions. Then we will define the train and -# test functions, which have been largely taken from the ImageNet Example. -# At the end, we will build the main part of the code which handles the -# distributed training setup. And finally, we will discuss how to actually -# run the code. 
-# - - -###################################################################### -# Imports -# ~~~~~~~ -# -# The important distributed training specific imports here are -# `torch.nn.parallel `__, -# `torch.distributed `__, -# `torch.utils.data.distributed `__, -# and -# `torch.multiprocessing `__. -# It is also important to set the multiprocessing start method to *spawn* -# or *forkserver* (only supported in Python 3), -# as the default is *fork* which may cause deadlocks when using multiple -# worker processes for dataloading. -# - -import time -import sys -import torch - -import torch.nn as nn -import torch.nn.parallel -import torch.distributed as dist -import torch.optim -import torch.utils.data -import torch.utils.data.distributed -import torchvision.transforms as transforms -import torchvision.datasets as datasets -import torchvision.models as models - -from torch.multiprocessing import Pool, Process - - -###################################################################### -# Helper Functions -# ~~~~~~~~~~~~~~~~ -# -# We must also define some helper functions and classes that will make -# training easier. The ``AverageMeter`` class tracks training statistics -# like accuracy and iteration count. The ``accuracy`` function computes -# and returns the top-k accuracy of the model so we can track learning -# progress. Both are provided for training convenience but neither are -# distributed training specific. -# - -class AverageMeter(object): - """Computes and stores the average and current value""" - def __init__(self): - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - -def accuracy(output, target, topk=(1,)): - """Computes the precision@k for the specified values of k""" - with torch.no_grad(): - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) - res.append(correct_k.mul_(100.0 / batch_size)) - return res - - -###################################################################### -# Train Functions -# ~~~~~~~~~~~~~~~ -# -# To simplify the main loop, it is best to separate a training epoch step -# into a function called ``train``. This function trains the input model -# for one epoch of the *train\_loader*. The only distributed training -# artifact in this function is setting the -# `non\_blocking `__ -# attributes of the data and label tensors to ``True`` before the forward -# pass. This allows asynchronous GPU copies of the data meaning transfers -# can be overlapped with computation. This function also outputs training -# statistics along the way so we can track progress throughout the epoch. -# -# The other function to define here is ``adjust_learning_rate``, which -# decays the initial learning rate at a fixed schedule. This is another -# boilerplate trainer function that is useful to train accurate models. 
-# - -def train(train_loader, model, criterion, optimizer, epoch): - - batch_time = AverageMeter() - data_time = AverageMeter() - losses = AverageMeter() - top1 = AverageMeter() - top5 = AverageMeter() - - # switch to train mode - model.train() - - end = time.time() - for i, (input, target) in enumerate(train_loader): - - # measure data loading time - data_time.update(time.time() - end) - - # Create non_blocking tensors for distributed training - input = input.cuda(non_blocking=True) - target = target.cuda(non_blocking=True) - - # compute output - output = model(input) - loss = criterion(output, target) - - # measure accuracy and record loss - prec1, prec5 = accuracy(output, target, topk=(1, 5)) - losses.update(loss.item(), input.size(0)) - top1.update(prec1[0], input.size(0)) - top5.update(prec5[0], input.size(0)) - - # compute gradients in a backward pass - optimizer.zero_grad() - loss.backward() - - # Call step of optimizer to update model params - optimizer.step() - - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if i % 10 == 0: - print('Epoch: [{0}][{1}/{2}]\t' - 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' - 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' - 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' - 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' - 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( - epoch, i, len(train_loader), batch_time=batch_time, - data_time=data_time, loss=losses, top1=top1, top5=top5)) - -def adjust_learning_rate(initial_lr, optimizer, epoch): - """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" - lr = initial_lr * (0.1 ** (epoch // 30)) - for param_group in optimizer.param_groups: - param_group['lr'] = lr - - - -###################################################################### -# Validation Function -# ~~~~~~~~~~~~~~~~~~~ -# -# To track generalization performance and simplify the main loop further -# we can also extract the validation step into a function called -# ``validate``. This function runs a full validation step of the input -# model on the input validation dataloader and returns the top-1 accuracy -# of the model on the validation set. Again, you will notice the only -# distributed training feature here is setting ``non_blocking=True`` for -# the training data and labels before they are passed to the model. 
-# - -def validate(val_loader, model, criterion): - - batch_time = AverageMeter() - losses = AverageMeter() - top1 = AverageMeter() - top5 = AverageMeter() - - # switch to evaluate mode - model.eval() - - with torch.no_grad(): - end = time.time() - for i, (input, target) in enumerate(val_loader): - - input = input.cuda(non_blocking=True) - target = target.cuda(non_blocking=True) - - # compute output - output = model(input) - loss = criterion(output, target) - - # measure accuracy and record loss - prec1, prec5 = accuracy(output, target, topk=(1, 5)) - losses.update(loss.item(), input.size(0)) - top1.update(prec1[0], input.size(0)) - top5.update(prec5[0], input.size(0)) - - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if i % 100 == 0: - print('Test: [{0}/{1}]\t' - 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' - 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' - 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' - 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( - i, len(val_loader), batch_time=batch_time, loss=losses, - top1=top1, top5=top5)) - - print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}' - .format(top1=top1, top5=top5)) - - return top1.avg - - -###################################################################### -# Inputs -# ~~~~~~ -# -# With the helper functions out of the way, now we have reached the -# interesting part. Here is where we will define the inputs for the run. -# Some of the inputs are standard model training inputs such as batch size -# and number of training epochs, and some are specific to our distributed -# training task. The required inputs are: -# -# - **batch\_size** - batch size for *each* process in the distributed -# training group. Total batch size across distributed model is -# batch\_size\*world\_size -# -# - **workers** - number of worker processes used with the dataloaders in -# each process -# -# - **num\_epochs** - total number of epochs to train for -# -# - **starting\_lr** - starting learning rate for training -# -# - **world\_size** - number of processes in the distributed training -# environment -# -# - **dist\_backend** - backend to use for distributed training -# communication (i.e. NCCL, Gloo, MPI, etc.). In this tutorial, since -# we are using several multi-gpu nodes, NCCL is suggested. -# -# - **dist\_url** - URL to specify the initialization method of the -# process group. This may contain the IP address and port of the rank0 -# process or be a non-existant file on a shared file system. Here, -# since we do not have a shared file system this will incorporate the -# **node0-privateIP** and the port on node0 to use. -# - -print("Collect Inputs...") - -# Batch Size for training and testing -batch_size = 32 - -# Number of additional worker processes for dataloading -workers = 2 - -# Number of epochs to train for -num_epochs = 2 - -# Starting Learning Rate -starting_lr = 0.1 - -# Number of distributed processes -world_size = 4 - -# Distributed backend type -dist_backend = 'nccl' - -# Url used to setup distributed training -dist_url = "tcp://172.31.22.234:23456" - - -###################################################################### -# Initialize process group -# ~~~~~~~~~~~~~~~~~~~~~~~~ -# -# One of the most important parts of distributed training in PyTorch is to -# properly setup the process group, which is the **first** step in -# initializing the ``torch.distributed`` package. To do this, we will use -# the ``torch.distributed.init_process_group`` function which takes -# several inputs. 
First, a *backend* input which specifies the backend to -# use (i.e. NCCL, Gloo, MPI, etc.). An *init\_method* input which is -# either a url containing the address and port of the rank0 machine or a -# path to a non-existant file on the shared file system. Note, to use the -# file init\_method, all machines must have access to the file, similarly -# for the url method, all machines must be able to communicate on the -# network so make sure to configure any firewalls and network settings to -# accomodate. The *init\_process\_group* function also takes *rank* and -# *world\_size* arguments which specify the rank of this process when run -# and the number of processes in the collective, respectively. -# The *init\_method* input can also be "env://". In this case, the address -# and port of the rank0 machine will be read from the following two -# environment variables respectively: MASTER_ADDR, MASTER_PORT. If *rank* -# and *world\_size* arguments are not specified in the *init\_process\_group* -# function, they both can be read from the following two environment -# variables respectively as well: RANK, WORLD_SIZE. -# -# Another important step, especially when each node has multiple gpus is -# to set the *local\_rank* of this process. For example, if you have two -# nodes, each with 8 GPUs and you wish to train with all of them then -# :math:`world\_size=16` and each node will have a process with local rank -# 0-7. This local\_rank is used to set the device (i.e. which GPU to use) -# for the process and later used to set the device when creating a -# distributed data parallel model. It is also recommended to use NCCL -# backend in this hypothetical environment as NCCL is preferred for -# multi-gpu nodes. -# - -print("Initialize Process Group...") -# Initialize Process Group -# v1 - init with url -dist.init_process_group(backend=dist_backend, init_method=dist_url, rank=int(sys.argv[1]), world_size=world_size) -# v2 - init with file -# dist.init_process_group(backend="nccl", init_method="file:///home/ubuntu/pt-distributed-tutorial/trainfile", rank=int(sys.argv[1]), world_size=world_size) -# v3 - init with environment variables -# dist.init_process_group(backend="nccl", init_method="env://", rank=int(sys.argv[1]), world_size=world_size) - - -# Establish Local Rank and set device on this node -local_rank = int(sys.argv[2]) -dp_device_ids = [local_rank] -torch.cuda.set_device(local_rank) - - -###################################################################### -# Initialize Model -# ~~~~~~~~~~~~~~~~ -# -# The next major step is to initialize the model to be trained. Here, we -# will use a resnet18 model from ``torchvision.models`` but any model may -# be used. First, we initialize the model and place it in GPU memory. -# Next, we make the model ``DistributedDataParallel``, which handles the -# distribution of the data to and from the model and is critical for -# distributed training. The ``DistributedDataParallel`` module also -# handles the averaging of gradients across the world, so we do not have -# to explicitly average the gradients in the training step. -# -# It is important to note that this is a blocking function, meaning -# program execution will wait at this function until *world\_size* -# processes have joined the process group. Also, notice we pass our device -# ids list as a parameter which contains the local rank (i.e. GPU) we are -# using. Finally, we specify the loss function and optimizer to train the -# model with. 
-# - -print("Initialize Model...") -# Construct Model -model = models.resnet18(pretrained=False).cuda() -# Make model DistributedDataParallel -model = torch.nn.parallel.DistributedDataParallel(model, device_ids=dp_device_ids, output_device=local_rank) - -# define loss function (criterion) and optimizer -criterion = nn.CrossEntropyLoss().cuda() -optimizer = torch.optim.SGD(model.parameters(), starting_lr, momentum=0.9, weight_decay=1e-4) - - -###################################################################### -# Initialize Dataloaders -# ~~~~~~~~~~~~~~~~~~~~~~ -# -# The last step in preparation for the training is to specify which -# dataset to use. Here we use the `STL-10 -# dataset `__ from -# `torchvision.datasets.STL10 `__. -# The STL10 dataset is a 10 class dataset of 96x96px color images. For use -# with our model, we resize the images to 224x224px in the transform. One -# distributed training specific item in this section is the use of the -# ``DistributedSampler`` for the training set, which is designed to be -# used in conjunction with ``DistributedDataParallel`` models. This object -# handles the partitioning of the dataset across the distributed -# environment so that not all models are training on the same subset of -# data, which would be counterproductive. Finally, we create the -# ``DataLoader``'s which are responsible for feeding the data to the -# processes. -# -# The STL-10 dataset will automatically download on the nodes if they are -# not present. If you wish to use your own dataset you should download the -# data, write your own dataset handler, and construct a dataloader for -# your dataset here. -# - -print("Initialize Dataloaders...") -# Define the transform for the data. Notice, we must resize to 224x224 with this dataset and model. -transform = transforms.Compose( - [transforms.Resize(224), - transforms.ToTensor(), - transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) - -# Initialize Datasets. STL10 will automatically download if not present -trainset = datasets.STL10(root='./data', split='train', download=True, transform=transform) -valset = datasets.STL10(root='./data', split='test', download=True, transform=transform) - -# Create DistributedSampler to handle distributing the dataset across nodes when training -# This can only be called after torch.distributed.init_process_group is called -train_sampler = torch.utils.data.distributed.DistributedSampler(trainset) - -# Create the Dataloaders to feed data to the training and validation steps -train_loader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=(train_sampler is None), num_workers=workers, pin_memory=False, sampler=train_sampler) -val_loader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False, num_workers=workers, pin_memory=False) - - -###################################################################### -# Training Loop -# ~~~~~~~~~~~~~ -# -# The last step is to define the training loop. We have already done most -# of the work for setting up the distributed training so this is not -# distributed training specific. The only detail is setting the current -# epoch count in the ``DistributedSampler``, as the sampler shuffles the -# data going to each process deterministically based on epoch. After -# updating the sampler, the loop runs a full training epoch, runs a full -# validation step then prints the performance of the current model against -# the best performing model so far. 
After training for num\_epochs, the -# loop exits and the tutorial is complete. Notice, since this is an -# exercise we are not saving models but one may wish to keep track of the -# best performing model then save it at the end of training (see -# `here `__). -# - -best_prec1 = 0 - -for epoch in range(num_epochs): - # Set epoch count for DistributedSampler - train_sampler.set_epoch(epoch) - - # Adjust learning rate according to schedule - adjust_learning_rate(starting_lr, optimizer, epoch) - - # train for one epoch - print("\nBegin Training Epoch {}".format(epoch+1)) - train(train_loader, model, criterion, optimizer, epoch) - - # evaluate on validation set - print("Begin Validation @ Epoch {}".format(epoch+1)) - prec1 = validate(val_loader, model, criterion) - - # remember best prec@1 and save checkpoint if desired - # is_best = prec1 > best_prec1 - best_prec1 = max(prec1, best_prec1) - - print("Epoch Summary: ") - print("\tEpoch Accuracy: {}".format(prec1)) - print("\tBest Accuracy: {}".format(best_prec1)) - - -###################################################################### -# Running the Code -# ---------------- -# -# Unlike most of the other PyTorch tutorials, this code may not be run -# directly out of this notebook. To run, download the .py version of this -# file (or convert it using -# `this `__) -# and upload a copy to both nodes. The astute reader would have noticed -# that we hardcoded the **node0-privateIP** and :math:`world\_size=4` but -# input the *rank* and *local\_rank* inputs as arg[1] and arg[2] command -# line arguments, respectively. Once uploaded, open two ssh terminals into -# each node. -# -# - On the first terminal for node0, run ``$ python main.py 0 0`` -# -# - On the second terminal for node0 run ``$ python main.py 1 1`` -# -# - On the first terminal for node1, run ``$ python main.py 2 0`` -# -# - On the second terminal for node1 run ``$ python main.py 3 1`` -# -# The programs will start and wait after printing "Initialize Model..." -# for all four processes to join the process group. Notice the first -# argument is not repeated as this is the unique global rank of the -# process. The second argument is repeated as that is the local rank of -# the process running on the node. If you run ``nvidia-smi`` on each node, -# you will see two processes on each node, one running on GPU0 and one on -# GPU1. -# -# We have now completed the distributed training example! Hopefully you -# can see how you would use this tutorial to help train your own models on -# your own datasets, even if you are not using the exact same distributed -# envrionment. If you are using AWS, don't forget to **SHUT DOWN YOUR -# NODES** if you are not using them or you may find an uncomfortably large -# bill at the end of the month. -# -# **Where to go next** -# -# - Check out the `launcher -# utility `__ -# for a different way of kicking off the run -# -# - Check out the `torch.multiprocessing.spawn -# utility `__ -# for another easy way of kicking off multiple distributed processes. -# `PyTorch ImageNet Example `__ -# has it implemented and can demonstrate how to use it. -# -# - If possible, setup a NFS so you only need one copy of the dataset -# diff --git a/beginner_source/basics/README.txt b/beginner_source/basics/README.txt new file mode 100644 index 000000000..65802da61 --- /dev/null +++ b/beginner_source/basics/README.txt @@ -0,0 +1,39 @@ +Learn the Basics +------------------ + +1. intro.py + Learn the Basics + https://pytorch.org/tutorials/beginner/basics/intro.html + +2. 
quickstart_tutorial.py
+   Quickstart
+   https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html
+
+3. tensors_tutorial.py
+   Tensors
+   https://pytorch.org/tutorials/beginner/basics/tensor_tutorial.html
+
+4. dataquickstart_tutorial.py
+   Datasets & DataLoaders
+   https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
+
+5. transforms_tutorial.py
+   Transforms
+   https://pytorch.org/tutorials/beginner/basics/transforms_tutorial.html
+
+6. buildmodel_tutorial.py
+   Building the Neural Network
+   https://pytorch.org/tutorials/beginner/basics/buildmodel_tutorial.html
+
+7. autograd_tutorial.py
+   Automatic Differentiation with torch.autograd
+   https://pytorch.org/tutorials/beginner/basics/autograd_tutorial.html
+
+8. optimization_tutorial.py
+   Optimizing Model Parameters
+   https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html
+
+9. saveloadrun_tutorial.py
+   Save and Load the Model
+   https://pytorch.org/tutorials/beginner/basics/saveloadrun_tutorial.html
+
diff --git a/beginner_source/basics/autogradqs_tutorial.py b/beginner_source/basics/autogradqs_tutorial.py
new file mode 100644
index 000000000..8a92fc024
--- /dev/null
+++ b/beginner_source/basics/autogradqs_tutorial.py
@@ -0,0 +1,243 @@
+"""
+`Learn the Basics `_ ||
+`Quickstart `_ ||
+`Tensors `_ ||
+`Datasets & DataLoaders `_ ||
+`Transforms `_ ||
+`Build Model `_ ||
+**Autograd** ||
+`Optimization `_ ||
+`Save & Load Model `_
+
+Automatic Differentiation with ``torch.autograd``
+==================================================
+
+When training neural networks, the most frequently used algorithm is
+**back propagation**. In this algorithm, parameters (model weights) are
+adjusted according to the **gradient** of the loss function with respect
+to the given parameter.
+
+To compute those gradients, PyTorch has a built-in differentiation engine
+called ``torch.autograd``. It supports automatic computation of gradients for any
+computational graph.
+
+Consider the simplest one-layer neural network, with input ``x``,
+parameters ``w`` and ``b``, and some loss function. It can be defined in
+PyTorch in the following manner:
+"""
+
+import torch
+
+x = torch.ones(5)  # input tensor
+y = torch.zeros(3)  # expected output
+w = torch.randn(5, 3, requires_grad=True)
+b = torch.randn(3, requires_grad=True)
+z = torch.matmul(x, w)+b
+loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)
+
+
+######################################################################
+# Tensors, Functions and Computational graph
+# ------------------------------------------
+#
+# This code defines the following **computational graph**:
+#
+# .. figure:: /_static/img/basics/comp-graph.png
+#    :alt:
+#
+# In this network, ``w`` and ``b`` are **parameters**, which we need to
+# optimize. Thus, we need to be able to compute the gradients of the loss
+# function with respect to those variables. In order to do that, we set
+# the ``requires_grad`` property of those tensors.
+
+#######################################################################
+# .. note:: You can set the value of ``requires_grad`` when creating a
+#           tensor, or later by using the ``x.requires_grad_(True)`` method.
+
+#######################################################################
+# A function that we apply to tensors to construct the computational graph is
+# in fact an object of class ``Function``. This object knows how to
+# compute the function in the *forward* direction, and also how to compute
+# its derivative during the *backward propagation* step.
A reference to +# the backward propagation function is stored in ``grad_fn`` property of a +# tensor. You can find more information of ``Function`` `in the +# documentation `__. +# + +print('Gradient function for z =',z.grad_fn) +print('Gradient function for loss =', loss.grad_fn) + +###################################################################### +# Computing Gradients +# ------------------- +# +# To optimize weights of parameters in the neural network, we need to +# compute the derivatives of our loss function with respect to parameters, +# namely, we need :math:`\frac{\partial loss}{\partial w}` and +# :math:`\frac{\partial loss}{\partial b}` under some fixed values of +# ``x`` and ``y``. To compute those derivatives, we call +# ``loss.backward()``, and then retrieve the values from ``w.grad`` and +# ``b.grad``: +# + +loss.backward() +print(w.grad) +print(b.grad) + + +###################################################################### +# .. note:: +# - We can only obtain the ``grad`` properties for the leaf +# nodes of the computational graph, which have ``requires_grad`` property +# set to ``True``. For all other nodes in our graph, gradients will not be +# available. +# - We can only perform gradient calculations using +# ``backward`` once on a given graph, for performance reasons. If we need +# to do several ``backward`` calls on the same graph, we need to pass +# ``retain_graph=True`` to the ``backward`` call. +# + + +###################################################################### +# Disabling Gradient Tracking +# --------------------------- +# +# By default, all tensors with ``requires_grad=True`` are tracking their +# computational history and support gradient computation. However, there +# are some cases when we do not need to do that, for example, when we have +# trained the model and just want to apply it to some input data, i.e. we +# only want to do *forward* computations through the network. We can stop +# tracking computations by surrounding our computation code with +# ``torch.no_grad()`` block: +# + +z = torch.matmul(x, w)+b +print(z.requires_grad) + +with torch.no_grad(): + z = torch.matmul(x, w)+b +print(z.requires_grad) + + +###################################################################### +# Another way to achieve the same result is to use the ``detach()`` method +# on the tensor: +# + +z = torch.matmul(x, w)+b +z_det = z.detach() +print(z_det.requires_grad) + +###################################################################### +# There are reasons you might want to disable gradient tracking: +# - To mark some parameters in your neural network at **frozen parameters**. This is +# a very common scenario for +# `finetuning a pretrained network `__ +# - To **speed up computations** when you are only doing forward pass, because computations on tensors that do +# not track gradients would be more efficient. + + +###################################################################### + +###################################################################### +# More on Computational Graphs +# ---------------------------- +# Conceptually, autograd keeps a record of data (tensors) and all executed +# operations (along with the resulting new tensors) in a directed acyclic +# graph (DAG) consisting of +# `Function `__ +# objects. In this DAG, leaves are the input tensors, roots are the output +# tensors. By tracing this graph from roots to leaves, you can +# automatically compute the gradients using the chain rule. 
+# +# In a forward pass, autograd does two things simultaneously: +# +# - run the requested operation to compute a resulting tensor +# - maintain the operation’s *gradient function* in the DAG. +# +# The backward pass kicks off when ``.backward()`` is called on the DAG +# root. ``autograd`` then: +# +# - computes the gradients from each ``.grad_fn``, +# - accumulates them in the respective tensor’s ``.grad`` attribute +# - using the chain rule, propagates all the way to the leaf tensors. +# +# .. note:: +# **DAGs are dynamic in PyTorch** +# An important thing to note is that the graph is recreated from scratch; after each +# ``.backward()`` call, autograd starts populating a new graph. This is +# exactly what allows you to use control flow statements in your model; +# you can change the shape, size and operations at every iteration if +# needed. + +###################################################################### +# Optional Reading: Tensor Gradients and Jacobian Products +# -------------------------------------- +# +# In many cases, we have a scalar loss function, and we need to compute +# the gradient with respect to some parameters. However, there are cases +# when the output function is an arbitrary tensor. In this case, PyTorch +# allows you to compute so-called **Jacobian product**, and not the actual +# gradient. +# +# For a vector function :math:`\vec{y}=f(\vec{x})`, where +# :math:`\vec{x}=\langle x_1,\dots,x_n\rangle` and +# :math:`\vec{y}=\langle y_1,\dots,y_m\rangle`, a gradient of +# :math:`\vec{y}` with respect to :math:`\vec{x}` is given by **Jacobian +# matrix**: +# +# .. math:: +# +# +# \begin{align}J=\left(\begin{array}{ccc} +# \frac{\partial y_{1}}{\partial x_{1}} & \cdots & \frac{\partial y_{1}}{\partial x_{n}}\\ +# \vdots & \ddots & \vdots\\ +# \frac{\partial y_{m}}{\partial x_{1}} & \cdots & \frac{\partial y_{m}}{\partial x_{n}} +# \end{array}\right)\end{align} +# +# Instead of computing the Jacobian matrix itself, PyTorch allows you to +# compute **Jacobian Product** :math:`v^T\cdot J` for a given input vector +# :math:`v=(v_1 \dots v_m)`. This is achieved by calling ``backward`` with +# :math:`v` as an argument. The size of :math:`v` should be the same as +# the size of the original tensor, with respect to which we want to +# compute the product: +# + +inp = torch.eye(5, requires_grad=True) +out = (inp+1).pow(2) +out.backward(torch.ones_like(inp), retain_graph=True) +print("First call\n", inp.grad) +out.backward(torch.ones_like(inp), retain_graph=True) +print("\nSecond call\n", inp.grad) +inp.grad.zero_() +out.backward(torch.ones_like(inp), retain_graph=True) +print("\nCall after zeroing gradients\n", inp.grad) + + +###################################################################### +# Notice that when we call ``backward`` for the second time with the same +# argument, the value of the gradient is different. This happens because +# when doing ``backward`` propagation, PyTorch **accumulates the +# gradients**, i.e. the value of computed gradients is added to the +# ``grad`` property of all leaf nodes of computational graph. If you want +# to compute the proper gradients, you need to zero out the ``grad`` +# property before. In real-life training an *optimizer* helps us to do +# this. + +###################################################################### +# .. note:: Previously we were calling ``backward()`` function without +# parameters. 
This is essentially equivalent to calling +# ``backward(torch.tensor(1.0))``, which is a useful way to compute the +# gradients in case of a scalar-valued function, such as loss during +# neural network training. +# + +###################################################################### +# -------------- +# + +################################################################# +# Further Reading +# ~~~~~~~~~~~~~~~~~ +# - `Autograd Mechanics `_ + diff --git a/beginner_source/basics/buildmodel_tutorial.py b/beginner_source/basics/buildmodel_tutorial.py new file mode 100644 index 000000000..bbcc8f8a3 --- /dev/null +++ b/beginner_source/basics/buildmodel_tutorial.py @@ -0,0 +1,200 @@ +""" +`Learn the Basics `_ || +`Quickstart `_ || +`Tensors `_ || +`Datasets & DataLoaders `_ || +`Transforms `_ || +**Build Model** || +`Autograd `_ || +`Optimization `_ || +`Save & Load Model `_ + +Build the Neural Network +=================== + +Neural networks comprise of layers/modules that perform operations on data. +The `torch.nn `_ namespace provides all the building blocks you need to +build your own neural network. Every module in PyTorch subclasses the `nn.Module `_. +A neural network is a module itself that consists of other modules (layers). This nested structure allows for +building and managing complex architectures easily. + +In the following sections, we'll build a neural network to classify images in the FashionMNIST dataset. + +""" + +import os +import torch +from torch import nn +from torch.utils.data import DataLoader +from torchvision import datasets, transforms + + +############################################# +# Get Device for Training +# ----------------------- +# We want to be able to train our model on a hardware accelerator like the GPU, +# if it is available. Let's check to see if +# `torch.cuda `_ is available, else we +# continue to use the CPU. + +device = 'cuda' if torch.cuda.is_available() else 'cpu' +print('Using {} device'.format(device)) + +############################################## +# Define the Class +# ------------------------- +# We define our neural network by subclassing ``nn.Module``, and +# initialize the neural network layers in ``__init__``. Every ``nn.Module`` subclass implements +# the operations on input data in the ``forward`` method. + +class NeuralNetwork(nn.Module): + def __init__(self): + super(NeuralNetwork, self).__init__() + self.flatten = nn.Flatten() + self.linear_relu_stack = nn.Sequential( + nn.Linear(28*28, 512), + nn.ReLU(), + nn.Linear(512, 512), + nn.ReLU(), + nn.Linear(512, 10), + nn.ReLU() + ) + + def forward(self, x): + x = self.flatten(x) + logits = self.linear_relu_stack(x) + return logits + +############################################## +# We create an instance of ``NeuralNetwork``, and move it to the ``device``, and print +# it's structure. + +model = NeuralNetwork().to(device) +print(model) + + +############################################## +# To use the model, we pass it the input data. This executes the model's ``forward``, +# along with some `background operations `_. +# Do not call ``model.forward()`` directly! +# +# Calling the model on the input returns a 10-dimensional tensor with raw predicted values for each class. +# We get the prediction probabilities by passing it through an instance of the ``nn.Softmax`` module. 
+ +X = torch.rand(1, 28, 28, device=device) +logits = model(X) +pred_probab = nn.Softmax(dim=1)(logits) +y_pred = pred_probab.argmax(1) +print(f"Predicted class: {y_pred}") + + +###################################################################### +# -------------- +# + + +############################################## +# Model Layers +# ------------------------- +# +# Lets break down the layers in the FashionMNIST model. To illustrate it, we +# will take a sample minibatch of 3 images of size 28x28 and see what happens to it as +# we pass it through the network. + +input_image = torch.rand(3,28,28) +print(input_image.size()) + +################################################## +# nn.Flatten +# ^^^^^^^^^^^^^^^^^^^^^^ +# We initialize the `nn.Flatten `_ +# layer to convert each 2D 28x28 image into a contiguous array of 784 pixel values ( +# the minibatch dimension (at dim=0) is maintained). + +flatten = nn.Flatten() +flat_image = flatten(input_image) +print(flat_image.size()) + +############################################## +# nn.Linear +# ^^^^^^^^^^^^^^^^^^^^^^ +# The `linear layer `_ +# is a module that applies a linear transformation on the input using it's stored weights and biases. +# +layer1 = nn.Linear(in_features=28*28, out_features=20) +hidden1 = layer1(flat_image) +print(hidden1.size()) + + +################################################# +# nn.ReLU +# ^^^^^^^^^^^^^^^^^^^^^^ +# Non-linear activations are what create the complex mappings between the model's inputs and outputs. +# They are applied after linear transformations to introduce *nonlinearity*, helping neural networks +# learn a wide variety of phenomena. +# +# In this model, we use `nn.ReLU `_ between our +# linear layers, but there's other activations to introduce non-linearity in your model. + +print(f"Before ReLU: {hidden1}\n\n") +hidden1 = nn.ReLU()(hidden1) +print(f"After ReLU: {hidden1}") + + + +################################################# +# nn.Sequential +# ^^^^^^^^^^^^^^^^^^^^^^ +# `nn.Sequential `_ is an ordered +# container of modules. The data is passed through all the modules in the same order as defined. You can use +# sequential containers to put together a quick network like ``seq_modules``. + +seq_modules = nn.Sequential( + flatten, + layer1, + nn.ReLU(), + nn.Linear(20, 10) +) +input_image = torch.rand(3,28,28) +logits = seq_modules(input_image) + +################################################################ +# nn.Softmax +# ^^^^^^^^^^^^^^^^^^^^^^ +# The last linear layer of the neural network returns `logits` - raw values in [-\infty, \infty] - which are passed to the +# `nn.Softmax `_ module. The logits are scaled to values +# [0, 1] representing the model's predicted probabilities for each class. ``dim`` parameter indicates the dimension along +# which the values must sum to 1. + +softmax = nn.Softmax(dim=1) +pred_probab = softmax(logits) + + +################################################# +# Model Parameters +# ------------------------- +# Many layers inside a neural network are *parameterized*, i.e. have associated weights +# and biases that are optimized during training. Subclassing ``nn.Module`` automatically +# tracks all fields defined inside your model object, and makes all parameters +# accessible using your model's ``parameters()`` or ``named_parameters()`` methods. +# +# In this example, we iterate over each parameter, and print its size and a preview of its values. 
+# + + +print("Model structure: ", model, "\n\n") + +for name, param in model.named_parameters(): + print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n") + +###################################################################### +# -------------- +# + +################################################################# +# Further Reading +# -------------- +# - `torch.nn API `_ + + + diff --git a/beginner_source/basics/data_tutorial.py b/beginner_source/basics/data_tutorial.py new file mode 100644 index 000000000..383dcbc88 --- /dev/null +++ b/beginner_source/basics/data_tutorial.py @@ -0,0 +1,251 @@ +""" +`Learn the Basics `_ || +`Quickstart `_ || +`Tensors `_ || +**Datasets & DataLoaders** || +`Transforms `_ || +`Build Model `_ || +`Autograd `_ || +`Optimization `_ || +`Save & Load Model `_ + +Datasets & Dataloaders +=================== + +""" + +################################################################# +# Code for processing data samples can get messy and hard to maintain; we ideally want our dataset code +# to be decoupled from our model training code for better readability and modularity. +# PyTorch provides two data primitives: ``torch.utils.data.DataLoader`` and ``torch.utils.data.Dataset`` +# that allow you to use pre-loaded datasets as well as your own data. +# ``Dataset`` stores the samples and their corresponding labels, and ``DataLoader`` wraps an iterable around +# the ``Dataset`` to enable easy access to the samples. +# +# PyTorch domain libraries provide a number of pre-loaded datasets (such as FashionMNIST) that +# subclass ``torch.utils.data.Dataset`` and implement functions specific to the particular data. +# They can be used to prototype and benchmark your model. You can find them +# here: `Image Datasets `_, +# `Text Datasets `_, and +# `Audio Datasets `_ +# + +############################################################ +# Loading a Dataset +# ------------------- +# +# Here is an example of how to load the `Fashion-MNIST `_ dataset from TorchVision. +# Fashion-MNIST is a dataset of Zalando’s article images consisting of of 60,000 training examples and 10,000 test examples. +# Each example comprises a 28×28 grayscale image and an associated label from one of 10 classes. +# +# We load the `FashionMNIST Dataset `_ with the following parameters: +# - ``root`` is the path where the train/test data is stored, +# - ``train`` specifies training or test dataset, +# - ``download=True`` downloads the data from the internet if it's not available at ``root``. +# - ``transform`` and ``target_transform`` specify the feature and label transformations + + +import torch +from torch.utils.data import Dataset +from torchvision import datasets +from torchvision.transforms import ToTensor, Lambda +import matplotlib.pyplot as plt + + +training_data = datasets.FashionMNIST( + root="data", + train=True, + download=True, + transform=ToTensor() +) + +test_data = datasets.FashionMNIST( + root="data", + train=False, + download=True, + transform=ToTensor() +) + + +################################################################# +# Iterating and Visualizing the Dataset +# ----------------- +# +# We can index ``Datasets`` manually like a list: ``training_data[index]``. +# We use ``matplotlib`` to visualize some samples in our training data. 
+ +labels_map = { + 0: "T-Shirt", + 1: "Trouser", + 2: "Pullover", + 3: "Dress", + 4: "Coat", + 5: "Sandal", + 6: "Shirt", + 7: "Sneaker", + 8: "Bag", + 9: "Ankle Boot", +} +figure = plt.figure(figsize=(8, 8)) +cols, rows = 3, 3 +for i in range(1, cols * rows + 1): + sample_idx = torch.randint(len(training_data), size=(1,)).item() + img, label = training_data[sample_idx] + figure.add_subplot(rows, cols, i) + plt.title(labels_map[label]) + plt.axis("off") + plt.imshow(img.squeeze(), cmap="gray") +plt.show() + +################################################################# +# .. +# .. figure:: /_static/img/basics/fashion_mnist.png +# :alt: fashion_mnist + + +###################################################################### +# -------------- +# + +################################################################# +# Creating a Custom Dataset for your files +# --------------------------------------------------- +# +# A custom Dataset class must implement three functions: `__init__`, `__len__`, and `__getitem__`. +# Take a look at this implementation; the FashionMNIST images are stored +# in a directory ``img_dir``, and their labels are stored separately in a CSV file ``annotations_file``. +# +# In the next sections, we'll break down what's happening in each of these functions. + + +import os +import pandas as pd +from torchvision.io import read_image + +class CustomImageDataset(Dataset): + def __init__(self, annotations_file, img_dir, transform=None, target_transform=None): + self.img_labels = pd.read_csv(annotations_file) + self.img_dir = img_dir + self.transform = transform + self.target_transform = target_transform + + def __len__(self): + return len(self.img_labels) + + def __getitem__(self, idx): + img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0]) + image = read_image(img_path) + label = self.img_labels.iloc[idx, 1] + if self.transform: + image = self.transform(image) + if self.target_transform: + label = self.target_transform(label) + sample = {"image": image, "label": label} + return sample + + +################################################################# +# __init__ +# ^^^^^^^^^^^^^^^^^^^^ +# +# The __init__ function is run once when instantiating the Dataset object. We initialize +# the directory containing the images, the annotations file, and both transforms (covered +# in more detail in the next section). +# +# The labels.csv file looks like: :: +# +# tshirt1.jpg, 0 +# tshirt2.jpg, 0 +# ...... +# ankleboot999.jpg, 9 + + +def __init__(self, annotations_file, img_dir, transform=None, target_transform=None): + self.img_labels = pd.read_csv(annotations_file) + self.img_dir = img_dir + self.transform = transform + self.target_transform = target_transform + + +################################################################# +# __len__ +# ^^^^^^^^^^^^^^^^^^^^ +# +# The __len__ function returns the number of samples in our dataset. +# +# Example: + + +def __len__(self): + return len(self.img_labels) + + +################################################################# +# __getitem__ +# ^^^^^^^^^^^^^^^^^^^^ +# +# The __getitem__ function loads and returns a sample from the dataset at the given index ``idx``. +# Based on the index, it identifies the image's location on disk, converts that to a tensor using ``read_image``, retrieves the +# corresponding label from the csv data in ``self.img_labels``, calls the transform functions on them (if applicable), and returns the +# tensor image and corresponding label in a Python dict. 
+ +def __getitem__(self, idx): + img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0]) + image = read_image(img_path) + label = self.img_labels.iloc[idx, 1] + if self.transform: + image = self.transform(image) + if self.target_transform: + label = self.target_transform(label) + sample = {"image": image, "label": label} + return sample + + +###################################################################### +# -------------- +# + + +################################################################# +# Preparing your data for training with DataLoaders +# ------------------------------------------------- +# The ``Dataset`` retrieves our dataset's features and labels one sample at a time. While training a model, we typically want to +# pass samples in "minibatches", reshuffle the data at every epoch to reduce model overfitting, and use Python's ``multiprocessing`` to +# speed up data retrieval. +# +# ``DataLoader`` is an iterable that abstracts this complexity for us in an easy API. + +from torch.utils.data import DataLoader + +train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True) +test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True) + +########################### +# Iterate through the DataLoader +# -------------------------- +# +# We have loaded that dataset into the ``Dataloader`` and can iterate through the dataset as needed. +# Each iteration below returns a batch of ``train_features`` and ``train_labels``(containing ``batch_size=64`` features and labels respectively). +# Because we specified ``shuffle=True``, after we iterate over all batches the data is shuffled (for finer-grained control over +# the data loading order, take a look at `Samplers `_). + +# Display image and label. +train_features, train_labels = next(iter(train_dataloader)) +print(f"Feature batch shape: {train_features.size()}") +print(f"Labels batch shape: {train_labels.size()}") +img = train_features[0].squeeze() +label = train_labels[0] +plt.imshow(img, cmap="gray") +plt.show() +print(f"Label: {label}") + +###################################################################### +# -------------- +# + +################################################################# +# Further Reading +# -------------- +# - `torch.utils.data API `_ + + diff --git a/beginner_source/basics/intro.py b/beginner_source/basics/intro.py new file mode 100644 index 000000000..3fe251985 --- /dev/null +++ b/beginner_source/basics/intro.py @@ -0,0 +1,54 @@ +""" +**Learn the Basics** || +`Quickstart `_ || +`Tensors `_ || +`Datasets & DataLoaders `_ || +`Transforms `_ || +`Build Model `_ || +`Autograd `_ || +`Optimization `_ || +`Save & Load Model `_ + +Learn the Basics +=================== + +Authors: +`Suraj Subramanian `_, +`Seth Juarez `_, +`Cassie Breviu `_, +`Dmitry Soshnikov `_, +`Ari Bornstein `_ + +Most machine learning workflows involve working with data, creating models, optimizing model +parameters, and saving the trained models. This tutorial introduces you to a complete ML workflow +implemented in PyTorch, with links to learn more about each of these concepts. + +We'll use the FashionMNIST dataset to train a neural network that predicts if an input image belongs +to one of the following classes: T-shirt/top, Trouser, Pullover, Dress, Coat, Sandal, Shirt, Sneaker, +Bag, or Ankle boot. 
+ +`This tutorial assumes a basic familiarity with Python and Deep Learning concepts.` + + +Running the Tutorial Code +------------------ +You can run this tutorial in a couple of ways: + +- **In the cloud**: This is the easiest way to get started! Each section has a Colab link at the top, which opens a notebook with the code in a fully-hosted environment. Pro tip: Use Colab with a GPU runtime to speed up operations *Runtime > Change runtime type > GPU* +- **Locally**: This option requires you to setup PyTorch and TorchVision first on your local machine (`installation instructions `_). Download the notebook or copy the code into your favorite IDE. + + +How to Use this Guide +----------------- +If you're familiar with other deep learning frameworks, check out the `0. Quickstart `_ first +to quickly familiarize yourself with PyTorch's API. + +If you're new to deep learning frameworks, head right into the first section of our step-by-step guide: `1. Tensors `_. + + +.. include:: /beginner_source/basics/qs_toc.txt + +.. toctree:: + :hidden: + +""" \ No newline at end of file diff --git a/beginner_source/basics/optimization_tutorial.py b/beginner_source/basics/optimization_tutorial.py new file mode 100644 index 000000000..d08d052ba --- /dev/null +++ b/beginner_source/basics/optimization_tutorial.py @@ -0,0 +1,206 @@ +""" +`Learn the Basics `_ || +`Quickstart `_ || +`Tensors `_ || +`Datasets & DataLoaders `_ || +`Transforms `_ || +`Build Model `_ || +`Autograd `_ || +**Optimization** || +`Save & Load Model `_ + +Optimizing Model Parameters +=========================== + +Now that we have a model and data it's time to train, validate and test our model by optimizing it's parameters on +our data. Training a model is an iterative process; in each iteration (called an *epoch*) the model makes a guess about the output, calculates +the error in its guess (*loss*), collects the derivatives of the error with respect to its parameters (as we saw in +the `previous section `_), and **optimizes** these parameters using gradient descent. For a more +detailed walkthrough of this process, check out this video on `backpropagation from 3Blue1Brown `__. + +Pre-requisite Code +----------------- +We load the code from the previous sections on `Datasets & DataLoaders `_ +and `Build Model `_. +""" + +import torch +from torch import nn +from torch.utils.data import DataLoader +from torchvision import datasets +from torchvision.transforms import ToTensor, Lambda + +training_data = datasets.FashionMNIST( + root="data", + train=True, + download=True, + transform=ToTensor() +) + +test_data = datasets.FashionMNIST( + root="data", + train=False, + download=True, + transform=ToTensor() +) + +train_dataloader = DataLoader(training_data, batch_size=64) +test_dataloader = DataLoader(test_data, batch_size=64) + +class NeuralNetwork(nn.Module): + def __init__(self): + super(NeuralNetwork, self).__init__() + self.flatten = nn.Flatten() + self.linear_relu_stack = nn.Sequential( + nn.Linear(28*28, 512), + nn.ReLU(), + nn.Linear(512, 512), + nn.ReLU(), + nn.Linear(512, 10), + nn.ReLU() + ) + + def forward(self, x): + x = self.flatten(x) + logits = self.linear_relu_stack(x) + return logits + +model = NeuralNetwork() + + +############################################## +# Hyperparameters +# ----------------- +# +# Hyperparameters are adjustable parameters that let you control the model optimization process. 
+# Different hyperparameter values can impact model training and convergence rates +# (`read more `__ about hyperparameter tuning) +# +# We define the following hyperparameters for training: +# - **Number of Epochs** - the number times to iterate over the dataset +# - **Batch Size** - the number of data samples seen by the model in each epoch +# - **Learning Rate** - how much to update models parameters at each batch/epoch. Smaller values yield slow learning speed, while large values may result in unpredictable behavior during training. +# + +learning_rate = 1e-3 +batch_size = 64 +epochs = 5 + + + +##################################### +# Optimization Loop +# ----------------- +# +# Once we set our hyperparameters, we can then train and optimize our model with an optimization loop. Each +# iteration of the optimization loop is called an **epoch**. +# +# Each epoch consists of two main parts: +# - **The Train Loop** - iterate over the training dataset and try to converge to optimal parameters. +# - **The Validation/Test Loop** - iterate over the test dataset to check if model performance is improving. +# +# Let's briefly familiarize ourselves with some of the concepts used in the training loop. Jump ahead to +# see the :ref:`full-impl-label` of the optimization loop. +# +# Loss Function +# ~~~~~~~~~~~~~~~~~ +# +# When presented with some training data, our untrained network is likely not to give the correct +# answer. **Loss function** measures the degree of dissimilarity of obtained result to the target value, +# and it is the loss function that we want to minimize during training. To calculate the loss we make a +# prediction using the inputs of our given data sample and compare it against the true data label value. +# +# Common loss functions include `nn.MSELoss `_ (Mean Square Error) for regression tasks, and +# `nn.NLLLoss `_ (Negative Log Likelihood) for classification. +# `nn.CrossEntropyLoss `_ combines ``nn.LogSoftmax`` and ``nn.NLLLoss``. +# +# We pass our model's output logits to ``nn.CrossEntropyLoss``, which will normalize the logits and compute the prediction error. + +# Initialize the loss function +loss_fn = nn.CrossEntropyLoss() + + + +##################################### +# Optimizer +# ~~~~~~~~~~~~~~~~~ +# +# Optimization is the process of adjusting model parameters to reduce model error in each training step. **Optimization algorithms** define how this process is performed (in this example we use Stochastic Gradient Descent). +# All optimization logic is encapsulated in the ``optimizer`` object. Here, we use the SGD optimizer; additionally, there are many `different optimizers `_ +# available in PyTorch such as ADAM and RMSProp, that work better for different kinds of models and data. +# +# We initialize the optimizer by registering the model's parameters that need to be trained, and passing in the learning rate hyperparameter. + +optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) + +##################################### +# Inside the training loop, optimization happens in three steps: +# * Call ``optimizer.zero_grad()`` to reset the gradients of model parameters. Gradients by default add up; to prevent double-counting, we explicitly zero them at each iteration. +# * Backpropagate the prediction loss with a call to ``loss.backwards()``. PyTorch deposits the gradients of the loss w.r.t. each parameter. +# * Once we have our gradients, we call ``optimizer.step()`` to adjust the parameters by the gradients collected in the backward pass. 
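+#####################################
+# Taken in isolation, one optimization step looks like this (a minimal
+# sketch using a single batch from the dataloader defined above); the next
+# section wraps these three calls into a complete training loop.
+
+X, y = next(iter(train_dataloader))   # one minibatch of features and labels
+pred = model(X)                       # forward pass
+loss = loss_fn(pred, y)
+
+optimizer.zero_grad()                 # reset previously accumulated gradients
+loss.backward()                       # compute gradients of the loss w.r.t. parameters
+optimizer.step()                      # update the parameters using those gradients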
+ + +######################################## +# .. _full-impl-label: +# +# Full Implementation +# ----------------------- +# We define ``train_loop`` that loops over our optimization code, and ``test_loop`` that +# evaluates the model's performance against our test data. + +def train_loop(dataloader, model, loss_fn, optimizer): + size = len(dataloader.dataset) + for batch, (X, y) in enumerate(dataloader): + # Compute prediction and loss + pred = model(X) + loss = loss_fn(pred, y) + + # Backpropagation + optimizer.zero_grad() + loss.backward() + optimizer.step() + + if batch % 100 == 0: + loss, current = loss.item(), batch * len(X) + print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]") + + +def test_loop(dataloader, model, loss_fn): + size = len(dataloader.dataset) + test_loss, correct = 0, 0 + + with torch.no_grad(): + for X, y in dataloader: + pred = model(X) + test_loss += loss_fn(pred, y).item() + correct += (pred.argmax(1) == y).type(torch.float).sum().item() + + test_loss /= size + correct /= size + print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n") + + +######################################## +# We initialize the loss function and optimizer, and pass it to ``train_loop`` and ``test_loop``. +# Feel free to increase the number of epochs to track the model's improving performance. + +loss_fn = nn.CrossEntropyLoss() +optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) + +epochs = 10 +for t in range(epochs): + print(f"Epoch {t+1}\n-------------------------------") + train_loop(train_dataloader, model, loss_fn, optimizer) + test_loop(test_dataloader, model, loss_fn) +print("Done!") + + + +################################################################# +# Further Reading +# ----------------------- +# - `Loss Functions `_ +# - `torch.optim `_ +# - `Warmstart Training a Model `_ +# + diff --git a/beginner_source/basics/qs_toc.txt b/beginner_source/basics/qs_toc.txt new file mode 100644 index 000000000..7a63e1c7e --- /dev/null +++ b/beginner_source/basics/qs_toc.txt @@ -0,0 +1,8 @@ +| 0. `Quickstart `_ +| 1. `Tensors `_ +| 2. `Datasets and DataLoaders `_ +| 3. `Transforms `_ +| 4. `Build Model `_ +| 5. `Automatic Differentiation `_ +| 6. `Optimization Loop `_ +| 7. `Save, Load and Use Model `_ diff --git a/beginner_source/basics/quickstart_tutorial.py b/beginner_source/basics/quickstart_tutorial.py new file mode 100644 index 000000000..5f7bb2335 --- /dev/null +++ b/beginner_source/basics/quickstart_tutorial.py @@ -0,0 +1,243 @@ +""" +`Learn the Basics `_ || +**Quickstart** || +`Tensors `_ || +`Datasets & DataLoaders `_ || +`Transforms `_ || +`Build Model `_ || +`Autograd `_ || +`Optimization `_ || +`Save & Load Model `_ + +Quickstart +=================== +This section runs through the API for common tasks in machine learning. Refer to the links in each section to dive deeper. + +Working with data +----------------- +PyTorch has two `primitives to work with data `_: +``torch.utils.data.DataLoader`` and ``torch.utils.data.Dataset``. +``Dataset`` stores the samples and their corresponding labels, and ``DataLoader`` wraps an iterable around +the ``Dataset``. 
+ +""" + +import torch +from torch import nn +from torch.utils.data import DataLoader +from torchvision import datasets +from torchvision.transforms import ToTensor, Lambda, Compose +import matplotlib.pyplot as plt + +###################################################################### +# PyTorch offers domain-specific libraries such as `TorchText `_, +# `TorchVision `_, and `TorchAudio `_, +# all of which include datasets. For this tutorial, we will be using a TorchVision dataset. +# +# The ``torchvision.datasets`` module contains ``Dataset`` objects for many real-world vision data like +# CIFAR, COCO (`full list here `_). In this tutorial, we +# use the FashionMNIST dataset. Every TorchVision ``Dataset`` includes two arguments: ``transform`` and +# ``target_transform`` to modify the samples and labels respectively. + +# Download training data from open datasets. +training_data = datasets.FashionMNIST( + root="data", + train=True, + download=True, + transform=ToTensor(), +) + +# Download test data from open datasets. +test_data = datasets.FashionMNIST( + root="data", + train=False, + download=True, + transform=ToTensor(), +) + +###################################################################### +# We pass the ``Dataset`` as an argument to ``DataLoader``. This wraps an iterable over our dataset, and supports +# automatic batching, sampling, shuffling and multiprocess data loading. Here we define a batch size of 64, i.e. each element +# in the dataloader iterable will return a batch of 64 features and labels. + +batch_size = 64 + +# Create data loaders. +train_dataloader = DataLoader(training_data, batch_size=batch_size) +test_dataloader = DataLoader(test_data, batch_size=batch_size) + +for X, y in test_dataloader: + print("Shape of X [N, C, H, W]: ", X.shape) + print("Shape of y: ", y.shape, y.dtype) + break + +###################################################################### +# Read more about `loading data in PyTorch `_. +# + +###################################################################### +# -------------- +# + +################################ +# Creating Models +# ------------------ +# To define a neural network in PyTorch, we create a class that inherits +# from `nn.Module `_. We define the layers of the network +# in the ``__init__`` function and specify how data will pass through the network in the ``forward`` function. To accelerate +# operations in the neural network, we move it to the GPU if available. + +# Get cpu or gpu device for training. +device = "cuda" if torch.cuda.is_available() else "cpu" +print("Using {} device".format(device)) + +# Define model +class NeuralNetwork(nn.Module): + def __init__(self): + super(NeuralNetwork, self).__init__() + self.flatten = nn.Flatten() + self.linear_relu_stack = nn.Sequential( + nn.Linear(28*28, 512), + nn.ReLU(), + nn.Linear(512, 512), + nn.ReLU(), + nn.Linear(512, 10), + nn.ReLU() + ) + + def forward(self, x): + x = self.flatten(x) + logits = self.linear_relu_stack(x) + return logits + +model = NeuralNetwork().to(device) +print(model) + +###################################################################### +# Read more about `building neural networks in PyTorch `_. +# + + +###################################################################### +# -------------- +# + + +##################################################################### +# Optimizing the Model Parameters +# ---------------------------------------- +# To train a model, we need a `loss function `_ +# and an `optimizer `_. 
+ +loss_fn = nn.CrossEntropyLoss() +optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) + + +####################################################################### +# In a single training loop, the model makes predictions on the training dataset (fed to it in batches), and +# backpropagates the prediction error to adjust the model's parameters. + +def train(dataloader, model, loss_fn, optimizer): + size = len(dataloader.dataset) + for batch, (X, y) in enumerate(dataloader): + X, y = X.to(device), y.to(device) + + # Compute prediction error + pred = model(X) + loss = loss_fn(pred, y) + + # Backpropagation + optimizer.zero_grad() + loss.backward() + optimizer.step() + + if batch % 100 == 0: + loss, current = loss.item(), batch * len(X) + print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]") + +############################################################################## +# We also check the model's performance against the test dataset to ensure it is learning. + +def test(dataloader, model): + size = len(dataloader.dataset) + model.eval() + test_loss, correct = 0, 0 + with torch.no_grad(): + for X, y in dataloader: + X, y = X.to(device), y.to(device) + pred = model(X) + test_loss += loss_fn(pred, y).item() + correct += (pred.argmax(1) == y).type(torch.float).sum().item() + test_loss /= size + correct /= size + print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n") + +############################################################################## +# The training process is conducted over several iterations (*epochs*). During each epoch, the model learns +# parameters to make better predictions. We print the model's accuracy and loss at each epoch; we'd like to see the +# accuracy increase and the loss decrease with every epoch. + +epochs = 5 +for t in range(epochs): + print(f"Epoch {t+1}\n-------------------------------") + train(train_dataloader, model, loss_fn, optimizer) + test(test_dataloader, model) +print("Done!") + +###################################################################### +# Read more about `Training your model `_. +# + +###################################################################### +# -------------- +# + +###################################################################### +# Saving Models +# ------------- +# A common way to save a model is to serialize the internal state dictionary (containing the model parameters). + +torch.save(model.state_dict(), "model.pth") +print("Saved PyTorch Model State to model.pth") + + + +###################################################################### +# Loading Models +# ---------------------------- +# +# The process for loading a model includes re-creating the model structure and loading +# the state dictionary into it. + +model = NeuralNetwork() +model.load_state_dict(torch.load("model.pth")) + +############################################################# +# This model can now be used to make predictions. + +classes = [ + "T-shirt/top", + "Trouser", + "Pullover", + "Dress", + "Coat", + "Sandal", + "Shirt", + "Sneaker", + "Bag", + "Ankle boot", +] + +model.eval() +x, y = test_data[0][0], test_data[0][1] +with torch.no_grad(): + pred = model(x) + predicted, actual = classes[pred[0].argmax(0)], classes[y] + print(f'Predicted: "{predicted}", Actual: "{actual}"') + + +###################################################################### +# Read more about `Saving & Loading your model `_. 
+# + + diff --git a/beginner_source/basics/saveloadrun_tutorial.py b/beginner_source/basics/saveloadrun_tutorial.py new file mode 100644 index 000000000..4ae64c666 --- /dev/null +++ b/beginner_source/basics/saveloadrun_tutorial.py @@ -0,0 +1,83 @@ +""" +`Learn the Basics `_ || +`Quickstart `_ || +`Tensors `_ || +`Datasets & DataLoaders `_ || +`Transforms `_ || +`Build Model `_ || +`Autograd `_ || +`Optimization `_ || +**Save & Load Model** + +Save and Load the Model +============================ + +In this section we will look at how to persist model state with saving, loading and running model predictions. +""" + +import torch +import torch.onnx as onnx +import torchvision.models as models + + +####################################################################### +# Saving and Loading Model Weights +# -------------------------------- +# PyTorch models store the learned parameters in an internal +# state dictionary, called ``state_dict``. These can be persisted via the ``torch.save`` +# method: + +model = models.vgg16(pretrained=True) +torch.save(model.state_dict(), 'model_weights.pth') + +########################## +# To load model weights, you need to create an instance of the same model first, and then load the parameters +# using ``load_state_dict()`` method. + +model = models.vgg16() # we do not specify pretrained=True, i.e. do not load default weights +model.load_state_dict(torch.load('model_weights.pth')) +model.eval() + +########################### +# .. note:: be sure to call ``model.eval()`` method before inferencing to set the dropout and batch normalization layers to evaluation mode. Failing to do this will yield inconsistent inference results. + +####################################################################### +# Saving and Loading Models with Shapes +# ------------------------------------- +# When loading model weights, we needed to instantiate the model class first, because the class +# defines the structure of a network. We might want to save the structure of this class together with +# the model, in which case we can pass ``model`` (and not ``model.state_dict()``) to the saving function: + +torch.save(model, 'model.pth') + +######################## +# We can then load the model like this: + +model = torch.load('model.pth') + +######################## +# .. note:: This approach uses Python `pickle `_ module when serializing the model, thus it relies on the actual class definition to be available when loading the model. + +####################################################################### +# Exporting Model to ONNX +# ----------------------- +# PyTorch also has native ONNX export support. Given the dynamic nature of the +# PyTorch execution graph, however, the export process must +# traverse the execution graph to produce a persisted ONNX model. For this reason, a +# test variable of the appropriate size should be passed in to the +# export routine (in our case, we will create a dummy zero tensor of the correct size): + +input_image = torch.zeros((1,3,224,224)) +onnx.export(model, input_image, 'model.onnx') + +########################### +# There are a lot of things you can do with ONNX model, including running inference on different platforms +# and in different programming languages. For more details, we recommend +# visiting `ONNX tutorial `_. +# +# Congratulations! You have completed the PyTorch beginner tutorial! Try +# `revisting the first page `_ to see the tutorial in its entirety +# again. 
We hope this tutorial has helped you get started with deep learning on PyTorch. +# Good luck! +# + diff --git a/beginner_source/basics/tensorqs_tutorial.py b/beginner_source/basics/tensorqs_tutorial.py new file mode 100644 index 000000000..de60e4c7c --- /dev/null +++ b/beginner_source/basics/tensorqs_tutorial.py @@ -0,0 +1,226 @@ +""" +`Learn the Basics `_ || +`Quickstart `_ || +**Tensors** || +`Datasets & DataLoaders `_ || +`Transforms `_ || +`Build Model `_ || +`Autograd `_ || +`Optimization `_ || +`Save & Load Model `_ + +Tensors +========================== + +Tensors are a specialized data structure that are very similar to arrays and matrices. +In PyTorch, we use tensors to encode the inputs and outputs of a model, as well as the model’s parameters. + +Tensors are similar to `NumPy’s `_ ndarrays, except that tensors can run on GPUs or other hardware accelerators. In fact, tensors and +NumPy arrays can often share the same underlying memory, eliminating the need to copy data (see :ref:`bridge-to-np-label`). Tensors +are also optimized for automatic differentiation (we'll see more about that later in the `Autograd `__ +section). If you’re familiar with ndarrays, you’ll be right at home with the Tensor API. If not, follow along! +""" + +import torch +import numpy as np + + +###################################################################### +# Initializing a Tensor +# ~~~~~~~~~~~~~~~~~~~~~ +# +# Tensors can be initialized in various ways. Take a look at the following examples: +# +# **Directly from data** +# +# Tensors can be created directly from data. The data type is automatically inferred. + +data = [[1, 2],[3, 4]] +x_data = torch.tensor(data) + +###################################################################### +# **From a NumPy array** +# +# Tensors can be created from NumPy arrays (and vice versa - see :ref:`bridge-to-np-label`). +np_array = np.array(data) +x_np = torch.from_numpy(np_array) + + +############################################################### +# **From another tensor:** +# +# The new tensor retains the properties (shape, datatype) of the argument tensor, unless explicitly overridden. + +x_ones = torch.ones_like(x_data) # retains the properties of x_data +print(f"Ones Tensor: \n {x_ones} \n") + +x_rand = torch.rand_like(x_data, dtype=torch.float) # overrides the datatype of x_data +print(f"Random Tensor: \n {x_rand} \n") + + +###################################################################### +# **With random or constant values:** +# +# ``shape`` is a tuple of tensor dimensions. In the functions below, it determines the dimensionality of the output tensor. + +shape = (2,3,) +rand_tensor = torch.rand(shape) +ones_tensor = torch.ones(shape) +zeros_tensor = torch.zeros(shape) + +print(f"Random Tensor: \n {rand_tensor} \n") +print(f"Ones Tensor: \n {ones_tensor} \n") +print(f"Zeros Tensor: \n {zeros_tensor}") + + + +###################################################################### +# -------------- +# + +###################################################################### +# Attributes of a Tensor +# ~~~~~~~~~~~~~~~~~ +# +# Tensor attributes describe their shape, datatype, and the device on which they are stored. 
+ +tensor = torch.rand(3,4) + +print(f"Shape of tensor: {tensor.shape}") +print(f"Datatype of tensor: {tensor.dtype}") +print(f"Device tensor is stored on: {tensor.device}") + + +###################################################################### +# -------------- +# + +###################################################################### +# Operations on Tensors +# ~~~~~~~~~~~~~~~~~ +# +# Over 100 tensor operations, including arithmetic, linear algebra, matrix manipulation (transposing, +# indexing, slicing), sampling and more are +# comprehensively described `here `__. +# +# Each of these operations can be run on the GPU (at typically higher speeds than on a +# CPU). If you’re using Colab, allocate a GPU by going to Runtime > Change runtime type > GPU. +# +# By default, tensors are created on the CPU. We need to explicitly move tensors to the GPU using +# ``.to`` method (after checking for GPU availability). Keep in mind that copying large tensors +# across devices can be expensive in terms of time and memory! + +# We move our tensor to the GPU if available +if torch.cuda.is_available(): + tensor = tensor.to('cuda') + + +###################################################################### +# Try out some of the operations from the list. +# If you're familiar with the NumPy API, you'll find the Tensor API a breeze to use. +# + +############################################################### +# **Standard numpy-like indexing and slicing:** + +tensor = torch.ones(4, 4) +print('First row: ',tensor[0]) +print('First column: ', tensor[:, 0]) +print('Last column:', tensor[..., -1]) +tensor[:,1] = 0 +print(tensor) + +###################################################################### +# **Joining tensors** You can use ``torch.cat`` to concatenate a sequence of tensors along a given dimension. +# See also `torch.stack `__, +# another tensor joining op that is subtly different from ``torch.cat``. +t1 = torch.cat([tensor, tensor, tensor], dim=1) +print(t1) + + +###################################################################### +# **Arithmetic operations** + +# This computes the matrix multiplication between two tensors. y1, y2, y3 will have the same value +y1 = tensor @ tensor.T +y2 = tensor.matmul(tensor.T) + +y3 = torch.rand_like(tensor) +torch.matmul(tensor, tensor.T, out=y3) + + +# This computes the element-wise product. z1, z2, z3 will have the same value +z1 = tensor * tensor +z2 = tensor.mul(tensor) + +z3 = torch.rand_like(tensor) +torch.mul(tensor, tensor, out=z3) + + +###################################################################### +# **Single-element tensors** If you have a one-element tensor, for example by aggregating all +# values of a tensor into one value, you can convert it to a Python +# numerical value using ``item()``: + +agg = tensor.sum() +agg_item = agg.item() +print(agg_item, type(agg_item)) + + +###################################################################### +# **In-place operations** +# Operations that store the result into the operand are called in-place. They are denoted by a ``_`` suffix. +# For example: ``x.copy_(y)``, ``x.t_()``, will change ``x``. + +print(tensor, "\n") +tensor.add_(5) +print(tensor) + +###################################################################### +# .. note:: +# In-place operations save some memory, but can be problematic when computing derivatives because of an immediate loss +# of history. Hence, their use is discouraged. 
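+######################################################################
+# A minimal illustration of that "loss of history" (an added sketch, not part
+# of the original text): ``exp`` saves its output in order to compute
+# gradients later, so modifying the result in-place makes a subsequent
+# backward pass fail.
+
+x = torch.rand(3, requires_grad=True)
+y = x.exp()    # autograd saves the output of ``exp`` for the backward pass
+y.add_(1)      # in-place update overwrites that saved value
+try:
+    y.sum().backward()
+except RuntimeError as err:
+    print(err)  # autograd reports that a needed variable was modified in-place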
+ + + +###################################################################### +# -------------- +# + + +###################################################################### +# .. _bridge-to-np-label: +# +# Bridge with NumPy +# ~~~~~~~~~~~~~~~~~ +# Tensors on the CPU and NumPy arrays can share their underlying memory +# locations, and changing one will change the other. + + +###################################################################### +# Tensor to NumPy array +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +t = torch.ones(5) +print(f"t: {t}") +n = t.numpy() +print(f"n: {n}") + +###################################################################### +# A change in the tensor reflects in the NumPy array. + +t.add_(1) +print(f"t: {t}") +print(f"n: {n}") + + +###################################################################### +# NumPy array to Tensor +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +n = np.ones(5) +t = torch.from_numpy(n) + +###################################################################### +# Changes in the NumPy array reflects in the tensor. +np.add(n, 1, out=n) +print(f"t: {t}") +print(f"n: {n}") diff --git a/beginner_source/basics/transforms_tutorial.py b/beginner_source/basics/transforms_tutorial.py new file mode 100644 index 000000000..27235a9a6 --- /dev/null +++ b/beginner_source/basics/transforms_tutorial.py @@ -0,0 +1,69 @@ +""" +`Learn the Basics `_ || +`Quickstart `_ || +`Tensors `_ || +`Datasets & DataLoaders `_ || +**Transforms** || +`Build Model `_ || +`Autograd `_ || +`Optimization `_ || +`Save & Load Model `_ + +Transforms +=================== + +Data does not always come in its final processed form that is required for +training machine learning algorithms. We use **transforms** to perform some +manipulation of the data and make it suitable for training. + +All TorchVision datasets have two parameters -``transform`` to modify the features and +``target_transform`` to modify the labels - that accept callables containing the transformation logic. +The `torchvision.transforms `_ module offers +several commonly-used transforms out of the box. + +The FashionMNIST features are in PIL Image format, and the labels are integers. +For training, we need the features as normalized tensors, and the labels as one-hot encoded tensors. +To make these transformations, we use ``ToTensor`` and ``Lambda``. +""" + +from torchvision import datasets +from torchvision.transforms import ToTensor, Lambda + +ds = datasets.FashionMNIST( + root="data", + train=True, + download=True, + transform=ToTensor(), + target_transform=Lambda(lambda y: torch.zeros(10, dtype=torch.float).scatter_(0, torch.tensor(y), value=1)) +) + +################################################# +# ToTensor() +# ------------------------------- +# +# `ToTensor `_ +# converts a PIL image or NumPy ``ndarray`` into a ``FloatTensor``. and scales +# the image's pixel intensity values in the range [0., 1.] +# + +############################################## +# Lambda Transforms +# ------------------------------- +# +# Lambda transforms apply any user-defined lambda function. Here, we define a function +# to turn the integer into a one-hot encoded tensor. +# It first creates a zero tensor of size 10 (the number of labels in our dataset) and calls +# `scatter_ `_ which assigns a +# ``value=1`` on the index as given by the label ``y``. 
+ +target_transform = Lambda(lambda y: torch.zeros( + 10, dtype=torch.float).scatter_(dim=0, index=torch.tensor(y), value=1)) + +###################################################################### +# -------------- +# + +################################################################# +# Further Reading +# ~~~~~~~~~~~~~~~~~ +# - `torchvision.transforms API `_ diff --git a/beginner_source/blitz/autograd_tutorial.py b/beginner_source/blitz/autograd_tutorial.py index 28d88725c..2229fcdd9 100644 --- a/beginner_source/blitz/autograd_tutorial.py +++ b/beginner_source/blitz/autograd_tutorial.py @@ -1,187 +1,320 @@ # -*- coding: utf-8 -*- """ -Autograd: 자동 미분 -=================================== +A Gentle Introduction to ``torch.autograd`` +--------------------------------- -PyTorch의 모든 신경망의 중심에는 ``autograd`` 패키지가 있습니다. -먼저 이것을 가볍게 살펴본 뒤, 첫번째 신경망을 학습시켜보겠습니다. +``torch.autograd`` is PyTorch’s automatic differentiation engine that powers +neural network training. In this section, you will get a conceptual +understanding of how autograd helps a neural network train. -``autograd`` 패키지는 Tensor의 모든 연산에 대해 자동 미분을 제공합니다. -이는 실행-기반-정의(define-by-run) 프레임워크로, 이는 코드를 어떻게 작성하여 -실행하느냐에 따라 역전파가 정의된다는 뜻이며, 역전파는 학습 과정의 매 단계마다 -달라집니다. +Background +~~~~~~~~~~ +Neural networks (NNs) are a collection of nested functions that are +executed on some input data. These functions are defined by *parameters* +(consisting of weights and biases), which in PyTorch are stored in +tensors. -더 간단한 용어로 몇 가지 예를 살펴보겠습니다. +Training a NN happens in two steps: -Tensor --------- +**Forward Propagation**: In forward prop, the NN makes its best guess +about the correct output. It runs the input data through each of its +functions to make this guess. -패키지의 중심에는 ``torch.Tensor`` 클래스가 있습니다. 만약 ``.requires_grad`` -속성을 ``True`` 로 설정하면, 그 tensor에서 이뤄진 모든 연산들을 추적(track)하기 -시작합니다. 계산이 완료된 후 ``.backward()`` 를 호출하여 모든 변화도(gradient)를 -자동으로 계산할 수 있습니다. 이 Tensor의 변화도는 ``.grad`` 속성에 누적됩니다. +**Backward Propagation**: In backprop, the NN adjusts its parameters +proportionate to the error in its guess. It does this by traversing +backwards from the output, collecting the derivatives of the error with +respect to the parameters of the functions (*gradients*), and optimizing +the parameters using gradient descent. For a more detailed walkthrough +of backprop, check out this `video from +3Blue1Brown `__. -Tensor가 기록을 추적하는 것을 중단하게 하려면, ``.detach()`` 를 호출하여 연산 -기록으로부터 분리(detach)하여 이후 연산들이 추적되는 것을 방지할 수 있습니다. -기록을 추적하는 것(과 메모리를 사용하는 것)을 방지하기 위해, 코드 블럭을 -``with torch.no_grad():`` 로 감쌀 수 있습니다. 이는 특히 변화도(gradient)는 -필요없지만, `requires_grad=True` 가 설정되어 학습 가능한 매개변수를 갖는 모델을 -평가(evaluate)할 때 유용합니다. -Autograd 구현에서 매우 중요한 클래스가 하나 더 있는데, 이것은 바로 ``Function`` -클래스입니다. -``Tensor`` 와 ``Function`` 은 서로 연결되어 있으며, 모든 연산 과정을 -부호화(encode)하여 순환하지 않는 그래프(acyclic graph)를 생성합니다. 각 tensor는 -``.grad_fn`` 속성을 갖고 있는데, 이는 ``Tensor`` 를 생성한 ``Function`` 을 -참조하고 있습니다. (단, 사용자가 만든 Tensor는 예외로, 이 때 ``grad_fn`` 은 -``None`` 입니다.) - -도함수를 계산하기 위해서는 ``Tensor`` 의 ``.backward()`` 를 호출하면 -됩니다. 만약 ``Tensor`` 가 스칼라(scalar)인 경우(예. 하나의 요소 값만 갖는 등)에는 -``backward`` 에 인자를 정해줄 필요가 없습니다. 하지만 여러 개의 요소를 갖고 있을 -때는 tensor의 모양을 ``gradient`` 의 인자로 지정할 필요가 있습니다. +Usage in PyTorch +~~~~~~~~~~~ +Let's take a look at a single training step. +For this example, we load a pretrained resnet18 model from ``torchvision``. +We create a random data tensor to represent a single image with 3 channels, and height & width of 64, +and its corresponding ``label`` initialized to some random values. 
""" +import torch, torchvision +model = torchvision.models.resnet18(pretrained=True) +data = torch.rand(1, 3, 64, 64) +labels = torch.rand(1, 1000) + +############################################################ +# Next, we run the input data through the model through each of its layers to make a prediction. +# This is the **forward pass**. +# + +prediction = model(data) # forward pass + +############################################################ +# We use the model's prediction and the corresponding label to calculate the error (``loss``). +# The next step is to backpropagate this error through the network. +# Backward propagation is kicked off when we call ``.backward()`` on the error tensor. +# Autograd then calculates and stores the gradients for each model parameter in the parameter's ``.grad`` attribute. +# + +loss = (prediction - labels).sum() +loss.backward() # backward pass + +############################################################ +# Next, we load an optimizer, in this case SGD with a learning rate of 0.01 and momentum of 0.9. +# We register all the parameters of the model in the optimizer. +# + +optim = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9) + +###################################################################### +# Finally, we call ``.step()`` to initiate gradient descent. The optimizer adjusts each parameter by its gradient stored in ``.grad``. +# + +optim.step() #gradient descent + +###################################################################### +# At this point, you have everything you need to train your neural network. +# The below sections detail the workings of autograd - feel free to skip them. +# + + +###################################################################### +# -------------- +# + + +###################################################################### +# Differentiation in Autograd +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Let's take a look at how ``autograd`` collects gradients. We create two tensors ``a`` and ``b`` with +# ``requires_grad=True``. This signals to ``autograd`` that every operation on them should be tracked. +# import torch -############################################################### -# tensor를 생성하고 ``requires_grad=True`` 를 설정하여 연산을 기록합니다. -x = torch.ones(2, 2, requires_grad=True) -print(x) - -############################################################### -# tensor에 연산을 수행합니다: -y = x + 2 -print(y) - -############################################################### -# ``y`` 는 연산의 결과로 생성된 것이므로 ``grad_fn`` 을 갖습니다. -print(y.grad_fn) - -############################################################### -# ``y`` 에 다른 연산을 수행합니다. -z = y * y * 3 -out = z.mean() - -print(z, out) - -################################################################ -# ``.requires_grad_( ... )`` 는 기존 Tensor의 ``requires_grad`` 값을 바꿔치기 -# (in-place)하여 변경합니다. 입력값이 지정되지 않으면 기본값은 ``False`` 입니다. -a = torch.randn(2, 2) -a = ((a * 3) / (a - 1)) -print(a.requires_grad) -a.requires_grad_(True) -print(a.requires_grad) -b = (a * a).sum() -print(b.grad_fn) - -############################################################### -# 변화도(Gradient) -# ----------------- -# 이제 역전파(backprop)를 해보겠습니다. -# ``out`` 은 하나의 스칼라 값만 갖고 있기 때문에, ``out.backward()`` 는 -# ``out.backward(torch.tensor(1.))`` 과 동일합니다. - -out.backward() - -############################################################### -# 변화도 d(out)/dx를 출력합니다. -# - -print(x.grad) - -############################################################### -# ``4.5`` 로 이루어진 행렬을 확인할 수 있습니다. 
``out`` 을 *Tensor* “:math:`o`” -# 라고 하면, 다음과 같이 구할 수 있습니다. -# :math:`o = \frac{1}{4}\sum_i z_i` 이고, -# :math:`z_i = 3(x_i+2)^2` 이므로 :math:`z_i\bigr\rvert_{x_i=1} = 27` 입니다. -# 따라서, -# :math:`\frac{\partial o}{\partial x_i} = \frac{3}{2}(x_i+2)` 이므로, -# :math:`\frac{\partial o}{\partial x_i}\bigr\rvert_{x_i=1} = \frac{9}{2} = 4.5` 입니다. - -############################################################### -# 수학적으로 벡터 함수 :math:`\vec{y}=f(\vec{x})` 에서 :math:`\vec{x}` 에 -# 대한 :math:`\vec{y}` 의 변화도는 야코비안 행렬(Jacobian Matrix)입니다: +a = torch.tensor([2., 3.], requires_grad=True) +b = torch.tensor([6., 4.], requires_grad=True) + +###################################################################### +# We create another tensor ``Q`` from ``a`` and ``b``. +# +# .. math:: +# Q = 3a^3 - b^2 + +Q = 3*a**3 - b**2 + + +###################################################################### +# Let's assume ``a`` and ``b`` to be parameters of an NN, and ``Q`` +# to be the error. In NN training, we want gradients of the error +# w.r.t. parameters, i.e. # # .. math:: -# J=\left(\begin{array}{ccc} -# \frac{\partial y_{1}}{\partial x_{1}} & \cdots & \frac{\partial y_{1}}{\partial x_{n}}\\ -# \vdots & \ddots & \vdots\\ -# \frac{\partial y_{m}}{\partial x_{1}} & \cdots & \frac{\partial y_{m}}{\partial x_{n}} -# \end{array}\right) -# -# 일반적으로, ``torch.autograd`` 는 벡터-야코비안 곱을 계산하는 엔진입니다. 즉, -# 어떤 벡터 :math:`v=\left(\begin{array}{cccc} v_{1} & v_{2} & \cdots & v_{m}\end{array}\right)^{T}` -# 에 대해 :math:`v^{T}\cdot J` 을 연산합니다. 만약 :math:`v` 가 스칼라 함수 -# :math:`l=g\left(\vec{y}\right)` 의 기울기인 경우, -# :math:`v=\left(\begin{array}{ccc}\frac{\partial l}{\partial y_{1}} & \cdots & \frac{\partial l}{\partial y_{m}}\end{array}\right)^{T}` -# 이며, 연쇄법칙(chain rule)에 따라 벡터-야코비안 곱은 :math:`\vec{x}` 에 대한 -# :math:`l` 의 기울기가 됩니다: +# \frac{\partial Q}{\partial a} = 9a^2 +# +# .. math:: +# \frac{\partial Q}{\partial b} = -2b +# +# +# When we call ``.backward()`` on ``Q``, autograd calculates these gradients +# and stores them in the respective tensors' ``.grad`` attribute. +# +# We need to explicitly pass a ``gradient`` argument in ``Q.backward()`` because it is a vector. +# ``gradient`` is a tensor of the same shape as ``Q``, and it represents the +# gradient of Q w.r.t. itself, i.e. # # .. math:: -# J^{T}\cdot v=\left(\begin{array}{ccc} -# \frac{\partial y_{1}}{\partial x_{1}} & \cdots & \frac{\partial y_{m}}{\partial x_{1}}\\ -# \vdots & \ddots & \vdots\\ -# \frac{\partial y_{1}}{\partial x_{n}} & \cdots & \frac{\partial y_{m}}{\partial x_{n}} -# \end{array}\right)\left(\begin{array}{c} -# \frac{\partial l}{\partial y_{1}}\\ -# \vdots\\ -# \frac{\partial l}{\partial y_{m}} -# \end{array}\right)=\left(\begin{array}{c} -# \frac{\partial l}{\partial x_{1}}\\ -# \vdots\\ -# \frac{\partial l}{\partial x_{n}} -# \end{array}\right) -# -# (여기서 :math:`v^{T}\cdot J` 은 :math:`J^{T}\cdot v` 를 취했을 때의 열 벡터로 -# 취급할 수 있는 행 벡터를 갖습니다.) -# -# 벡터-야코비안 곱의 이러한 특성은 스칼라가 아닌 출력을 갖는 모델에 외부 변화도를 -# 제공(feed)하는 것을 매우 편리하게 해줍니다. - -############################################################### -# 이제 벡터-야코비안 곱의 예제를 살펴보도록 하겠습니다: - -x = torch.randn(3, requires_grad=True) - -y = x * 2 -while y.data.norm() < 1000: - y = y * 2 - -print(y) - -############################################################### -# 이 경우 ``y`` 는 더 이상 스칼라 값이 아닙니다. 
``torch.autograd`` 는 -# 전체 야코비안을 직접 계산할수는 없지만, 벡터-야코비안 곱은 간단히 -# ``backward`` 에 해당 벡터를 인자로 제공하여 얻을 수 있습니다: -v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float) -y.backward(v) - -print(x.grad) - -############################################################### -# 또한 ``with torch.no_grad():`` 로 코드 블럭을 감싸서 autograd가 -# ``.requires_grad=True`` 인 Tensor들의 연산 기록을 추적하는 것을 멈출 수 있습니다. -print(x.requires_grad) -print((x ** 2).requires_grad) - -with torch.no_grad(): - print((x ** 2).requires_grad) - -############################################################### -# 또는 ``.detach()`` 를 호출하여 내용물(content)은 같지만 require_grad가 다른 -# 새로운 Tensor를 가져옵니다: -print(x.requires_grad) -y = x.detach() -print(y.requires_grad) -print(x.eq(y).all()) - - -############################################################### -# **더 읽을거리:** -# -# ``autograd.Function`` 관련 문서는 https://pytorch.org/docs/stable/autograd.html#function -# 에서 찾아볼 수 있습니다. +# \frac{dQ}{dQ} = 1 +# +# Equivalently, we can also aggregate Q into a scalar and call backward implicitly, like ``Q.sum().backward()``. +# +external_grad = torch.tensor([1., 1.]) +Q.backward(gradient=external_grad) + + +####################################################################### +# Gradients are now deposited in ``a.grad`` and ``b.grad`` + +# check if collected gradients are correct +print(9*a**2 == a.grad) +print(-2*b == b.grad) + + +###################################################################### +# Optional Reading - Vector Calculus using ``autograd`` +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# Mathematically, if you have a vector valued function +# :math:`\vec{y}=f(\vec{x})`, then the gradient of :math:`\vec{y}` with +# respect to :math:`\vec{x}` is a Jacobian matrix :math:`J`: +# +# .. math:: +# +# +# J +# = +# \left(\begin{array}{cc} +# \frac{\partial \bf{y}}{\partial x_{1}} & +# ... & +# \frac{\partial \bf{y}}{\partial x_{n}} +# \end{array}\right) +# = +# \left(\begin{array}{ccc} +# \frac{\partial y_{1}}{\partial x_{1}} & \cdots & \frac{\partial y_{1}}{\partial x_{n}}\\ +# \vdots & \ddots & \vdots\\ +# \frac{\partial y_{m}}{\partial x_{1}} & \cdots & \frac{\partial y_{m}}{\partial x_{n}} +# \end{array}\right) +# +# Generally speaking, ``torch.autograd`` is an engine for computing +# vector-Jacobian product. That is, given any vector :math:`\vec{v}`, compute the product +# :math:`J^{T}\cdot \vec{v}` +# +# If :math:`\vec{v}` happens to be the gradient of a scalar function :math:`l=g\left(\vec{y}\right)`: +# +# .. math:: +# +# +# \vec{v} +# = +# \left(\begin{array}{ccc}\frac{\partial l}{\partial y_{1}} & \cdots & \frac{\partial l}{\partial y_{m}}\end{array}\right)^{T} +# +# then by the chain rule, the vector-Jacobian product would be the +# gradient of :math:`l` with respect to :math:`\vec{x}`: +# +# .. math:: +# +# +# J^{T}\cdot \vec{v}=\left(\begin{array}{ccc} +# \frac{\partial y_{1}}{\partial x_{1}} & \cdots & \frac{\partial y_{m}}{\partial x_{1}}\\ +# \vdots & \ddots & \vdots\\ +# \frac{\partial y_{1}}{\partial x_{n}} & \cdots & \frac{\partial y_{m}}{\partial x_{n}} +# \end{array}\right)\left(\begin{array}{c} +# \frac{\partial l}{\partial y_{1}}\\ +# \vdots\\ +# \frac{\partial l}{\partial y_{m}} +# \end{array}\right)=\left(\begin{array}{c} +# \frac{\partial l}{\partial x_{1}}\\ +# \vdots\\ +# \frac{\partial l}{\partial x_{n}} +# \end{array}\right) +# +# This characteristic of vector-Jacobian product is what we use in the above example; +# ``external_grad`` represents :math:`\vec{v}`. 
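######################################################################
# As a small, optional sketch of the vector-Jacobian product described above
# (``inp`` and ``v`` below are our own illustrative names, not part of the
# tutorial's running example; ``torch`` is already imported above): here
# :math:`y_i = x_i^3`, so the Jacobian is diagonal with entries :math:`3x_i^2`,
# and passing ``v`` to ``backward`` should give exactly :math:`J^{T}\cdot \vec{v}`.

inp = torch.tensor([1., 2., 3.], requires_grad=True)
out = inp ** 3                      # non-scalar output, y_i = x_i^3
v = torch.tensor([1., 0.5, 0.25])   # the vector v in the product J^T . v
out.backward(gradient=v)
print(inp.grad)                     # tensor([3.0000, 6.0000, 6.7500]) == 3 * inp**2 * v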
+# + + + +###################################################################### +# Computational Graph +# ~~~~~~~~~~~~~~~~~~~ +# +# Conceptually, autograd keeps a record of data (tensors) & all executed +# operations (along with the resulting new tensors) in a directed acyclic +# graph (DAG) consisting of +# `Function `__ +# objects. In this DAG, leaves are the input tensors, roots are the output +# tensors. By tracing this graph from roots to leaves, you can +# automatically compute the gradients using the chain rule. +# +# In a forward pass, autograd does two things simultaneously: +# +# - run the requested operation to compute a resulting tensor, and +# - maintain the operation’s *gradient function* in the DAG. +# +# The backward pass kicks off when ``.backward()`` is called on the DAG +# root. ``autograd`` then: +# +# - computes the gradients from each ``.grad_fn``, +# - accumulates them in the respective tensor’s ``.grad`` attribute, and +# - using the chain rule, propagates all the way to the leaf tensors. +# +# Below is a visual representation of the DAG in our example. In the graph, +# the arrows are in the direction of the forward pass. The nodes represent the backward functions +# of each operation in the forward pass. The leaf nodes in blue represent our leaf tensors ``a`` and ``b``. +# +# .. figure:: /_static/img/dag_autograd.png +# +# .. note:: +# **DAGs are dynamic in PyTorch** +# An important thing to note is that the graph is recreated from scratch; after each +# ``.backward()`` call, autograd starts populating a new graph. This is +# exactly what allows you to use control flow statements in your model; +# you can change the shape, size and operations at every iteration if +# needed. +# +# Exclusion from the DAG +# ^^^^^^^^^^^^^^^^^^^^^^ +# +# ``torch.autograd`` tracks operations on all tensors which have their +# ``requires_grad`` flag set to ``True``. For tensors that don’t require +# gradients, setting this attribute to ``False`` excludes it from the +# gradient computation DAG. +# +# The output tensor of an operation will require gradients even if only a +# single input tensor has ``requires_grad=True``. +# + +x = torch.rand(5, 5) +y = torch.rand(5, 5) +z = torch.rand((5, 5), requires_grad=True) + +a = x + y +print(f"Does `a` require gradients? : {a.requires_grad}") +b = x + z +print(f"Does `b` require gradients?: {b.requires_grad}") + + +###################################################################### +# In a NN, parameters that don't compute gradients are usually called **frozen parameters**. +# It is useful to "freeze" part of your model if you know in advance that you won't need the gradients of those parameters +# (this offers some performance benefits by reducing autograd computations). +# +# Another common usecase where exclusion from the DAG is important is for +# `finetuning a pretrained network `__ +# +# In finetuning, we freeze most of the model and typically only modify the classifier layers to make predictions on new labels. +# Let's walk through a small example to demonstrate this. As before, we load a pretrained resnet18 model, and freeze all the parameters. + +from torch import nn, optim + +model = torchvision.models.resnet18(pretrained=True) + +# Freeze all the parameters in the network +for param in model.parameters(): + param.requires_grad = False + +###################################################################### +# Let's say we want to finetune the model on a new dataset with 10 labels. 
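######################################################################
# (A quick, optional sanity check of our own, before we touch the classifier:
# after the loop above, no parameter should require gradients yet.)

print(all(not p.requires_grad for p in model.parameters()))  # True: every parameter is frozen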
+# In resnet, the classifier is the last linear layer ``model.fc``. +# We can simply replace it with a new linear layer (unfrozen by default) +# that acts as our classifier. + +model.fc = nn.Linear(512, 10) + +###################################################################### +# Now all parameters in the model, except the parameters of ``model.fc``, are frozen. +# The only parameters that compute gradients are the weights and bias of ``model.fc``. + +# Optimize only the classifier +optimizer = optim.SGD(model.parameters(), lr=1e-2, momentum=0.9) + +########################################################################## +# Notice although we register all the parameters in the optimizer, +# the only parameters that are computing gradients (and hence updated in gradient descent) +# are the weights and bias of the classifier. +# +# The same exclusionary functionality is available as a context manager in +# `torch.no_grad() `__ +# + +###################################################################### +# -------------- +# + +###################################################################### +# Further readings: +# ~~~~~~~~~~~~~~~~~~~ +# +# - `In-place operations & Multithreaded Autograd `__ +# - `Example implementation of reverse-mode autodiff `__ diff --git a/beginner_source/blitz/cifar10_tutorial.py b/beginner_source/blitz/cifar10_tutorial.py index 7298c4c5d..2161fb4d2 100644 --- a/beginner_source/blitz/cifar10_tutorial.py +++ b/beginner_source/blitz/cifar10_tutorial.py @@ -51,7 +51,7 @@ 4. 학습용 데이터를 사용하여 신경망을 학습합니다. 5. 시험용 데이터를 사용하여 신경망을 검사합니다. -1. CIFAR10를 불러오고 정규화하기 +1. CIFAR10을 불러오고 정규화하기 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ``torchvision`` 을 사용하여 매우 쉽게 CIFAR10을 불러올 수 있습니다. @@ -72,14 +72,16 @@ [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) +batch_size = 4 + trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform) -trainloader = torch.utils.data.DataLoader(trainset, batch_size=4, +trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2) testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform) -testloader = torch.utils.data.DataLoader(testset, batch_size=4, +testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2) classes = ('plane', 'car', 'bird', 'cat', @@ -107,7 +109,7 @@ def imshow(img): # 이미지 보여주기 imshow(torchvision.utils.make_grid(images)) # 정답(label) 출력 -print(' '.join('%5s' % classes[labels[j]] for j in range(4))) +print(' '.join('%5s' % classes[labels[j]] for j in range(batch_size))) ######################################################################## @@ -122,7 +124,7 @@ def imshow(img): class Net(nn.Module): def __init__(self): - super(Net, self).__init__() + super().__init__() self.conv1 = nn.Conv2d(3, 6, 5) self.pool = nn.MaxPool2d(2, 2) self.conv2 = nn.Conv2d(6, 16, 5) diff --git a/beginner_source/blitz/tensor_tutorial.py b/beginner_source/blitz/tensor_tutorial.py index 1824ee6c4..a949f205d 100644 --- a/beginner_source/blitz/tensor_tutorial.py +++ b/beginner_source/blitz/tensor_tutorial.py @@ -1,185 +1,200 @@ -# -*- coding: utf-8 -*- """ -PyTorch가 무엇인가요? 
-======================= - -Python 기반의 과학 연산 패키지로 다음과 같은 두 집단을 대상으로 합니다: - -- NumPy를 대체하면서 GPU를 이용한 연산이 필요한 경우 -- 최대한의 유연성과 속도를 제공하는 딥러닝 연구 플랫폼이 필요한 경우 +Tensors +-------------------------------------------- -시작하기 ------------ +Tensors are a specialized data structure that are very similar to arrays +and matrices. In PyTorch, we use tensors to encode the inputs and +outputs of a model, as well as the model’s parameters. -Tensors -^^^^^^^ +Tensors are similar to NumPy’s ndarrays, except that tensors can run on +GPUs or other specialized hardware to accelerate computing. If you’re familiar with ndarrays, you’ll +be right at home with the Tensor API. If not, follow along in this quick +API walkthrough. -Tensor는 NumPy의 ndarray와 유사하며, GPU를 사용한 연산 가속도 가능합니다. """ -from __future__ import print_function import torch +import numpy as np -############################################################### -# .. note:: -# 초기화되지 않은 행렬이 선언되었지만, 사용하기 전에는 명확히 알려진 값을 -# 포함하고 있지는 않습니다. 초기화되지 않은 행렬이 생성되면 그 시점에 할당된 -# 메모리에 존재하던 값들이 초기값으로 나타납니다. -############################################################### -# 초기화되지 않은 5x3 행렬을 생성합니다: +###################################################################### +# Tensor Initialization +# ~~~~~~~~~~~~~~~~~~~~~ +# +# Tensors can be initialized in various ways. Take a look at the following examples: +# +# **Directly from data** +# +# Tensors can be created directly from data. The data type is automatically inferred. -x = torch.empty(5, 3) -print(x) +data = [[1, 2],[3, 4]] +x_data = torch.tensor(data) -############################################################### -# 무작위로 초기화된 행렬을 생성합니다: +###################################################################### +# **From a NumPy array** +# +# Tensors can be created from NumPy arrays (and vice versa - see :ref:`bridge-to-np-label`). +np_array = np.array(data) +x_np = torch.from_numpy(np_array) -x = torch.rand(5, 3) -print(x) ############################################################### -# dtype이 long이고 0으로 채워진 행렬을 생성합니다: +# **From another tensor:** +# +# The new tensor retains the properties (shape, datatype) of the argument tensor, unless explicitly overridden. -x = torch.zeros(5, 3, dtype=torch.long) -print(x) +x_ones = torch.ones_like(x_data) # retains the properties of x_data +print(f"Ones Tensor: \n {x_ones} \n") -############################################################### -# 데이터로부터 tensor를 직접 생성합니다: +x_rand = torch.rand_like(x_data, dtype=torch.float) # overrides the datatype of x_data +print(f"Random Tensor: \n {x_rand} \n") -x = torch.tensor([5.5, 3]) -print(x) -############################################################### -# 또는 기존 tensor를 바탕으로 새로운 tensor를 만듭니다. 이들 메소드(method)는 -# 사용자로부터 새로운 값을 제공받지 않은 한, 입력 tensor의 속성들(예. dtype)을 -# 재사용합니다. +###################################################################### +# **With random or constant values:** +# +# ``shape`` is a tuple of tensor dimensions. In the functions below, it determines the dimensionality of the output tensor. -x = x.new_ones(5, 3, dtype=torch.double) # new_* 메소드는 크기를 받습니다 -print(x) +shape = (2,3,) +rand_tensor = torch.rand(shape) +ones_tensor = torch.ones(shape) +zeros_tensor = torch.zeros(shape) -x = torch.randn_like(x, dtype=torch.float) # dtype을 오버라이드(Override) 합니다! 
-print(x) # 결과는 동일한 크기를 갖습니다 +print(f"Random Tensor: \n {rand_tensor} \n") +print(f"Ones Tensor: \n {ones_tensor} \n") +print(f"Zeros Tensor: \n {zeros_tensor}") -############################################################### -# 행렬의 크기를 구합니다: -print(x.size()) -############################################################### -# .. note:: -# ``torch.Size`` 는 튜플(tuple) 타입으로, 모든 튜플 연산을 지원합니다. -# -# 연산(Operations) -# ^^^^^^^^^^^^^^^^ -# 연산을 위한 여러가지 문법을 제공합니다. 다음 예제들을 통해 덧셈 연산을 살펴보겠습니다. + +###################################################################### +# -------------- # -# 덧셈: 문법1 -y = torch.rand(5, 3) -print(x + y) -############################################################### -# 덧셈: 문법2 -print(torch.add(x, y)) +###################################################################### +# Tensor Attributes +# ~~~~~~~~~~~~~~~~~ +# +# Tensor attributes describe their shape, datatype, and the device on which they are stored. -############################################################### -# 덧셈: 결과 tensor를 인자로 제공 -result = torch.empty(5, 3) -torch.add(x, y, out=result) -print(result) +tensor = torch.rand(3,4) -############################################################### -# 덧셈: 바꿔치기(in-place) 방식 +print(f"Shape of tensor: {tensor.shape}") +print(f"Datatype of tensor: {tensor.dtype}") +print(f"Device tensor is stored on: {tensor.device}") -# y에 x 더하기 -y.add_(x) -print(y) -############################################################### -# .. note:: -# 바꿔치기(in-place) 방식으로 tensor의 값을 변경하는 연산 뒤에는 ``_``가 붙습니다. -# 예: ``x.copy_(y)``, ``x.t_()`` 는 ``x`` 를 변경합니다. +###################################################################### +# -------------- # -# NumPy스러운 인덱싱 표기 방법을 사용하실 수도 있습니다! - -print(x[:, 1]) - -############################################################### -# 크기 변경: tensor의 크기(size)나 모양(shape)을 변경하고 싶다면 ``torch.view`` 를 사용합니다: -x = torch.randn(4, 4) -y = x.view(16) -z = x.view(-1, 8) # -1은 다른 차원에서 유추합니다. -print(x.size(), y.size(), z.size()) -############################################################### -# 만약 tensor에 하나의 값만 존재한다면, ``.item()`` 을 사용하면 숫자 값을 얻을 수 있습니다. -x = torch.randn(1) -print(x) -print(x.item()) -############################################################### -# **더 읽을거리:** -# +###################################################################### +# Tensor Operations +# ~~~~~~~~~~~~~~~~~ # -# 전치(transposing), 인덱싱(indexing), 슬라이싱(slicing), 수학 계산, -# 선형 대수, 난수(random number) 등, 100가지 이상의 Tensor 연산은 -# `여기 `_ 에서 확인하실 수 있습니다. +# Over 100 tensor operations, including transposing, indexing, slicing, +# mathematical operations, linear algebra, random sampling, and more are +# comprehensively described +# `here `__. # -# NumPy 변환(Bridge) -# ------------------- +# Each of them can be run on the GPU (at typically higher speeds than on a +# CPU). If you’re using Colab, allocate a GPU by going to Edit > Notebook +# Settings. # -# Torch Tensor를 NumPy 배열(array)로 변환하거나, 그 반대로 하는 것은 매우 쉽습니다. + +# We move our tensor to the GPU if available +if torch.cuda.is_available(): + tensor = tensor.to('cuda') + + +###################################################################### +# Try out some of the operations from the list. +# If you're familiar with the NumPy API, you'll find the Tensor API a breeze to use. # -# (Torch Tensor가 CPU 상에 있다면) Torch Tensor와 NumPy 배열은 메모리 공간을 -# 공유하기 때문에, 하나를 변경하면 다른 하나도 변경됩니다. 
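######################################################################
# **Single-element tensors** (a small optional aside, not part of the operations
# demonstrated below): if you reduce a tensor to one element, for example by
# summing all of its values, ``.item()`` converts it to a plain Python number.

agg = tensor.sum()
print(agg.item(), type(agg.item()))  # a float such as 5.7314, <class 'float'>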
+ +############################################################### +# **Standard numpy-like indexing and slicing:** + +tensor = torch.ones(4, 4) +tensor[:,1] = 0 +print(tensor) + +###################################################################### +# **Joining tensors** You can use ``torch.cat`` to concatenate a sequence of tensors along a given dimension. +# See also `torch.stack `__, +# another tensor joining op that is subtly different from ``torch.cat``. +t1 = torch.cat([tensor, tensor, tensor], dim=1) +print(t1) + +###################################################################### +# **Multiplying tensors** + +# This computes the element-wise product +print(f"tensor.mul(tensor) \n {tensor.mul(tensor)} \n") +# Alternative syntax: +print(f"tensor * tensor \n {tensor * tensor}") + +###################################################################### # -# Torch Tensor를 NumPy 배열로 변환하기 -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# This computes the matrix multiplication between two tensors +print(f"tensor.matmul(tensor.T) \n {tensor.matmul(tensor.T)} \n") +# Alternative syntax: +print(f"tensor @ tensor.T \n {tensor @ tensor.T}") -a = torch.ones(5) -print(a) -############################################################### +###################################################################### +# **In-place operations** +# Operations that have a ``_`` suffix are in-place. For example: ``x.copy_(y)``, ``x.t_()``, will change ``x``. + +print(tensor, "\n") +tensor.add_(5) +print(tensor) + +###################################################################### +# .. note:: +# In-place operations save some memory, but can be problematic when computing derivatives because of an immediate loss +# of history. Hence, their use is discouraged. + +###################################################################### +# -------------- # -b = a.numpy() -print(b) -############################################################### -# NumPy 배열의 값이 어떻게 변하는지 확인해보세요. +###################################################################### +# .. _bridge-to-np-label: +# +# Bridge with NumPy +# ~~~~~~~~~~~~~~~~~ +# Tensors on the CPU and NumPy arrays can share their underlying memory +# locations, and changing one will change the other. -a.add_(1) -print(a) -print(b) -############################################################### -# NumPy 배열을 Torch Tensor로 변환하기 +###################################################################### +# Tensor to NumPy array # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# np (NumPy) 배열을 변경하면 Torch Tensor의 값도 자동 변경되는 것을 확인해보세요. +t = torch.ones(5) +print(f"t: {t}") +n = t.numpy() +print(f"n: {n}") -import numpy as np -a = np.ones(5) -b = torch.from_numpy(a) -np.add(a, 1, out=a) -print(a) -print(b) +###################################################################### +# A change in the tensor reflects in the NumPy array. -############################################################### -# CharTensor를 제외한 CPU 상의 모든 Tensor는 NumPy로 변환할 수 있고, -# (NumPy에서 Tensor로의) 반대 변환도 가능합니다. -# -# CUDA Tensors -# ------------ -# -# ``.to`` 메소드를 사용하여 Tensor를 어떠한 장치로도 옮길 수 있습니다. +t.add_(1) +print(f"t: {t}") +print(f"n: {n}") -# 이 코드는 CUDA가 사용 가능한 환경에서만 실행합니다. -# ``torch.device`` 를 사용하여 tensor를 GPU 안팎으로 이동해보겠습니다. -if torch.cuda.is_available(): - device = torch.device("cuda") # CUDA 장치 객체(device object)로 - y = torch.ones_like(x, device=device) # GPU 상에 직접적으로 tensor를 생성하거나 - x = x.to(device) # ``.to("cuda")`` 를 사용하면 됩니다. 
- z = x + y - print(z) - print(z.to("cpu", torch.double)) # ``.to`` 는 dtype도 함께 변경합니다! + +###################################################################### +# NumPy array to Tensor +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +n = np.ones(5) +t = torch.from_numpy(n) + +###################################################################### +# Changes in the NumPy array reflects in the tensor. +np.add(n, 1, out=n) +print(f"t: {t}") +print(f"n: {n}") diff --git a/beginner_source/chatbot_tutorial.py b/beginner_source/chatbot_tutorial.py index bcce89918..0012d8a6b 100644 --- a/beginner_source/chatbot_tutorial.py +++ b/beginner_source/chatbot_tutorial.py @@ -962,9 +962,10 @@ def train(input_variable, lengths, target_variable, mask, max_target_len, encode # device 옵션을 설정합니다 input_variable = input_variable.to(device) - lengths = lengths.to(device) target_variable = target_variable.to(device) mask = mask.to(device) + # Lengths for rnn packing should always be on the cpu + lengths = lengths.to("cpu") # 변수를 초기화합니다 loss = 0 diff --git a/beginner_source/data_loading_tutorial.py b/beginner_source/data_loading_tutorial.py index 8799d24ce..98df7508f 100644 --- a/beginner_source/data_loading_tutorial.py +++ b/beginner_source/data_loading_tutorial.py @@ -286,6 +286,12 @@ def __call__(self, sample): return {'image': torch.from_numpy(image), 'landmarks': torch.from_numpy(landmarks)} +###################################################################### +# .. note:: +# 위 예시에서, `RandomCrop` 은 외부 라이브러리의 난수 생성기(random number generator; 이 경우, Numpy의 `np.random.int` )를 +# 사용하고 있습니다. 이는 `DataLoader` 가 예상치 못한 동작을 하도록 할 수 있습니다. +# (https://pytorch.org/docs/stable/notes/faq.html#my-data-loader-workers-return-identical-random-numbers 를 참고하세요) +# 실제 상황에서는 `torch.randint` 와 같은 PyTorch가 제공하는 난수 생성기를 사용하는 것이 안전합니다. ###################################################################### # Compose transforms @@ -368,7 +374,7 @@ def __call__(self, sample): # 그러나, 대부분의 경우에 대해서 정확하게 작동해야 합니다. dataloader = DataLoader(transformed_dataset, batch_size=4, - shuffle=True, num_workers=4) + shuffle=True, num_workers=0) # 배치하는 과정을 보여주는 함수입니다. diff --git a/beginner_source/dcgan_faces_tutorial.py b/beginner_source/dcgan_faces_tutorial.py index 1da4614eb..c1862aa69 100644 --- a/beginner_source/dcgan_faces_tutorial.py +++ b/beginner_source/dcgan_faces_tutorial.py @@ -274,7 +274,7 @@ # -------------- # # With our input parameters set and the dataset prepared, we can now get -# into the implementation. We will start with the weigth initialization +# into the implementation. We will start with the weight initialization # strategy, then talk about the generator, discriminator, loss functions, # and training loop in detail. # @@ -554,7 +554,7 @@ def forward(self, input): # reported are: # # - **Loss_D** - discriminator loss calculated as the sum of losses for -# the all real and all fake batches (:math:`log(D(x)) + log(D(G(z)))`). +# the all real and all fake batches (:math:`log(D(x)) + log(1 - D(G(z)))`). # - **Loss_G** - generator loss calculated as :math:`log(D(G(z)))` # - **D(x)** - the average output (across the batch) of the discriminator # for the all real batch. 
This should start close to 1 then @@ -591,7 +591,7 @@ def forward(self, input): # Format batch real_cpu = data[0].to(device) b_size = real_cpu.size(0) - label = torch.full((b_size,), real_label, device=device) + label = torch.full((b_size,), real_label, dtype=torch.float, device=device) # Forward pass real batch through D output = netD(real_cpu).view(-1) # Calculate loss on all-real batch @@ -610,10 +610,10 @@ def forward(self, input): output = netD(fake.detach()).view(-1) # Calculate D's loss on the all-fake batch errD_fake = criterion(output, label) - # Calculate the gradients for this batch + # Calculate the gradients for this batch, accumulated (summed) with previous gradients errD_fake.backward() D_G_z1 = output.mean().item() - # Add the gradients from the all-real and all-fake batches + # Compute error of D as sum over the fake and the real batches errD = errD_real + errD_fake # Update D optimizerD.step() diff --git a/beginner_source/deep_learning_60min_blitz.rst b/beginner_source/deep_learning_60min_blitz.rst index 7cecc99b6..8e3e8a2b9 100644 --- a/beginner_source/deep_learning_60min_blitz.rst +++ b/beginner_source/deep_learning_60min_blitz.rst @@ -9,13 +9,18 @@ -이 튜토리얼의 목표: +파이토치(PyTorch)가 무엇인가요? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +PyTorch는 Python 기반의 과학 연산 패키지로 다음 두 가지 목적으로 제공됩니다: +- GPU 및 다른 가속기의 성능을 사용하기 위한 NumPy의 대체제 제공 +- 신경망 구현에 유용한 자동 미분(automatic differntiation) 라이브러리 제공 + +이 튜토리얼의 목표 +~~~~~~~~~~~~~~~~~~~~~~~~ - 높은 수준에서 PyTorch의 Tensor library와 신경망(Neural Network)를 이해합니다. - 이미지를 분류하는 작은 신경망을 학습시킵니다. -*이 튜토리얼은 독자가 NumPy에 대한 기본적 이해를 하고 있다고 가정합니다.* - .. Note:: `torch`_ 와 `torchvision`_ 패키지를 설치했는지 확인하십시오. diff --git a/beginner_source/deeplabv3_on_android.rst b/beginner_source/deeplabv3_on_android.rst new file mode 100644 index 000000000..9b17e04fd --- /dev/null +++ b/beginner_source/deeplabv3_on_android.rst @@ -0,0 +1,230 @@ +Image Segmentation DeepLabV3 on Android +================================================= + +**Author**: `Jeff Tang `_ + +**Reviewed by**: `Jeremiah Chung `_ + +Introduction +------------ + +Semantic image segmentation is a computer vision task that uses semantic labels to mark specific regions of an input image. The PyTorch semantic image segmentation `DeepLabV3 model `_ can be used to label image regions with `20 semantic classes `_ including, for example, bicycle, bus, car, dog, and person. Image segmentation models can be very useful in applications such as autonomous driving and scene understanding. + +In this tutorial, we will provide a step-by-step guide on how to prepare and run the PyTorch DeepLabV3 model on Android, taking you from the beginning of having a model you may want to use on Android to the end of having a complete Android app using the model. We will also cover practical and general tips on how to check if your next favorable pre-trained PyTorch models can run on Android, and how to avoid pitfalls. + +.. note:: Before going through this tutorial, you should check out `PyTorch Mobile for Android `_ and give the PyTorch Android `HelloWorld `_ example app a quick try. This tutorial will go beyond the image classification model, usually the first kind of model deployed on mobile. The complete code repo for this tutorial is available `here `_. + +Learning Objectives +------------------- + +In this tutorial, you will learn how to: + +1. Convert the DeepLabV3 model for Android deployment. + +2. Get the output of the model for the example input image in Python and compare it to the output from the Android app. + +3. 
Build a new Android app or reuse an Android example app to load the converted model. + +4. Prepare the input into the format that the model expects and process the model output. + +5. Complete the UI, refactor, build and run the app to see image segmentation in action. + +Pre-requisites +--------------- + +* PyTorch 1.6 or 1.7 + +* torchvision 0.7 or 0.8 + +* Android Studio 3.5.1 or above with NDK installed + +Steps +--------- + +1. Convert the DeepLabV3 model for Android deployment +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The first step to deploying a model on Android is to convert the model into the `TorchScript `_ format. + +.. note:: + Not all PyTorch models can be converted to TorchScript at this time because a model definition may use language features that are not in TorchScript, which is a subset of Python. See the `Script and Optimize Recipe <../recipes/script_optimized.html>`_ for more details. + +Simply run the script below to generate the scripted model `deeplabv3_scripted.pt`: + +:: + + import torch + + # use deeplabv3_resnet50 instead of resnet101 to reduce the model size + model = torch.hub.load('pytorch/vision:v0.7.0', 'deeplabv3_resnet50', pretrained=True) + model.eval() + + scriptedm = torch.jit.script(model) + torch.jit.save(scriptedm, "deeplabv3_scripted.pt") + +The size of the generated `deeplabv3_scripted.pt` model file should be around 168MB. Ideally, a model should also be quantized for significant size reduction and faster inference before being deployed on an Android app. To have a general understanding of quantization, see the `Quantization Recipe <../recipes/quantization.html>`_ and the resource links there. We will cover in detail how to correctly apply a quantization workflow called Post Training `Static Quantization `_ to the DeepLabV3 model in a future tutorial or recipe. + +2. Get example input and output of the model in Python +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Now that we have a scripted PyTorch model, let's test with some example inputs to make sure the model works correctly on Android. First, let's write a Python script that uses the model to make inferences and examine inputs and outputs. For this example of the DeepLabV3 model, we can reuse the code in Step 1 and in the `DeepLabV3 model hub site `_. Add the following code snippet to the code above: + +:: + + from PIL import Image + from torchvision import transforms + input_image = Image.open("deeplab.jpg") + preprocess = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + + input_tensor = preprocess(input_image) + input_batch = input_tensor.unsqueeze(0) + with torch.no_grad(): + output = model(input_batch)['out'][0] + + print(input_batch.shape) + print(output.shape) + +Download `deeplab.jpg` from `here `_, then run the script above and you will see the shapes of the input and output of the model: + +:: + + torch.Size([1, 3, 400, 400]) + torch.Size([21, 400, 400]) + +So if you provide the same image input `deeplab.jpg` of size 400x400 to the model on Android, the output of the model should have the size [21, 400, 400]. You should also print out at least the beginning parts of the actual data of the input and output, to be used in Step 4 below to compare with the actual input and output of the model when running in the Android app. + +3. 
Build a new Android app or reuse an example app and load the model +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +First, follow Step 3 of the `Model Preparation for Android recipe <../recipes/model_preparation_android.html#add-the-model-and-pytorch-library-on-android>`_ to use our model in an Android Studio project with PyTorch Mobile enabled. Because both DeepLabV3 used in this tutorial and MobileNet v2 used in the PyTorch HelloWorld Android example are computer vision models, you can also get the `HelloWorld example repo `_ to make it easier to modify the code that loads the model and processes the input and output. The main goal in this step and Step 4 is to make sure the model `deeplabv3_scripted.pt` generated in Step 1 can indeed work correctly on Android. + +Now let's add `deeplabv3_scripted.pt` and `deeplab.jpg` used in Step 2 to the Android Studio project and modify the `onCreate` method in the `MainActivity` to resemble: + +.. code-block:: java + + Module module = null; + try { + module = Module.load(assetFilePath(this, "deeplabv3_scripted.pt")); + } catch (IOException e) { + Log.e("ImageSegmentation", "Error loading model!", e); + finish(); + } + +Then set a breakpoint at the line `finish()` and build and run the app. If the app doesn't stop at the breakpoint, it means that the scripted model in Step 1 has been successfully loaded on Android. + +4. Process the model input and output for model inference +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +After the model loads in the previous step, let's verify that it works with expected inputs and can generate expected outputs. As the model input for the DeepLabV3 model is an image the same as that of the MobileNet v2 in the HelloWorld example, we will reuse some of the code in the `MainActivity.java `_ file from HelloWorld for input processing. Replace the code snippet between `line 50 `_ and 73 in `MainActivity.java` with the following code: + +.. code-block:: java + + final Tensor inputTensor = TensorImageUtils.bitmapToFloat32Tensor(bitmap, + TensorImageUtils.TORCHVISION_NORM_MEAN_RGB, + TensorImageUtils.TORCHVISION_NORM_STD_RGB); + final float[] inputs = inputTensor.getDataAsFloatArray(); + + Map outTensors = + module.forward(IValue.from(inputTensor)).toDictStringKey(); + + // the key "out" of the output tensor contains the semantic masks + // see https://pytorch.org/hub/pytorch_vision_deeplabv3_resnet101 + final Tensor outputTensor = outTensors.get("out").toTensor(); + final float[] outputs = outputTensor.getDataAsFloatArray(); + + int width = bitmap.getWidth(); + int height = bitmap.getHeight(); + +.. note:: + The model output is a dictionary for the DeepLabV3 model so we use `toDictStringKey` to correctly extract the result. For other models, the model output may also be a single tensor or a tuple of tensors, among other things. + +With the code changes shown above, you can set breakpoints after `final float[] inputs` and `final float[] outputs`, which populate the input tensor and output tensor data to float arrays for easy debugging. Run the app and when it stops at the breakpoints, compare the numbers in `inputs` and `outputs` with the model input and output data you see in Step 2 to see if they match. For the same inputs to the models running on Android and Python, you should get the same outputs. + +.. 
warning:: + You may see different model outputs with the same image input when running on an Android emulator due to some Android emulator's floating point implementation issue. So it is best to test the app on a real Android device. + +All we have done so far is to confirm that the model of our interest can be scripted and run correctly in our Android app as in Python. The steps we walked through so far for using a model in an iOS app consumes the bulk, if not most, of our app development time, similar to how data preprocessing is the heaviest lift for a typical machine learning project. + +5. Complete the UI, refactor, build and run the app +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Now we are ready to complete the app and the UI to actually see the processed result as a new image. The output processing code should be like this, added to the end of the code snippet in Step 4: + +.. code-block:: java + + int[] intValues = new int[width * height]; + // go through each element in the output of size [WIDTH, HEIGHT] and + // set different color for different classnum + for (int j = 0; j < width; j++) { + for (int k = 0; k < height; k++) { + // maxi: the index of the 21 CLASSNUM with the max probability + int maxi = 0, maxj = 0, maxk = 0; + double maxnum = -100000.0; + for (int i=0; i < CLASSNUM; i++) { + if (outputs[i*(width*height) + j*width + k] > maxnum) { + maxnum = outputs[i*(width*height) + j*width + k]; + maxi = i; maxj = j; maxk= k; + } + } + // color coding for person (red), dog (green), sheep (blue) + // black color for background and other classes + if (maxi == PERSON) + intValues[maxj*width + maxk] = 0xFFFF0000; // red + else if (maxi == DOG) + intValues[maxj*width + maxk] = 0xFF00FF00; // green + else if (maxi == SHEEP) + intValues[maxj*width + maxk] = 0xFF0000FF; // blue + else + intValues[maxj*width + maxk] = 0xFF000000; // black + } + } + +The constants used in the code above are defined in the beginning of the class `MainActivity`: + +.. code-block:: java + + private static final int CLASSNUM = 21; + private static final int DOG = 12; + private static final int PERSON = 15; + private static final int SHEEP = 17; + + +The implementation here is based on the understanding of the DeepLabV3 model which outputs a tensor of size [21, width, height] for an input image of width*height. Each element in the width*height output array is a value between 0 and 20 (for a total of 21 semantic labels described in Introduction) and the value is used to set a specific color. Color coding of the segmentation here is based on the class with the highest probability, and you can extend the color coding for all classes in your own dataset. + +After the output processing, you will also need to call the code below to render the RGB `intValues` array to a bitmap instance `outputBitmap` before displaying it on an `ImageView`: + +.. code-block:: java + + Bitmap bmpSegmentation = Bitmap.createScaledBitmap(bitmap, width, height, true); + Bitmap outputBitmap = bmpSegmentation.copy(bmpSegmentation.getConfig(), true); + outputBitmap.setPixels(intValues, 0, outputBitmap.getWidth(), 0, 0, + outputBitmap.getWidth(), outputBitmap.getHeight()); + imageView.setImageBitmap(outputBitmap); + +The UI for this app is also similar to that for HelloWorld, except that you do not need the `TextView` to show the image classification result. 
You can also add two buttons `Segment` and `Restart` as shown in the code repo to run the model inference and to show back the original image after the segmentation result is shown. + +Now when you run the app on an Android emulator or preferably an actual device, you will see screens like the following: + +.. image:: /_static/img/deeplabv3_android.png + :width: 300 px +.. image:: /_static/img/deeplabv3_android2.png + :width: 300 px + + +Recap +-------- + +In this tutorial, we described what it takes to convert a pre-trained PyTorch DeepLabV3 model for Android and how to make sure the model can run successfully on Android. Our focus was to help you understand the process of confirming that a model can indeed run on Android. The complete code repo is available `here `_. + +More advanced topics such as quantization and using models via transfer learning or of your own on Android will be covered soon in future demo apps and tutorials. + + +Learn More +------------ + +1. `PyTorch Mobile site `_ +2. `DeepLabV3 model `_ +3. `DeepLabV3 paper `_ diff --git a/beginner_source/deeplabv3_on_ios.rst b/beginner_source/deeplabv3_on_ios.rst new file mode 100644 index 000000000..ee27384de --- /dev/null +++ b/beginner_source/deeplabv3_on_ios.rst @@ -0,0 +1,248 @@ +Image Segmentation DeepLabV3 on iOS +============================================== + +**Author**: `Jeff Tang `_ + +**Reviewed by**: `Jeremiah Chung `_ + +Introduction +------------ + +Semantic image segmentation is a computer vision task that uses semantic labels to mark specific regions of an input image. The PyTorch semantic image segmentation `DeepLabV3 model `_ can be used to label image regions with `20 semantic classes `_ including, for example, bicycle, bus, car, dog, and person. Image segmentation models can be very useful in applications such as autonomous driving and scene understanding. + +In this tutorial, we will provide a step-by-step guide on how to prepare and run the PyTorch DeepLabV3 model on iOS, taking you from the beginning of having a model you may want to use on iOS to the end of having a complete iOS app using the model. We will also cover practical and general tips on how to check if your next favorite pre-trained PyTorch models can run on iOS, and how to avoid pitfalls. + +.. note:: Before going through this tutorial, you should check out `PyTorch Mobile for iOS `_ and give the PyTorch iOS `HelloWorld `_ example app a quick try. This tutorial will go beyond the image classification model, usually the first kind of model deployed on mobile. The complete code repo for this tutorial is available `here `_. + +Learning Objectives +------------------- + +In this tutorial, you will learn how to: + +1. Convert the DeepLabV3 model for iOS deployment. + +2. Get the output of the model for the example input image in Python and compare it to the output from the iOS app. + +3. Build a new iOS app or reuse an iOS example app to load the converted model. + +4. Prepare the input into the format that the model expects and process the model output. + +5. Complete the UI, refactor, build and run the app to see image segmentation in action. + +Pre-requisites +--------------- + +* PyTorch 1.6 or 1.7 + +* torchvision 0.7 or 0.8 + +* Xcode 11 or 12 + +Steps +--------- + + +1. Convert the DeepLabV3 model for iOS deployment +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The first step to deploying a model on iOS is to convert the model into the `TorchScript `_ format. + +.. 
note:: + Not all PyTorch models can be converted to TorchScript at this time because a model definition may use language features that are not in TorchScript, which is a subset of Python. See the `Script and Optimize Recipe <../recipes/script_optimized.html>`_ for more details. + +Simply run the script below to generate the scripted model `deeplabv3_scripted.pt`: + +:: + + import torch + + # use deeplabv3_resnet50 instead of deeplabv3_resnet101 to reduce the model size + model = torch.hub.load('pytorch/vision:v0.8.0', 'deeplabv3_resnet50', pretrained=True) + model.eval() + + scriptedm = torch.jit.script(model) + torch.jit.save(scriptedm, "deeplabv3_scripted.pt") + +The size of the generated `deeplabv3_scripted.pt` model file should be around 168MB. Ideally, a model should also be quantized for significant size reduction and faster inference before being deployed on an iOS app. To have a general understanding of quantization, see the `Quantization Recipe <../recipes/quantization.html>`_ and the resource links there. We will cover in detail how to correctly apply a quantization workflow called Post Training `Static Quantization `_ to the DeepLabV3 model in a future tutorial or recipe. + +2. Get example input and output of the model in Python +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Now that we have a scripted PyTorch model, let's test with some example inputs to make sure the model works correctly on iOS. First, let's write a Python script that uses the model to make inferences and examine inputs and outputs. For this example of the DeepLabV3 model, we can reuse the code in Step 1 and in the `DeepLabV3 model hub site `_. Add the following code snippet to the code above: + +:: + + from PIL import Image + from torchvision import transforms + input_image = Image.open("deeplab.jpg") + preprocess = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + + input_tensor = preprocess(input_image) + input_batch = input_tensor.unsqueeze(0) + with torch.no_grad(): + output = model(input_batch)['out'][0] + + print(input_batch.shape) + print(output.shape) + +Download `deeplab.jpg` from `here `_ and run the script above to see the shapes of the input and output of the model: + +:: + + torch.Size([1, 3, 400, 400]) + torch.Size([21, 400, 400]) + +So if you provide the same image input `deeplab.jpg` of size 400x400 to the model on iOS, the output of the model should have the size [21, 400, 400]. You should also print out at least the beginning parts of the actual data of the input and output, to be used in Step 4 below to compare with the actual input and output of the model when running in the iOS app. + +3. Build a new iOS app or reuse an example app and load the model +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +First, follow Step 3 of the `Model Preparation for iOS recipe <../recipes/model_preparation_ios.html#add-the-model-and-pytorch-library-on-ios>`_ to use our model in an Xcode project with PyTorch Mobile enabled. Because both the DeepLabV3 model used in this tutorial and the MobileNet v2 model used in the PyTorch HelloWorld iOS example are computer vision models, you may choose to start with the `HelloWorld example repo `_ as a template to reuse the code that loads the model and processes the input and output. + +Now let's add `deeplabv3_scripted.pt` and `deeplab.jpg` used in Step 2 to the Xcode project and modify `ViewController.swift` to resemble: + +.. 
code-block:: swift + + class ViewController: UIViewController { + var image = UIImage(named: "deeplab.jpg")! + + override func viewDidLoad() { + super.viewDidLoad() + } + + private lazy var module: TorchModule = { + if let filePath = Bundle.main.path(forResource: "deeplabv3_scripted", + ofType: "pt"), + let module = TorchModule(fileAtPath: filePath) { + return module + } else { + fatalError("Can't load the model file!") + } + }() + } + +Then set a breakpoint at the line `return module` and build and run the app. The app should stop at the breakpoint, meaning that the scripted model in Step 1 has been successfully loaded on iOS. + +4. Process the model input and output for model inference +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +After the model loads in the previous step, let's verify that it works with expected inputs and can generate expected outputs. As the model input for the DeepLabV3 model is an image, the same as that of the MobileNet v2 in the HelloWorld example, we will reuse some of the code in the `TorchModule.mm `_ file from HelloWorld for input processing. Replace the `predictImage` method implementation in `TorchModule.mm` with the following code: + +.. code-block:: objective-c + + - (unsigned char*)predictImage:(void*)imageBuffer { + // 1. the example deeplab.jpg size is size 400x400 and there are 21 semantic classes + const int WIDTH = 400; + const int HEIGHT = 400; + const int CLASSNUM = 21; + + at::Tensor tensor = torch::from_blob(imageBuffer, {1, 3, WIDTH, HEIGHT}, at::kFloat); + torch::autograd::AutoGradMode guard(false); + at::AutoNonVariableTypeMode non_var_type_mode(true); + + // 2. convert the input tensor to an NSMutableArray for debugging + float* floatInput = tensor.data_ptr(); + if (!floatInput) { + return nil; + } + NSMutableArray* inputs = [[NSMutableArray alloc] init]; + for (int i = 0; i < 3 * WIDTH * HEIGHT; i++) { + [inputs addObject:@(floatInput[i])]; + } + + // 3. the output of the model is a dictionary of string and tensor, as + // specified at https://pytorch.org/hub/pytorch_vision_deeplabv3_resnet101 + auto outputDict = _impl.forward({tensor}).toGenericDict(); + + // 4. convert the output to another NSMutableArray for easy debugging + auto outputTensor = outputDict.at("out").toTensor(); + float* floatBuffer = outputTensor.data_ptr(); + if (!floatBuffer) { + return nil; + } + NSMutableArray* results = [[NSMutableArray alloc] init]; + for (int i = 0; i < CLASSNUM * WIDTH * HEIGHT; i++) { + [results addObject:@(floatBuffer[i])]; + } + + return nil; + } + +.. note:: + The model output is a dictionary for the DeepLabV3 model so we use `toGenericDict` to correctly extract the result. For other models, the model output may also be a single tensor or a tuple of tensors, among other things. + +With the code changes shown above, you can set breakpoints after the two for loops that populate `inputs` and `results` and compare them with the model input and output data you saw in Step 2 to see if they match. For the same inputs to the models running on iOS and Python, you should get the same outputs. + +All we have done so far is to confirm that the model of our interest can be scripted and run correctly in our iOS app as in Python. The steps we walked through so far for using a model in an iOS app consumes the bulk, if not most, of our app development time, similar to how data preprocessing is the heaviest lift for a typical machine learning project. + +5. 
Complete the UI, refactor, build and run the app +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Now we are ready to complete the app and the UI to actually see the processed result as a new image. The output processing code should be like this, added to the end of the code snippet in Step 4 in `TorchModule.mm` - remember to first remove the line `return nil;` temporarily put there to make the code build and run: + +.. code-block:: objective-c + + // see the 20 semantic classes link in Introduction + const int DOG = 12; + const int PERSON = 15; + const int SHEEP = 17; + + NSMutableData* data = [NSMutableData dataWithLength: + sizeof(unsigned char) * 3 * WIDTH * HEIGHT]; + unsigned char* buffer = (unsigned char*)[data mutableBytes]; + // go through each element in the output of size [WIDTH, HEIGHT] and + // set different color for different classnum + for (int j = 0; j < WIDTH; j++) { + for (int k = 0; k < HEIGHT; k++) { + // maxi: the index of the 21 CLASSNUM with the max probability + int maxi = 0, maxj = 0, maxk = 0; + float maxnum = -100000.0; + for (int i = 0; i < CLASSNUM; i++) { + if ([results[i * (WIDTH * HEIGHT) + j * WIDTH + k] floatValue] > maxnum) { + maxnum = [results[i * (WIDTH * HEIGHT) + j * WIDTH + k] floatValue]; + maxi = i; maxj = j; maxk = k; + } + } + int n = 3 * (maxj * width + maxk); + // color coding for person (red), dog (green), sheep (blue) + // black color for background and other classes + buffer[n] = 0; buffer[n+1] = 0; buffer[n+2] = 0; + if (maxi == PERSON) buffer[n] = 255; + else if (maxi == DOG) buffer[n+1] = 255; + else if (maxi == SHEEP) buffer[n+2] = 255; + } + } + return buffer; + +The implementation here is based on the understanding of the DeepLabV3 model which outputs a tensor of size [21, width, height] for an input image of width*height. Each element in the width*height output array is a value between 0 and 20 (for a total of 21 semantic labels described in Introduction) and the value is used to set a specific color. Color coding of the segmentation here is based on the class with the highest probability, and you can extend the color coding for all classes in your own dataset. + +After the output processing, you will also need to call a helper function to convert the RGB `buffer` to an `UIImage` instance to be shown on `UIImageView`. You can refer to the example code `convertRGBBufferToUIImage` defined in `UIImageHelper.mm` in the code repo. + +The UI for this app is also similar to that for HelloWorld, except that you do not need the `UITextView` to show the image classification result. You can also add two buttons `Segment` and `Restart` as shown in the code repo to run the model inference and to show back the original image after the segmentation result is shown. + +The last step before we can run the app is to connect all the pieces together. Modify the `ViewController.swift` file to use the `predictImage`, which is refactored and changed to `segmentImage` in the repo, and helper functions you built as shown in the example code in the repo in `ViewController.swift`. Connect the buttons to the actions and you should be good to go. + +Now when you run the app on an iOS simulator or an actual iOS device, you will see the following screens: + +.. image:: /_static/img/deeplabv3_ios.png + :width: 300 px +.. 
image:: /_static/img/deeplabv3_ios2.png + :width: 300 px + + +Recap +-------- + +In this tutorial, we described what it takes to convert a pre-trained PyTorch DeepLabV3 model for iOS and how to make sure the model can run successfully on iOS. Our focus was to help you understand the process of confirming that a model can indeed run on iOS. The complete code repo is available `here `_. + +More advanced topics such as quantization and using models via transfer learning or of your own on iOS will be covered soon in future demo apps and tutorials. + +Learn More +------------ + +1. `PyTorch Mobile site `_ +2. `DeepLabV3 model `_ +3. `DeepLabV3 paper `_ diff --git a/beginner_source/dist_overview.rst b/beginner_source/dist_overview.rst index bc9f7fe6b..572b3df9d 100644 --- a/beginner_source/dist_overview.rst +++ b/beginner_source/dist_overview.rst @@ -113,7 +113,7 @@ model replicas. Moreover, the model is broadcast at DDP construction time instea of in every forward pass, which also helps to speed up training. DDP is shipped with several performance optimization technologies. For a more in-depth explanation, please refer to this -`DDP paper `__ (VLDB'20). +`DDP paper `__ (VLDB'20). DDP materials are listed below: @@ -131,8 +131,10 @@ DDP materials are listed below: tutorial. 3. The `Launching and configuring distributed data parallel applications `__ document shows how to use the DDP launching script. -4. `PyTorch Distributed Trainer with Amazon AWS `__ - demonstrates how to use DDP on AWS. +4. The `Shard Optimizer States With ZeroRedundancyOptimizer `__ + recipe demonstrates how `ZeroRedundancyOptimizer `__ + helps to reduce optimizer memory footprint for distributed data-parallel + training. TorchElastic ~~~~~~~~~~~~ @@ -195,3 +197,13 @@ RPC Tutorials are listed below: `@rpc.functions.async_execution `__ decorator, which can help speed up inference and training. It uses similar RL and PS examples employed in the above tutorials 1 and 2. +5. The `Combining Distributed DataParallel with Distributed RPC Framework <../advanced/rpc_ddp_tutorial.html>`__ + tutorial demonstrates how to combine DDP with RPC to train a model using + distributed data parallelism combined with distributed model parallelism. + + +PyTorch Distributed Developers +------------------------------ + +If you'd like to contribute to PyTorch Distributed, please refer to our +`Developer Guide `_. diff --git a/beginner_source/examples_autograd/polynomial_autograd.py b/beginner_source/examples_autograd/polynomial_autograd.py new file mode 100755 index 000000000..65ab5892d --- /dev/null +++ b/beginner_source/examples_autograd/polynomial_autograd.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- +""" +PyTorch: Tensors and autograd +------------------------------- + +A third order polynomial, trained to predict :math:`y=\sin(x)` from :math:`-\pi` +to :math:`pi` by minimizing squared Euclidean distance. + +This implementation computes the forward pass using operations on PyTorch +Tensors, and uses PyTorch autograd to compute gradients. + + +A PyTorch Tensor represents a node in a computational graph. If ``x`` is a +Tensor that has ``x.requires_grad=True`` then ``x.grad`` is another Tensor +holding the gradient of ``x`` with respect to some scalar value. +""" +import torch +import math + +dtype = torch.float +device = torch.device("cpu") +# device = torch.device("cuda:0") # Uncomment this to run on GPU + +# Create Tensors to hold input and outputs. 
+# By default, requires_grad=False, which indicates that we do not need to +# compute gradients with respect to these Tensors during the backward pass. +x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype) +y = torch.sin(x) + +# Create random Tensors for weights. For a third order polynomial, we need +# 4 weights: y = a + b x + c x^2 + d x^3 +# Setting requires_grad=True indicates that we want to compute gradients with +# respect to these Tensors during the backward pass. +a = torch.randn((), device=device, dtype=dtype, requires_grad=True) +b = torch.randn((), device=device, dtype=dtype, requires_grad=True) +c = torch.randn((), device=device, dtype=dtype, requires_grad=True) +d = torch.randn((), device=device, dtype=dtype, requires_grad=True) + +learning_rate = 1e-6 +for t in range(2000): + # Forward pass: compute predicted y using operations on Tensors. + y_pred = a + b * x + c * x ** 2 + d * x ** 3 + + # Compute and print loss using operations on Tensors. + # Now loss is a Tensor of shape (1,) + # loss.item() gets the scalar value held in the loss. + loss = (y_pred - y).pow(2).sum() + if t % 100 == 99: + print(t, loss.item()) + + # Use autograd to compute the backward pass. This call will compute the + # gradient of loss with respect to all Tensors with requires_grad=True. + # After this call a.grad, b.grad. c.grad and d.grad will be Tensors holding + # the gradient of the loss with respect to a, b, c, d respectively. + loss.backward() + + # Manually update weights using gradient descent. Wrap in torch.no_grad() + # because weights have requires_grad=True, but we don't need to track this + # in autograd. + with torch.no_grad(): + a -= learning_rate * a.grad + b -= learning_rate * b.grad + c -= learning_rate * c.grad + d -= learning_rate * d.grad + + # Manually zero the gradients after updating weights + a.grad = None + b.grad = None + c.grad = None + d.grad = None + +print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3') diff --git a/beginner_source/examples_autograd/polynomial_custom_function.py b/beginner_source/examples_autograd/polynomial_custom_function.py new file mode 100755 index 000000000..33fc1a246 --- /dev/null +++ b/beginner_source/examples_autograd/polynomial_custom_function.py @@ -0,0 +1,104 @@ +# -*- coding: utf-8 -*- +""" +PyTorch: Defining New autograd Functions +---------------------------------------- + +A third order polynomial, trained to predict :math:`y=\sin(x)` from :math:`-\pi` +to :math:`pi` by minimizing squared Euclidean distance. Instead of writing the +polynomial as :math:`y=a+bx+cx^2+dx^3`, we write the polynomial as +:math:`y=a+b P_3(c+dx)` where :math:`P_3(x)=\frac{1}{2}\left(5x^3-3x\right)` is +the `Legendre polynomial`_ of degree three. + +.. _Legendre polynomial: + https://en.wikipedia.org/wiki/Legendre_polynomials + +This implementation computes the forward pass using operations on PyTorch +Tensors, and uses PyTorch autograd to compute gradients. + +In this implementation we implement our own custom autograd function to perform +:math:`P_3'(x)`. By mathematics, :math:`P_3'(x)=\frac{3}{2}\left(5x^2-1\right)` +""" +import torch +import math + + +class LegendrePolynomial3(torch.autograd.Function): + """ + We can implement our own custom autograd Functions by subclassing + torch.autograd.Function and implementing the forward and backward passes + which operate on Tensors. 
+ """ + + @staticmethod + def forward(ctx, input): + """ + In the forward pass we receive a Tensor containing the input and return + a Tensor containing the output. ctx is a context object that can be used + to stash information for backward computation. You can cache arbitrary + objects for use in the backward pass using the ctx.save_for_backward method. + """ + ctx.save_for_backward(input) + return 0.5 * (5 * input ** 3 - 3 * input) + + @staticmethod + def backward(ctx, grad_output): + """ + In the backward pass we receive a Tensor containing the gradient of the loss + with respect to the output, and we need to compute the gradient of the loss + with respect to the input. + """ + input, = ctx.saved_tensors + return grad_output * 1.5 * (5 * input ** 2 - 1) + + +dtype = torch.float +device = torch.device("cpu") +# device = torch.device("cuda:0") # Uncomment this to run on GPU + +# Create Tensors to hold input and outputs. +# By default, requires_grad=False, which indicates that we do not need to +# compute gradients with respect to these Tensors during the backward pass. +x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype) +y = torch.sin(x) + +# Create random Tensors for weights. For this example, we need +# 4 weights: y = a + b * P3(c + d * x), these weights need to be initialized +# not too far from the correct result to ensure convergence. +# Setting requires_grad=True indicates that we want to compute gradients with +# respect to these Tensors during the backward pass. +a = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True) +b = torch.full((), -1.0, device=device, dtype=dtype, requires_grad=True) +c = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True) +d = torch.full((), 0.3, device=device, dtype=dtype, requires_grad=True) + +learning_rate = 5e-6 +for t in range(2000): + # To apply our Function, we use Function.apply method. We alias this as 'P3'. + P3 = LegendrePolynomial3.apply + + # Forward pass: compute predicted y using operations; we compute + # P3 using our custom autograd operation. + y_pred = a + b * P3(c + d * x) + + # Compute and print loss + loss = (y_pred - y).pow(2).sum() + if t % 100 == 99: + print(t, loss.item()) + + # Use autograd to compute the backward pass. + loss.backward() + + # Update weights using gradient descent + with torch.no_grad(): + a -= learning_rate * a.grad + b -= learning_rate * b.grad + c -= learning_rate * c.grad + d -= learning_rate * d.grad + + # Manually zero the gradients after updating weights + a.grad = None + b.grad = None + c.grad = None + d.grad = None + +print(f'Result: y = {a.item()} + {b.item()} * P3({c.item()} + {d.item()} x)') diff --git a/beginner_source/examples_autograd/tf_two_layer_net.py b/beginner_source/examples_autograd/tf_two_layer_net.py deleted file mode 100755 index fb05210cc..000000000 --- a/beginner_source/examples_autograd/tf_two_layer_net.py +++ /dev/null @@ -1,76 +0,0 @@ -# -*- coding: utf-8 -*- -""" -TensorFlow: 정적 그래프(Static Graph) --------------------------------------- - -하나의 은닉층(hidden layer)과 편향(bias)이 없는 완전히 연결된 ReLU 신경망을, -유클리드 거리(Euclidean distance) 제곱을 최소화하는 식으로 x로부터 y를 예측하도록 -학습하겠습니다. - -기본적인 TensorFlow 연산을 사용하여 연산 그래프를 구성한 다음, 그래프를 여러 차례 -실행하여 실제로 신경망을 학습시켜보겠습니다. - -Tensorflow와 PyTorch의 주요한 차이점 중 하나는, PyTorch는 동적 연산 그래프를 -사용하는데 반해 Tensorflow는 정적 연산 그래프를 사용한다는 것입니다. - -먼저 Tensorflow에서 연산 그래프를 구성한 다음, 해당 그래프를 여러 차례 -실행해보겠습니다. 
-""" -import tensorflow as tf -import numpy as np - -# 먼저 연산 그래프를 구성합니다: - -# N은 배치 크기이며, D_in은 입력의 차원입니다; -# H는 은닉층의 차원이며, D_out은 출력 차원입니다. -N, D_in, H, D_out = 64, 1000, 100, 10 - -# 입력과 정답(target) 데이터를 위한 플레이스홀더(placeholder)를 생성합니다; -# 이는 우리가 그래프를 실행할 때 실제 데이터로 채워질 것입니다. -x = tf.placeholder(tf.float32, shape=(None, D_in)) -y = tf.placeholder(tf.float32, shape=(None, D_out)) - -# 가중치를 저장하기 위한 Variable을 생성하고 무작위 데이터로 초기화합니다. -# Tensorflow의 Variable은 그래프가 실행되는 동안 그 값이 유지됩니다. -w1 = tf.Variable(tf.random_normal((D_in, H))) -w2 = tf.Variable(tf.random_normal((H, D_out))) - -# 순전파 단계: Tensorflow의 Tensor 연산을 사용하여 예상되는 y 값을 계산합니다. -# 이 코드가 어떠한 수치 연산을 실제로 수행하지는 않는다는 것을 유의하세요; -# 이 단계에서는 나중에 실행할 연산 그래프를 구성하기만 합니다. -h = tf.matmul(x, w1) -h_relu = tf.maximum(h, tf.zeros(1)) -y_pred = tf.matmul(h_relu, w2) - -# Tensorflow의 Tensor 연산을 사용하여 손실(loss)을 계산합니다. -loss = tf.reduce_sum((y - y_pred) ** 2.0) - -# w1, w2에 대한 손실의 변화도(gradient)를 계산합니다. -grad_w1, grad_w2 = tf.gradients(loss, [w1, w2]) - -# 경사하강법(gradient descent)을 사용하여 가중치를 갱신합니다. 실제로 가중치를 -# 갱신하기 위해서는 그래프가 실행될 때 new_w1과 new_w2 계산(evaluate)해야 합니다. -# Tensorflow에서 가중치의 값을 갱신하는 작업이 연산 그래프의 일부임을 유의하십시오; -# PyTorch에서는 이 작업이 연산 그래프의 밖에서 일어납니다. -learning_rate = 1e-6 -new_w1 = w1.assign(w1 - learning_rate * grad_w1) -new_w2 = w2.assign(w2 - learning_rate * grad_w2) - -# 지금까지 우리는 연산 그래프를 구성하였으므로, 실제로 그래프를 실행하기 위해 이제 -# Tensorflow 세션(session)에 들어가보겠습니다. -with tf.Session() as sess: - # 그래프를 한 번 실행하여 w1과 w2 Variable을 초기화합니다. - sess.run(tf.global_variables_initializer()) - - # 입력 데이터 x와 정답 데이터 y를 저장하기 위한 NumPy 배열을 생성합니다. - x_value = np.random.randn(N, D_in) - y_value = np.random.randn(N, D_out) - for t in range(500): - # 그래프를 여러 번 실행합니다. 매번 그래프가 실행할 때마다 feed_dict - # 인자에 x_value를 x에, y_value를 y에 할당(bind)하도록 명시합니다. - # 또한, 그래프를 실행할 때마다 손실과 new_w1, new_w2 값을 - # 계산하려고 합니다; 이러한 Tensor들의 값은 NumPy 배열로 반환됩니다. - loss_value, _, _ = sess.run([loss, new_w1, new_w2], - feed_dict={x: x_value, y: y_value}) - if t % 100 == 99: - print(t, loss_value) diff --git a/beginner_source/examples_autograd/two_layer_net_autograd.py b/beginner_source/examples_autograd/two_layer_net_autograd.py deleted file mode 100755 index 610f9108a..000000000 --- a/beginner_source/examples_autograd/two_layer_net_autograd.py +++ /dev/null @@ -1,71 +0,0 @@ -# -*- coding: utf-8 -*- -""" -PyTorch: Tensor와 autograd -------------------------------- - -하나의 은닉층(hidden layer)과 편향(bias)이 없는 완전히 연결된 ReLU 신경망을, -유클리드 거리(Euclidean distance) 제곱을 최소화하는 식으로 x로부터 y를 예측하도록 -학습하겠습니다. - -이번에는 PyTorch Tensor 연산을 사용하여 순전파 단계를 계산하고, PyTorch autograd를 -사용하여 변화도(gradient)를 계산하는 것을 구현해보겠습니다. - -PyTorch Tensor는 연산 그래프에서 노드(node)로 표현됩니다. 만약 ``x`` 가 -``x.requires_grad=True`` 인 Tensor면 ``x.grad`` 는 어떤 스칼라 값에 대한 ``x`` 의 -변화도를 갖는 또 다른 Tensor입니다. -""" -import torch - -dtype = torch.float -device = torch.device("cpu") -# device = torch.device("cuda:0") # GPU에서 실행하려면 이 주석을 제거하세요. - -# N은 배치 크기이며, D_in은 입력의 차원입니다; -# H는 은닉층의 차원이며, D_out은 출력 차원입니다. -N, D_in, H, D_out = 64, 1000, 100, 10 - -# 입력과 출력을 저장하기 위해 무작위 값을 갖는 Tensor를 생성합니다. -# requires_grad=False로 설정하여 역전파 중에 이 Tensor들에 대한 변화도를 계산할 -# 필요가 없음을 나타냅니다. (requres_grad의 기본값이 False이므로 아래 코드에는 -# 이를 반영하지 않았습니다.) -x = torch.randn(N, D_in, device=device, dtype=dtype) -y = torch.randn(N, D_out, device=device, dtype=dtype) - -# 가중치를 저장하기 위해 무작위 값을 갖는 Tensor를 생성합니다. -# requires_grad=True로 설정하여 역전파 중에 이 Tensor들에 대한 -# 변화도를 계산할 필요가 있음을 나타냅니다. 
-w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True) -w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True) - -learning_rate = 1e-6 -for t in range(500): - # 순전파 단계: Tensor 연산을 사용하여 예상되는 y 값을 계산합니다. 이는 Tensor를 - # 사용한 순전파 단계와 완전히 동일하지만, 역전파 단계를 별도로 구현하지 않아도 - # 되므로 중간값들에 대한 참조(reference)를 갖고 있을 필요가 없습니다. - y_pred = x.mm(w1).clamp(min=0).mm(w2) - - # Tensor 연산을 사용하여 손실을 계산하고 출력합니다. - # loss는 (1,) 형태의 Tensor이며, loss.item()은 loss의 스칼라 값입니다. - loss = (y_pred - y).pow(2).sum() - if t % 100 == 99: - print(t, loss.item()) - - # autograd를 사용하여 역전파 단계를 계산합니다. 이는 requires_grad=True를 - # 갖는 모든 Tensor에 대해 손실의 변화도를 계산합니다. 이후 w1.grad와 w2.grad는 - # w1과 w2 각각에 대한 손실의 변화도를 갖는 Tensor가 됩니다. - loss.backward() - - # 경사하강법(gradient descent)을 사용하여 가중치를 수동으로 갱신합니다. - # torch.no_grad()로 감싸는 이유는 가중치들이 requires_grad=True이지만 - # autograd에서는 이를 추적할 필요가 없기 때문입니다. - # 다른 방법은 weight.data 및 weight.grad.data를 조작하는 방법입니다. - # tensor.data가 tensor의 저장공간을 공유하기는 하지만, 이력을 - # 추적하지 않는다는 것을 기억하십시오. - # 또한, 이를 위해 torch.optim.SGD 를 사용할 수도 있습니다. - with torch.no_grad(): - w1 -= learning_rate * w1.grad - w2 -= learning_rate * w2.grad - - # 가중치 갱신 후에는 수동으로 변화도를 0으로 만듭니다. - w1.grad.zero_() - w2.grad.zero_() diff --git a/beginner_source/examples_autograd/two_layer_net_custom_function.py b/beginner_source/examples_autograd/two_layer_net_custom_function.py deleted file mode 100755 index 3b8df3f8b..000000000 --- a/beginner_source/examples_autograd/two_layer_net_custom_function.py +++ /dev/null @@ -1,88 +0,0 @@ -# -*- coding: utf-8 -*- -""" -PyTorch: 새 autograd Function 정의하기 ----------------------------------------- - -하나의 은닉층(hidden layer)과 편향(bias)이 없는 완전히 연결된 ReLU 신경망을, -유클리드 거리(Euclidean distance) 제곱을 최소화하는 식으로 x로부터 y를 예측하도록 -학습하겠습니다. - -PyTorch Variable 연산을 사용하여 순전파를 계산하고, PyTorch autograd를 사용하여 -변화도(gradient)를 계산하는 것을 구현하겠습니다. - -여기서는 사용자 정의 autograd 함수를 구현하여 ReLU 기능을 수행하도록 하겠습니다. -""" -import torch - - -class MyReLU(torch.autograd.Function): - """ - torch.autograd.Function을 상속받아 사용자 정의 autograd Function을 구현하고, - Tensor 연산을 하는 순전파와 역전파 단계를 구현하겠습니다. - """ - - @staticmethod - def forward(ctx, input): - """ - 순전파 단계에서는 입력을 갖는 Tensor를 받아 출력을 갖는 Tensor를 반환합니다. - ctx는 컨텍스트 객체(context object)로 역전파 연산을 위한 정보 저장에 - 사용합니다. ctx.save_for_backward method를 사용하여 역전파 단계에서 사용할 어떠한 - 객체도 저장(cache)해 둘 수 있습니다. - """ - ctx.save_for_backward(input) - return input.clamp(min=0) - - @staticmethod - def backward(ctx, grad_output): - """ - 역전파 단계에서는 출력에 대한 손실의 변화도를 갖는 Tensor를 받고, 입력에 - 대한 손실의 변화도를 계산합니다. - """ - input, = ctx.saved_tensors - grad_input = grad_output.clone() - grad_input[input < 0] = 0 - return grad_input - - -dtype = torch.float -device = torch.device("cpu") -# device = torch.device("cuda:0") # GPU에서 실행하려면 이 주석을 제거하세요. - -# N은 배치 크기이며, D_in은 입력의 차원입니다; -# H는 은닉층의 차원이며, D_out은 출력 차원입니다. -N, D_in, H, D_out = 64, 1000, 100, 10 - -# 입력과 출력을 저장하기 위해 무작위 값을 갖는 Tensor를 생성합니다. -x = torch.randn(N, D_in, device=device, dtype=dtype) -y = torch.randn(N, D_out, device=device, dtype=dtype) - -# 가중치를 저장하기 위해 무작위 값을 갖는 Tensor를 생성합니다. -w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True) -w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True) - -learning_rate = 1e-6 -for t in range(500): - # 사용자 정의 Function을 적용하기 위해 Function.apply 메소드를 사용합니다. - # 여기에 'relu'라는 이름을 붙였습니다. - relu = MyReLU.apply - - # 순전파 단계: Tensor 연산을 사용하여 예상되는 y 값을 계산합니다; - # 사용자 정의 autograd 연산을 사용하여 ReLU를 계산합니다. - y_pred = relu(x.mm(w1)).mm(w2) - - # 손실을 계산하고 출력합니다. 
- loss = (y_pred - y).pow(2).sum() - if t % 100 == 99: - print(t, loss.item()) - - # autograde를 사용하여 역전파 단계를 계산합니다. - loss.backward() - - # 경사하강법(gradient descent)을 사용하여 가중치를 갱신합니다. - with torch.no_grad(): - w1 -= learning_rate * w1.grad - w2 -= learning_rate * w2.grad - - # 가중치 갱신 후에는 수동으로 변화도를 0으로 만듭니다. - w1.grad.zero_() - w2.grad.zero_() diff --git a/beginner_source/examples_nn/dynamic_net.py b/beginner_source/examples_nn/dynamic_net.py index b42ea4283..31fa40f3e 100755 --- a/beginner_source/examples_nn/dynamic_net.py +++ b/beginner_source/examples_nn/dynamic_net.py @@ -1,73 +1,77 @@ # -*- coding: utf-8 -*- """ -PyTorch: 제어 흐름(Control Flow) + 가중치 공유(Weight Sharing) ---------------------------------------------------------------- +PyTorch: Control Flow + Weight Sharing +-------------------------------------- -PyTorch 동적 그래프의 강력함을 보여주기 위해, 매우 이상한 모델을 구현해보겠습니다: -각 순전파 단계에서 많은 은닉 계층을 갖는 완전히 연결(fully-connected)된 ReLU -신경망이 무작위로 0 ~ 3 사이의 숫자를 선택하고, 가장 안쪽(innermost)의 은닉층들을 -계산하기 위해 동일한 가중치를 여러 번 재사용합니다. +To showcase the power of PyTorch dynamic graphs, we will implement a very strange +model: a third-fifth order polynomial that on each forward pass +chooses a random number between 3 and 5 and uses that many orders, reusing +the same weights multiple times to compute the fourth and fifth order. """ import random import torch +import math class DynamicNet(torch.nn.Module): - def __init__(self, D_in, H, D_out): + def __init__(self): """ - 생성자에서 순전파 단계에서 사용할 3개의 nn.Linear 인스턴스를 생성합니다. + In the constructor we instantiate five parameters and assign them as members. """ - super(DynamicNet, self).__init__() - self.input_linear = torch.nn.Linear(D_in, H) - self.middle_linear = torch.nn.Linear(H, H) - self.output_linear = torch.nn.Linear(H, D_out) + super().__init__() + self.a = torch.nn.Parameter(torch.randn(())) + self.b = torch.nn.Parameter(torch.randn(())) + self.c = torch.nn.Parameter(torch.randn(())) + self.d = torch.nn.Parameter(torch.randn(())) + self.e = torch.nn.Parameter(torch.randn(())) def forward(self, x): """ - 모델의 순전파 단계에서, 무작위로 0, 1, 2 또는 3 중에 하나를 선택하고 - 은닉층을 계산하기 위해 여러번 사용한 middle_linear Module을 재사용합니다. + For the forward pass of the model, we randomly choose either 4, 5 + and reuse the e parameter to compute the contribution of these orders. - 각 순전파 단계는 동적 연산 그래프를 구성하기 때문에, 모델의 순전파 단계를 - 정의할 때 반복문이나 조건문과 같은 일반적인 Python 제어 흐름 연산자를 사용할 - 수 있습니다. + Since each forward pass builds a dynamic computation graph, we can use normal + Python control-flow operators like loops or conditional statements when + defining the forward pass of the model. - 여기에서 연산 그래프를 정의할 때 동일 Module을 여러번 재사용하는 것이 - 완벽히 안전하다는 것을 알 수 있습니다. 이것이 각 Module을 한 번씩만 사용할 - 수 있었던 Lua Torch보다 크게 개선된 부분입니다. + Here we also see that it is perfectly safe to reuse the same parameter many + times when defining a computational graph. """ - h_relu = self.input_linear(x).clamp(min=0) - for _ in range(random.randint(0, 3)): - h_relu = self.middle_linear(h_relu).clamp(min=0) - y_pred = self.output_linear(h_relu) - return y_pred + y = self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3 + for exp in range(4, random.randint(4, 6)): + y = y + self.e * x ** exp + return y + def string(self): + """ + Just like any class in Python, you can also define custom method on PyTorch modules + """ + return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3 + {self.e.item()} x^4 ? + {self.e.item()} x^5 ?' -# N은 배치 크기이며, D_in은 입력의 차원입니다; -# H는 은닉층의 차원이며, D_out은 출력 차원입니다. 
-N, D_in, H, D_out = 64, 1000, 100, 10 -# 입력과 출력을 저장하기 위해 무작위 값을 갖는 Tensor를 생성합니다. -x = torch.randn(N, D_in) -y = torch.randn(N, D_out) +# Create Tensors to hold input and outputs. +x = torch.linspace(-math.pi, math.pi, 2000) +y = torch.sin(x) -# 앞서 정의한 클래스를 생성(instantiating)하여 모델을 구성합니다. -model = DynamicNet(D_in, H, D_out) +# Construct our model by instantiating the class defined above +model = DynamicNet() -# 손실함수와 Optimizer를 만듭니다. 이 이상한 모델을 순수한 확률적 경사 하강법 -# (stochastic gradient decent)으로 학습하는 것은 어려우므로, 모멘텀(momentum)을 -# 사용합니다. +# Construct our loss function and an Optimizer. Training this strange model with +# vanilla stochastic gradient descent is tough, so we use momentum criterion = torch.nn.MSELoss(reduction='sum') -optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9) -for t in range(500): - # 순전파 단계: 모델에 x를 전달하여 예상되는 y 값을 계산합니다. +optimizer = torch.optim.SGD(model.parameters(), lr=1e-8, momentum=0.9) +for t in range(30000): + # Forward pass: Compute predicted y by passing x to the model y_pred = model(x) - # 손실을 계산하고 출력합니다. + # Compute and print loss loss = criterion(y_pred, y) - if t % 100 == 99: + if t % 2000 == 1999: print(t, loss.item()) - # 변화도를 0으로 만들고, 역전파 단계를 수행하고, 가중치를 갱신합니다. + # Zero gradients, perform a backward pass, and update the weights. optimizer.zero_grad() loss.backward() optimizer.step() + +print(f'Result: {model.string()}') diff --git a/beginner_source/examples_nn/polynomial_module.py b/beginner_source/examples_nn/polynomial_module.py new file mode 100755 index 000000000..0fa401bad --- /dev/null +++ b/beginner_source/examples_nn/polynomial_module.py @@ -0,0 +1,70 @@ +# -*- coding: utf-8 -*- +""" +PyTorch: Custom nn Modules +-------------------------- + +A third order polynomial, trained to predict :math:`y=\sin(x)` from :math:`-\pi` +to :math:`pi` by minimizing squared Euclidean distance. + +This implementation defines the model as a custom Module subclass. Whenever you +want a model more complex than a simple sequence of existing Modules you will +need to define your model this way. +""" +import torch +import math + + +class Polynomial3(torch.nn.Module): + def __init__(self): + """ + In the constructor we instantiate four parameters and assign them as + member parameters. + """ + super().__init__() + self.a = torch.nn.Parameter(torch.randn(())) + self.b = torch.nn.Parameter(torch.randn(())) + self.c = torch.nn.Parameter(torch.randn(())) + self.d = torch.nn.Parameter(torch.randn(())) + + def forward(self, x): + """ + In the forward function we accept a Tensor of input data and we must return + a Tensor of output data. We can use Modules defined in the constructor as + well as arbitrary operators on Tensors. + """ + return self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3 + + def string(self): + """ + Just like any class in Python, you can also define custom method on PyTorch modules + """ + return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3' + + +# Create Tensors to hold input and outputs. +x = torch.linspace(-math.pi, math.pi, 2000) +y = torch.sin(x) + +# Construct our model by instantiating the class defined above +model = Polynomial3() + +# Construct our loss function and an Optimizer. The call to model.parameters() +# in the SGD constructor will contain the learnable parameters (defined +# with torch.nn.Parameter) which are members of the model. 
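# As a quick aside (a minimal sketch, not needed for training), the four scalars
# registered with ``torch.nn.Parameter`` in the constructor are exactly what
# ``model.parameters()`` hands to the optimizer, which can be checked with
# ``named_parameters()``:
#
# .. code-block:: python
#
#     for name, param in model.named_parameters():
#         print(name, param.shape, param.requires_grad)
#     # prints: a torch.Size([]) True
#     #         b torch.Size([]) True
#     #         c torch.Size([]) True
#     #         d torch.Size([]) True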
+criterion = torch.nn.MSELoss(reduction='sum') +optimizer = torch.optim.SGD(model.parameters(), lr=1e-6) +for t in range(2000): + # Forward pass: Compute predicted y by passing x to the model + y_pred = model(x) + + # Compute and print loss + loss = criterion(y_pred, y) + if t % 100 == 99: + print(t, loss.item()) + + # Zero gradients, perform a backward pass, and update the weights. + optimizer.zero_grad() + loss.backward() + optimizer.step() + +print(f'Result: {model.string()}') diff --git a/beginner_source/examples_nn/polynomial_nn.py b/beginner_source/examples_nn/polynomial_nn.py new file mode 100755 index 000000000..9d5aca053 --- /dev/null +++ b/beginner_source/examples_nn/polynomial_nn.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- +""" +PyTorch: nn +----------- + +A third order polynomial, trained to predict :math:`y=\sin(x)` from :math:`-\pi` +to :math:`pi` by minimizing squared Euclidean distance. + +This implementation uses the nn package from PyTorch to build the network. +PyTorch autograd makes it easy to define computational graphs and take gradients, +but raw autograd can be a bit too low-level for defining complex neural networks; +this is where the nn package can help. The nn package defines a set of Modules, +which you can think of as a neural network layer that has produces output from +input and may have some trainable weights. +""" +import torch +import math + + +# Create Tensors to hold input and outputs. +x = torch.linspace(-math.pi, math.pi, 2000) +y = torch.sin(x) + +# For this example, the output y is a linear function of (x, x^2, x^3), so +# we can consider it as a linear layer neural network. Let's prepare the +# tensor (x, x^2, x^3). +p = torch.tensor([1, 2, 3]) +xx = x.unsqueeze(-1).pow(p) + +# In the above code, x.unsqueeze(-1) has shape (2000, 1), and p has shape +# (3,), for this case, broadcasting semantics will apply to obtain a tensor +# of shape (2000, 3) + +# Use the nn package to define our model as a sequence of layers. nn.Sequential +# is a Module which contains other Modules, and applies them in sequence to +# produce its output. The Linear Module computes output from input using a +# linear function, and holds internal Tensors for its weight and bias. +# The Flatten layer flatens the output of the linear layer to a 1D tensor, +# to match the shape of `y`. +model = torch.nn.Sequential( + torch.nn.Linear(3, 1), + torch.nn.Flatten(0, 1) +) + +# The nn package also contains definitions of popular loss functions; in this +# case we will use Mean Squared Error (MSE) as our loss function. +loss_fn = torch.nn.MSELoss(reduction='sum') + +learning_rate = 1e-6 +for t in range(2000): + + # Forward pass: compute predicted y by passing x to the model. Module objects + # override the __call__ operator so you can call them like functions. When + # doing so you pass a Tensor of input data to the Module and it produces + # a Tensor of output data. + y_pred = model(xx) + + # Compute and print loss. We pass Tensors containing the predicted and true + # values of y, and the loss function returns a Tensor containing the + # loss. + loss = loss_fn(y_pred, y) + if t % 100 == 99: + print(t, loss.item()) + + # Zero the gradients before running the backward pass. + model.zero_grad() + + # Backward pass: compute gradient of the loss with respect to all the learnable + # parameters of the model. Internally, the parameters of each Module are stored + # in Tensors with requires_grad=True, so this call will compute gradients for + # all learnable parameters in the model. 
+ loss.backward() + + # Update the weights using gradient descent. Each parameter is a Tensor, so + # we can access its gradients like we did before. + with torch.no_grad(): + for param in model.parameters(): + param -= learning_rate * param.grad + +# You can access the first layer of `model` like accessing the first item of a list +linear_layer = model[0] + +# For linear layer, its parameters are stored as `weight` and `bias`. +print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3') diff --git a/beginner_source/examples_nn/polynomial_optim.py b/beginner_source/examples_nn/polynomial_optim.py new file mode 100755 index 000000000..434fb6624 --- /dev/null +++ b/beginner_source/examples_nn/polynomial_optim.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- +""" +PyTorch: optim +-------------- + +A third order polynomial, trained to predict :math:`y=\sin(x)` from :math:`-\pi` +to :math:`pi` by minimizing squared Euclidean distance. + +This implementation uses the nn package from PyTorch to build the network. + +Rather than manually updating the weights of the model as we have been doing, +we use the optim package to define an Optimizer that will update the weights +for us. The optim package defines many optimization algorithms that are commonly +used for deep learning, including SGD+momentum, RMSProp, Adam, etc. +""" +import torch +import math + + +# Create Tensors to hold input and outputs. +x = torch.linspace(-math.pi, math.pi, 2000) +y = torch.sin(x) + +# Prepare the input tensor (x, x^2, x^3). +p = torch.tensor([1, 2, 3]) +xx = x.unsqueeze(-1).pow(p) + +# Use the nn package to define our model and loss function. +model = torch.nn.Sequential( + torch.nn.Linear(3, 1), + torch.nn.Flatten(0, 1) +) +loss_fn = torch.nn.MSELoss(reduction='sum') + +# Use the optim package to define an Optimizer that will update the weights of +# the model for us. Here we will use RMSprop; the optim package contains many other +# optimization algorithms. The first argument to the RMSprop constructor tells the +# optimizer which Tensors it should update. +learning_rate = 1e-3 +optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate) +for t in range(2000): + # Forward pass: compute predicted y by passing x to the model. + y_pred = model(xx) + + # Compute and print loss. + loss = loss_fn(y_pred, y) + if t % 100 == 99: + print(t, loss.item()) + + # Before the backward pass, use the optimizer object to zero all of the + # gradients for the variables it will update (which are the learnable + # weights of the model). This is because by default, gradients are + # accumulated in buffers( i.e, not overwritten) whenever .backward() + # is called. Checkout docs of torch.autograd.backward for more details. 
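    # A small illustration of that accumulation (a standalone sketch, not part of
    # this training loop): calling ``backward()`` twice without zeroing in between
    # simply adds the two gradients together.
    #
    # .. code-block:: python
    #
    #     w = torch.zeros((), requires_grad=True)
    #     (3 * w).backward()
    #     (3 * w).backward()
    #     print(w.grad)  # tensor(6.) -- the two gradients of 3 accumulated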
+ optimizer.zero_grad() + + # Backward pass: compute gradient of the loss with respect to model + # parameters + loss.backward() + + # Calling the step function on an Optimizer makes an update to its + # parameters + optimizer.step() + + +linear_layer = model[0] +print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3') diff --git a/beginner_source/examples_nn/two_layer_net_module.py b/beginner_source/examples_nn/two_layer_net_module.py deleted file mode 100755 index 2b26da13a..000000000 --- a/beginner_source/examples_nn/two_layer_net_module.py +++ /dev/null @@ -1,63 +0,0 @@ -# -*- coding: utf-8 -*- -""" -PyTorch: 사용자 정의 nn Module -------------------------------- - -하나의 은닉층(hidden layer)과 편향(bias)이 없는 완전히 연결된 ReLU 신경망을, -유클리드 거리(Euclidean distance) 제곱을 최소화하는 식으로 x로부터 y를 예측하도록 -학습하겠습니다. - -이번에는 사용자 정의 Module의 서브클래스로 모델을 정의합니다. 기존 Module의 간단한 -구성보다 더 복잡한 모델을 원한다면, 이 방법으로 모델을 정의하면 됩니다. -""" -import torch - - -class TwoLayerNet(torch.nn.Module): - def __init__(self, D_in, H, D_out): - """ - 생성자에서 2개의 nn.Linear 모듈을 생성하고, 멤버 변수로 지정합니다. - """ - super(TwoLayerNet, self).__init__() - self.linear1 = torch.nn.Linear(D_in, H) - self.linear2 = torch.nn.Linear(H, D_out) - - def forward(self, x): - """ - 순전파 함수에서는 입력 데이터의 Tensor를 받고 출력 데이터의 Tensor를 - 반환해야 합니다. Tensor 상의 임의의 연산자뿐만 아니라 생성자에서 정의한 - Module도 사용할 수 있습니다. - """ - h_relu = self.linear1(x).clamp(min=0) - y_pred = self.linear2(h_relu) - return y_pred - - -# N은 배치 크기이며, D_in은 입력의 차원입니다; -# H는 은닉층의 차원이며, D_out은 출력 차원입니다. -N, D_in, H, D_out = 64, 1000, 100, 10 - -# 입력과 출력을 저장하기 위해 무작위 값을 갖는 Tensor를 생성합니다. -x = torch.randn(N, D_in) -y = torch.randn(N, D_out) - -# 앞에서 정의한 클래스를 생성하여 모델을 구성합니다. -model = TwoLayerNet(D_in, H, D_out) - -# 손실 함수와 Optimizer를 만듭니다. SGD 생성자에 model.parameters()를 호출하면 -# 모델의 멤버인 2개의 nn.Linear 모듈의 학습 가능한 매개변수들이 포함됩니다. -criterion = torch.nn.MSELoss(reduction='sum') -optimizer = torch.optim.SGD(model.parameters(), lr=1e-4) -for t in range(500): - # 순전파 단계: 모델에 x를 전달하여 예상되는 y 값을 계산합니다. - y_pred = model(x) - - # 손실을 계산하고 출력합니다. - loss = criterion(y_pred, y) - if t % 100 == 99: - print(t, loss.item()) - - # 변화도를 0으로 만들고, 역전파 단계를 수행하고, 가중치를 갱신합니다. - optimizer.zero_grad() - loss.backward() - optimizer.step() diff --git a/beginner_source/examples_nn/two_layer_net_nn.py b/beginner_source/examples_nn/two_layer_net_nn.py deleted file mode 100755 index 46f514ed0..000000000 --- a/beginner_source/examples_nn/two_layer_net_nn.py +++ /dev/null @@ -1,68 +0,0 @@ -# -*- coding: utf-8 -*- -""" -PyTorch: nn ------------ - -하나의 은닉층(hidden layer)과 편향(bias)이 없는 완전히 연결된 ReLU 신경망을, -유클리드 거리(Euclidean distance) 제곱을 최소화하는 식으로 x로부터 y를 예측하도록 -학습하겠습니다. - -이번에는 PyTorch의 nn 패키지를 사용하여 신경망을 구현하겠습니다. -PyTorch autograd는 연산 그래프를 정의하고 변화도를 계산하는 것을 손쉽게 만들어주지만, -autograd 그 자체만으로는 복잡한 신경망을 정의하기에는 너무 저수준(low-level)일 수 -있습니다; 이것이 nn 패키지가 필요한 이유입니다. nn 패키지는 Module의 집합을 -정의하는데, 이는 입력으로부터 출력을 생성하고 학습 가능한 가중치를 갖는 -신경망 계층(neural network layer)이라고 생각할 수 있습니다. -""" -import torch - -# N은 배치 크기이며, D_in은 입력의 차원입니다; -# H는 은닉층의 차원이며, D_out은 출력 차원입니다. -N, D_in, H, D_out = 64, 1000, 100, 10 - -# 입력과 출력을 저장하기 위해 무작위 값을 갖는 Tensor를 생성합니다. -x = torch.randn(N, D_in) -y = torch.randn(N, D_out) - -# nn 패키지를 사용하여 모델을 순차적 계층(sequence of layers)으로 정의합니다. -# nn.Sequential은 다른 Module들을 포함하는 Module로, 그 Module들을 순차적으로 -# 적용하여 출력을 생성합니다. 각각의 Linear Module은 선형 함수를 사용하여 -# 입력으로부터 출력을 계산하고, 내부 Tensor에 가중치와 편향을 저장합니다. 
-model = torch.nn.Sequential( - torch.nn.Linear(D_in, H), - torch.nn.ReLU(), - torch.nn.Linear(H, D_out), -) - -# 또한 nn 패키지에는 널리 사용하는 손실 함수들에 대한 정의도 포함하고 있습니다; -# 여기에서는 평균 제곱 오차(MSE; Mean Squared Error)를 손실 함수로 사용하겠습니다. -loss_fn = torch.nn.MSELoss(reduction='sum') - -learning_rate = 1e-4 -for t in range(500): - # 순전파 단계: 모델에 x를 전달하여 예상되는 y 값을 계산합니다. Module 객체는 - # __call__ 연산자를 덮어써(override) 함수처럼 호출할 수 있게 합니다. - # 이렇게 함으로써 입력 데이터의 Tensor를 Module에 전달하여 출력 데이터의 - # Tensor를 생성합니다. - y_pred = model(x) - - # 손실을 계산하고 출력합니다. 예측한 y와 정답인 y를 갖는 Tensor들을 전달하고, - # 손실 함수는 손실 값을 갖는 Tensor를 반환합니다. - loss = loss_fn(y_pred, y) - if t % 100 == 99: - print(t, loss.item()) - - # 역전파 단계를 실행하기 전에 변화도를 0으로 만듭니다. - model.zero_grad() - - # 역전파 단계: 모델의 학습 가능한 모든 매개변수에 대해 손실의 변화도를 - # 계산합니다. 내부적으로 각 Module의 매개변수는 requires_grad=True 일 때 - # Tensor 내에 저장되므로, 이 호출은 모든 모델의 모든 학습 가능한 매개변수의 - # 변화도를 계산하게 됩니다. - loss.backward() - - # 경사하강법(gradient descent)를 사용하여 가중치를 갱신합니다. 각 매개변수는 - # Tensor이므로 이전에 했던 것과 같이 변화도에 접근할 수 있습니다. - with torch.no_grad(): - for param in model.parameters(): - param -= learning_rate * param.grad diff --git a/beginner_source/examples_nn/two_layer_net_optim.py b/beginner_source/examples_nn/two_layer_net_optim.py deleted file mode 100755 index 6f0bd5ba1..000000000 --- a/beginner_source/examples_nn/two_layer_net_optim.py +++ /dev/null @@ -1,61 +0,0 @@ -# -*- coding: utf-8 -*- -""" -PyTorch: optim --------------- - -하나의 은닉층(hidden layer)과 편향(bias)이 없는 완전히 연결된 ReLU 신경망을, -유클리드 거리(Euclidean distance) 제곱을 최소화하는 식으로 x로부터 y를 예측하도록 -학습하겠습니다. - -이번에는 PyTorch의 nn 패키지를 사용하여 신경망을 구현해보겠습니다. - -지금까지 해왔던 것처럼 직접 모델의 가중치를 갱신하는 대신, optim 패키지를 사용하여 -가중치를 갱신할 Optimizer를 정의합니다. optim 패키지는 일반적으로 딥러닝에 사용하는 -SGD+momentum, RMSProp, Adam 등과 같은 다양한 최적화(Optimization) 알고리즘을 -정의합니다. -""" -import torch - -# N은 배치 크기이며, D_in은 입력의 차원입니다; -# H는 은닉층의 차원이며, D_out은 출력 차원입니다. -N, D_in, H, D_out = 64, 1000, 100, 10 - -# 입력과 출력을 저장하기 위해 무작위 값을 갖는 Tensor를 생성합니다. -x = torch.randn(N, D_in) -y = torch.randn(N, D_out) - -# nn 패키지를 사용하여 모델과 손실 함수를 정의합니다. -model = torch.nn.Sequential( - torch.nn.Linear(D_in, H), - torch.nn.ReLU(), - torch.nn.Linear(H, D_out), -) -loss_fn = torch.nn.MSELoss(reduction='sum') - -# optim 패키지를 사용하여 모델의 가중치를 갱신할 Optimizer를 정의합니다. -# 여기서는 Adam을 사용하겠습니다; optim 패키지는 다른 다양한 최적화 알고리즘을 -# 포함하고 있습니다. Adam 생성자의 첫번째 인자는 어떤 Tensor가 갱신되어야 하는지 -# 알려줍니다. -learning_rate = 1e-4 -optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) -for t in range(500): - # 순전파 단계: 모델에 x를 전달하여 예상되는 y 값을 계산합니다. - y_pred = model(x) - - # 손실을 계산하고 출력합니다. - loss = loss_fn(y_pred, y) - if t % 100 == 99: - print(t, loss.item()) - - # 역전파 단계 전에, Optimizer 객체를 사용하여 (모델의 학습 가능한 가중치인) - # 갱신할 변수들에 대한 모든 변화도를 0으로 만듭니다. 이렇게 하는 이유는 - # 기본적으로 .backward()를 호출할 때마다 변화도가 버퍼(buffer)에 (덮어쓰지 않고) - # 누적되기 때문입니다. 더 자세한 내용은 torch.autograd.backward에 대한 문서를 - # 참조하세요. - optimizer.zero_grad() - - # 역전파 단계: 모델의 매개변수에 대한 손실의 변화도를 계산합니다. - loss.backward() - - # Optimizer의 step 함수를 호출하면 매개변수가 갱신됩니다. - optimizer.step() diff --git a/beginner_source/examples_tensor/polynomial_numpy.py b/beginner_source/examples_tensor/polynomial_numpy.py new file mode 100755 index 000000000..a1a378e50 --- /dev/null +++ b/beginner_source/examples_tensor/polynomial_numpy.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- +""" +Warm-up: numpy +-------------- + +A third order polynomial, trained to predict :math:`y=\sin(x)` from :math:`-\pi` +to :math:`pi` by minimizing squared Euclidean distance. + +This implementation uses numpy to manually compute the forward pass, loss, and +backward pass. 
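Concretely, writing :math:`\hat{y} = a + b x + c x^2 + d x^3` for the prediction and
:math:`L = \sum (\hat{y} - y)^2` for the loss, the backward pass below uses the
hand-derived gradients :math:`\partial L/\partial a = \sum 2(\hat{y} - y)`,
:math:`\partial L/\partial b = \sum 2(\hat{y} - y)\,x`,
:math:`\partial L/\partial c = \sum 2(\hat{y} - y)\,x^2` and
:math:`\partial L/\partial d = \sum 2(\hat{y} - y)\,x^3`.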
+ +A numpy array is a generic n-dimensional array; it does not know anything about +deep learning or gradients or computational graphs, and is just a way to perform +generic numeric computations. +""" +import numpy as np +import math + +# Create random input and output data +x = np.linspace(-math.pi, math.pi, 2000) +y = np.sin(x) + +# Randomly initialize weights +a = np.random.randn() +b = np.random.randn() +c = np.random.randn() +d = np.random.randn() + +learning_rate = 1e-6 +for t in range(2000): + # Forward pass: compute predicted y + # y = a + b x + c x^2 + d x^3 + y_pred = a + b * x + c * x ** 2 + d * x ** 3 + + # Compute and print loss + loss = np.square(y_pred - y).sum() + if t % 100 == 99: + print(t, loss) + + # Backprop to compute gradients of a, b, c, d with respect to loss + grad_y_pred = 2.0 * (y_pred - y) + grad_a = grad_y_pred.sum() + grad_b = (grad_y_pred * x).sum() + grad_c = (grad_y_pred * x ** 2).sum() + grad_d = (grad_y_pred * x ** 3).sum() + + # Update weights + a -= learning_rate * grad_a + b -= learning_rate * grad_b + c -= learning_rate * grad_c + d -= learning_rate * grad_d + +print(f'Result: y = {a} + {b} x + {c} x^2 + {d} x^3') diff --git a/beginner_source/examples_tensor/polynomial_tensor.py b/beginner_source/examples_tensor/polynomial_tensor.py new file mode 100755 index 000000000..1e35b0f24 --- /dev/null +++ b/beginner_source/examples_tensor/polynomial_tensor.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- +""" +PyTorch: Tensors +---------------- + +A third order polynomial, trained to predict :math:`y=\sin(x)` from :math:`-\pi` +to :math:`pi` by minimizing squared Euclidean distance. + +This implementation uses PyTorch tensors to manually compute the forward pass, +loss, and backward pass. + +A PyTorch Tensor is basically the same as a numpy array: it does not know +anything about deep learning or computational graphs or gradients, and is just +a generic n-dimensional array to be used for arbitrary numeric computation. + +The biggest difference between a numpy array and a PyTorch Tensor is that +a PyTorch Tensor can run on either CPU or GPU. To run operations on the GPU, +just cast the Tensor to a cuda datatype. 
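For example (a minimal illustration, assuming a CUDA-capable GPU is available;
the rest of this script stays on the CPU unless the ``device`` line below is changed):

.. code-block:: python

    t = torch.randn(3)                # created on the CPU
    t = t.to(torch.device("cuda:0"))  # moved to the first GPU; t.cuda() also works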
+""" + +import torch +import math + + +dtype = torch.float +device = torch.device("cpu") +# device = torch.device("cuda:0") # Uncomment this to run on GPU + +# Create random input and output data +x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype) +y = torch.sin(x) + +# Randomly initialize weights +a = torch.randn((), device=device, dtype=dtype) +b = torch.randn((), device=device, dtype=dtype) +c = torch.randn((), device=device, dtype=dtype) +d = torch.randn((), device=device, dtype=dtype) + +learning_rate = 1e-6 +for t in range(2000): + # Forward pass: compute predicted y + y_pred = a + b * x + c * x ** 2 + d * x ** 3 + + # Compute and print loss + loss = (y_pred - y).pow(2).sum().item() + if t % 100 == 99: + print(t, loss) + + # Backprop to compute gradients of a, b, c, d with respect to loss + grad_y_pred = 2.0 * (y_pred - y) + grad_a = grad_y_pred.sum() + grad_b = (grad_y_pred * x).sum() + grad_c = (grad_y_pred * x ** 2).sum() + grad_d = (grad_y_pred * x ** 3).sum() + + # Update weights using gradient descent + a -= learning_rate * grad_a + b -= learning_rate * grad_b + c -= learning_rate * grad_c + d -= learning_rate * grad_d + + +print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3') diff --git a/beginner_source/examples_tensor/two_layer_net_numpy.py b/beginner_source/examples_tensor/two_layer_net_numpy.py deleted file mode 100755 index aa09745b0..000000000 --- a/beginner_source/examples_tensor/two_layer_net_numpy.py +++ /dev/null @@ -1,51 +0,0 @@ -# -*- coding: utf-8 -*- -""" -준비 운동: NumPy ------------------ - -하나의 은닉층(hidden layer)과 편향(bias)이 없는 완전히 연결된 ReLU 신경망을, -유클리드 오차(Euclidean error)를 최소화하는 식으로 x로부터 y를 예측하도록 -학습하겠습니다. - -NumPy를 사용하여 수동으로 순전파, 손실(loss), 그리고 역전파 연산을 하는 것을 -구현해보겠습니다. - -NumPy 배열은 일반적은 N차원 배열입니다; 딥러닝이나 변화도(gradient), 연산 -그래프(computational graph)는 알지 못하며, 일반적인 수치 연산을 수행합니다. -""" -import numpy as np - -# N은 배치 크기이며, D_in은 입력의 차원입니다; -# H는 은닉층의 차원이며, D_out은 출력 차원입니다. -N, D_in, H, D_out = 64, 1000, 100, 10 - -# 무작위의 입력과 출력 데이터를 생성합니다. -x = np.random.randn(N, D_in) -y = np.random.randn(N, D_out) - -# 무작위로 가중치를 초기화합니다. -w1 = np.random.randn(D_in, H) -w2 = np.random.randn(H, D_out) - -learning_rate = 1e-6 -for t in range(500): - # 순전파 단계: 예측값 y를 계산합니다. - h = x.dot(w1) - h_relu = np.maximum(h, 0) - y_pred = h_relu.dot(w2) - - # 손실(loss)을 계산하고 출력합니다. - loss = np.square(y_pred - y).sum() - print(t, loss) - - # 손실에 따른 w1, w2의 변화도를 계산하고 역전파합니다. - grad_y_pred = 2.0 * (y_pred - y) - grad_w2 = h_relu.T.dot(grad_y_pred) - grad_h_relu = grad_y_pred.dot(w2.T) - grad_h = grad_h_relu.copy() - grad_h[h < 0] = 0 - grad_w1 = x.T.dot(grad_h) - - # 가중치를 갱신합니다. - w1 -= learning_rate * grad_w1 - w2 -= learning_rate * grad_w2 diff --git a/beginner_source/examples_tensor/two_layer_net_tensor.py b/beginner_source/examples_tensor/two_layer_net_tensor.py deleted file mode 100755 index 3ee326c68..000000000 --- a/beginner_source/examples_tensor/two_layer_net_tensor.py +++ /dev/null @@ -1,63 +0,0 @@ -# -*- coding: utf-8 -*- -""" -PyTorch: Tensors ----------------- - -하나의 은닉층(hidden layer)과 편향(bias)이 없는 완전히 연결된 ReLU 신경망을, -유클리드 거리(Euclidean distance) 제곱을 최소화하는 식으로 x로부터 y를 예측하도록 -학습하겠습니다. - -여기서는 PyTorch tensor를 사용하여 순전파, 손실(loss), 그리고 역전파 연산까지 -직접 구현해보겠습니다. - -PyTorch Tensor는 기본적으로 NumPy 배열과 동일합니다; 딥러닝이나 연산 그래프 -(computational graph), 변화도(gradient)는 알지 못하며 임의의 숫자 계산에 사용하는 -일반적인 N차원 배열입니다. - -NumPy 배열과 PyTorch Tensor의 가장 큰 차이점은 PyTorch Tensor는 CPU나 GPU 어디서든 -실행이 가능하다는 점입니다. GPU에서 연산을 하기 위해서는, Tensor를 CUDA 자료형으로 -변환(cast)해주기만 하면 됩니다. 
-""" - -import torch - - -dtype = torch.float -device = torch.device("cpu") -# device = torch.device("cuda:0") # GPU에서 실행하려면 이 주석을 제거하세요. - -# N은 배치 크기이며, D_in은 입력의 차원입니다; -# H는 은닉층의 차원이며, D_out은 출력 차원입니다. -N, D_in, H, D_out = 64, 1000, 100, 10 - -# 무작위의 입력과 출력 데이터를 생성합니다. -x = torch.randn(N, D_in, device=device, dtype=dtype) -y = torch.randn(N, D_out, device=device, dtype=dtype) - -# 무작위로 가중치를 초기화합니다. -w1 = torch.randn(D_in, H, device=device, dtype=dtype) -w2 = torch.randn(H, D_out, device=device, dtype=dtype) - -learning_rate = 1e-6 -for t in range(500): - # 순전파 단계: 예측값 y를 계산합니다. - h = x.mm(w1) - h_relu = h.clamp(min=0) - y_pred = h_relu.mm(w2) - - # 손실(loss)을 계산하고 출력합니다. - loss = (y_pred - y).pow(2).sum().item() - if t % 100 == 99: - print(t, loss) - - # 손실에 따른 w1, w2의 변화도를 계산하고 역전파합니다. - grad_y_pred = 2.0 * (y_pred - y) - grad_w2 = h_relu.t().mm(grad_y_pred) - grad_h_relu = grad_y_pred.mm(w2.t()) - grad_h = grad_h_relu.clone() - grad_h[h < 0] = 0 - grad_w1 = x.t().mm(grad_h) - - # 경사하강법(gradient descent)를 사용하여 가중치를 갱신합니다. - w1 -= learning_rate * grad_w1 - w2 -= learning_rate * grad_w2 diff --git a/beginner_source/fgsm_tutorial.py b/beginner_source/fgsm_tutorial.py index ea0b4c651..6625e4f0f 100644 --- a/beginner_source/fgsm_tutorial.py +++ b/beginner_source/fgsm_tutorial.py @@ -77,6 +77,13 @@ import numpy as np import matplotlib.pyplot as plt +# NOTE: 아래는 MNIST 데이터셋을 내려받을 때 "User-agent" 관련한 제한을 푸는 코드입니다. +# 더 자세한 내용은 https://github.com/pytorch/vision/issues/3497 을 참고해주세요. +from six.moves import urllib +opener = urllib.request.build_opener() +opener.addheaders = [('User-agent', 'Mozilla/5.0')] +urllib.request.install_opener(opener) + ###################################################################### # 구현 @@ -356,4 +363,4 @@ def test( model, device, test_loader, epsilon ): # 그러나 적대적 머신 러닝 분야에 대해서 많은 것을 알기 위한 최고의 방법은 많이 시도해보는 것입니다. # NIPS 2017 경쟁에서 소개된 다양한 공격 방법을 직접 구현해 보고, FGSM 과 어떤 점이 다른지 연구해 보세요. # 그리고 나서 직접 만든 공격으로부터 모델을 방어해 보세요. -# +# \ No newline at end of file diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py new file mode 100644 index 000000000..11524618c --- /dev/null +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -0,0 +1,462 @@ +# -*- coding: utf-8 -*- +""" +Hyperparameter tuning with Ray Tune +=================================== + +Hyperparameter tuning can make the difference between an average model and a highly +accurate one. Often simple things like choosing a different learning rate or changing +a network layer size can have a dramatic impact on your model performance. + +Fortunately, there are tools that help with finding the best combination of parameters. +`Ray Tune `_ is an industry standard tool for +distributed hyperparameter tuning. Ray Tune includes the latest hyperparameter search +algorithms, integrates with TensorBoard and other analysis libraries, and natively +supports distributed training through `Ray's distributed machine learning engine +`_. + +In this tutorial, we will show you how to integrate Ray Tune into your PyTorch +training workflow. We will extend `this tutorial from the PyTorch documentation +`_ for training +a CIFAR10 image classifier. + +As you will see, we only need to add some slight modifications. In particular, we +need to + +1. wrap data loading and training in functions, +2. make some network parameters configurable, +3. add checkpointing (optional), +4. 
and define the search space for the model tuning + +| + +To run this tutorial, please make sure the following packages are +installed: + +- ``ray[tune]``: Distributed hyperparameter tuning library +- ``torchvision``: For the data transformers + +Setup / Imports +--------------- +Let's start with the imports: +""" +from functools import partial +import numpy as np +import os +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torch.utils.data import random_split +import torchvision +import torchvision.transforms as transforms +from ray import tune +from ray.tune import CLIReporter +from ray.tune.schedulers import ASHAScheduler + +###################################################################### +# Most of the imports are needed for building the PyTorch model. Only the last three +# imports are for Ray Tune. +# +# Data loaders +# ------------ +# We wrap the data loaders in their own function and pass a global data directory. +# This way we can share a data directory between different trials. + + +def load_data(data_dir="./data"): + transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) + ]) + + trainset = torchvision.datasets.CIFAR10( + root=data_dir, train=True, download=True, transform=transform) + + testset = torchvision.datasets.CIFAR10( + root=data_dir, train=False, download=True, transform=transform) + + return trainset, testset + +###################################################################### +# Configurable neural network +# --------------------------- +# We can only tune those parameters that are configurable. In this example, we can specify +# the layer sizes of the fully connected layers: + + +class Net(nn.Module): + def __init__(self, l1=120, l2=84): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, l1) + self.fc2 = nn.Linear(l1, l2) + self.fc3 = nn.Linear(l2, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + +###################################################################### +# The train function +# ------------------ +# Now it gets interesting, because we introduce some changes to the example `from the PyTorch +# documentation `_. +# +# We wrap the training script in a function ``train_cifar(config, checkpoint_dir=None, data_dir=None)``. +# As you can guess, the ``config`` parameter will receive the hyperparameters we would like to +# train with. The ``checkpoint_dir`` parameter is used to restore checkpoints. The ``data_dir`` specifies +# the directory where we load and store the data, so multiple runs can share the same data source. +# +# .. code-block:: python +# +# net = Net(config["l1"], config["l2"]) +# +# if checkpoint_dir: +# model_state, optimizer_state = torch.load( +# os.path.join(checkpoint_dir, "checkpoint")) +# net.load_state_dict(model_state) +# optimizer.load_state_dict(optimizer_state) +# +# The learning rate of the optimizer is made configurable, too: +# +# .. code-block:: python +# +# optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) +# +# We also split the training data into a training and validation subset. We thus train on +# 80% of the data and calculate the validation loss on the remaining 20%. 
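# For example, the split itself is a single ``random_split`` call (the same code
# appears in the full ``train_cifar`` function further below):
#
# .. code-block:: python
#
#     test_abs = int(len(trainset) * 0.8)
#     train_subset, val_subset = random_split(
#         trainset, [test_abs, len(trainset) - test_abs])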
The batch sizes +# with which we iterate through the training and test sets are configurable as well. +# +# Adding (multi) GPU support with DataParallel +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Image classification benefits largely from GPUs. Luckily, we can continue to use +# PyTorch's abstractions in Ray Tune. Thus, we can wrap our model in ``nn.DataParallel`` +# to support data parallel training on multiple GPUs: +# +# .. code-block:: python +# +# device = "cpu" +# if torch.cuda.is_available(): +# device = "cuda:0" +# if torch.cuda.device_count() > 1: +# net = nn.DataParallel(net) +# net.to(device) +# +# By using a ``device`` variable we make sure that training also works when we have +# no GPUs available. PyTorch requires us to send our data to the GPU memory explicitly, +# like this: +# +# .. code-block:: python +# +# for i, data in enumerate(trainloader, 0): +# inputs, labels = data +# inputs, labels = inputs.to(device), labels.to(device) +# +# The code now supports training on CPUs, on a single GPU, and on multiple GPUs. Notably, Ray +# also supports `fractional GPUs `_ +# so we can share GPUs among trials, as long as the model still fits on the GPU memory. We'll come back +# to that later. +# +# Communicating with Ray Tune +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# The most interesting part is the communication with Ray Tune: +# +# .. code-block:: python +# +# with tune.checkpoint_dir(epoch) as checkpoint_dir: +# path = os.path.join(checkpoint_dir, "checkpoint") +# torch.save((net.state_dict(), optimizer.state_dict()), path) +# +# tune.report(loss=(val_loss / val_steps), accuracy=correct / total) +# +# Here we first save a checkpoint and then report some metrics back to Ray Tune. Specifically, +# we send the validation loss and accuracy back to Ray Tune. Ray Tune can then use these metrics +# to decide which hyperparameter configuration lead to the best results. These metrics +# can also be used to stop bad performing trials early in order to avoid wasting +# resources on those trials. +# +# The checkpoint saving is optional, however, it is necessary if we wanted to use advanced +# schedulers like +# `Population Based Training `_. +# Also, by saving the checkpoint we can later load the trained models and validate them +# on a test set. 
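# For instance, a checkpoint written this way can later be read back with plain
# ``torch.load`` (a short sketch; ``best_checkpoint_dir`` would come from the best
# trial returned by ``tune.run``, as the main function further below shows):
#
# .. code-block:: python
#
#     model_state, optimizer_state = torch.load(
#         os.path.join(best_checkpoint_dir, "checkpoint"))
#     net.load_state_dict(model_state)
#     optimizer.load_state_dict(optimizer_state)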
+# +# Full training function +# ~~~~~~~~~~~~~~~~~~~~~~ +# +# The full code example looks like this: + + +def train_cifar(config, checkpoint_dir=None, data_dir=None): + net = Net(config["l1"], config["l2"]) + + device = "cpu" + if torch.cuda.is_available(): + device = "cuda:0" + if torch.cuda.device_count() > 1: + net = nn.DataParallel(net) + net.to(device) + + criterion = nn.CrossEntropyLoss() + optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) + + if checkpoint_dir: + model_state, optimizer_state = torch.load( + os.path.join(checkpoint_dir, "checkpoint")) + net.load_state_dict(model_state) + optimizer.load_state_dict(optimizer_state) + + trainset, testset = load_data(data_dir) + + test_abs = int(len(trainset) * 0.8) + train_subset, val_subset = random_split( + trainset, [test_abs, len(trainset) - test_abs]) + + trainloader = torch.utils.data.DataLoader( + train_subset, + batch_size=int(config["batch_size"]), + shuffle=True, + num_workers=8) + valloader = torch.utils.data.DataLoader( + val_subset, + batch_size=int(config["batch_size"]), + shuffle=True, + num_workers=8) + + for epoch in range(10): # loop over the dataset multiple times + running_loss = 0.0 + epoch_steps = 0 + for i, data in enumerate(trainloader, 0): + # get the inputs; data is a list of [inputs, labels] + inputs, labels = data + inputs, labels = inputs.to(device), labels.to(device) + + # zero the parameter gradients + optimizer.zero_grad() + + # forward + backward + optimize + outputs = net(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + + # print statistics + running_loss += loss.item() + epoch_steps += 1 + if i % 2000 == 1999: # print every 2000 mini-batches + print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, + running_loss / epoch_steps)) + running_loss = 0.0 + + # Validation loss + val_loss = 0.0 + val_steps = 0 + total = 0 + correct = 0 + for i, data in enumerate(valloader, 0): + with torch.no_grad(): + inputs, labels = data + inputs, labels = inputs.to(device), labels.to(device) + + outputs = net(inputs) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + + loss = criterion(outputs, labels) + val_loss += loss.cpu().numpy() + val_steps += 1 + + with tune.checkpoint_dir(epoch) as checkpoint_dir: + path = os.path.join(checkpoint_dir, "checkpoint") + torch.save((net.state_dict(), optimizer.state_dict()), path) + + tune.report(loss=(val_loss / val_steps), accuracy=correct / total) + print("Finished Training") + +###################################################################### +# As you can see, most of the code is adapted directly from the original example. +# +# Test set accuracy +# ----------------- +# Commonly the performance of a machine learning model is tested on a hold-out test +# set with data that has not been used for training the model. 
We also wrap this in a +# function: + + +def test_accuracy(net, device="cpu"): + trainset, testset = load_data() + + testloader = torch.utils.data.DataLoader( + testset, batch_size=4, shuffle=False, num_workers=2) + + correct = 0 + total = 0 + with torch.no_grad(): + for data in testloader: + images, labels = data + images, labels = images.to(device), labels.to(device) + outputs = net(images) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + + return correct / total + +###################################################################### +# The function also expects a ``device`` parameter, so we can do the +# test set validation on a GPU. +# +# Configuring the search space +# ---------------------------- +# Lastly, we need to define Ray Tune's search space. Here is an example: +# +# .. code-block:: python +# +# config = { +# "l1": tune.sample_from(lambda _: 2**np.random.randint(2, 9)), +# "l2": tune.sample_from(lambda _: 2**np.random.randint(2, 9)), +# "lr": tune.loguniform(1e-4, 1e-1), +# "batch_size": tune.choice([2, 4, 8, 16]) +# } +# +# The ``tune.sample_from()`` function makes it possible to define your own sample +# methods to obtain hyperparameters. In this example, the ``l1`` and ``l2`` parameters +# should be powers of 2 between 4 and 256, so either 4, 8, 16, 32, 64, 128, or 256. +# The ``lr`` (learning rate) should be uniformly sampled between 0.0001 and 0.1. Lastly, +# the batch size is a choice between 2, 4, 8, and 16. +# +# At each trial, Ray Tune will now randomly sample a combination of parameters from these +# search spaces. It will then train a number of models in parallel and find the best +# performing one among these. We also use the ``ASHAScheduler`` which will terminate bad +# performing trials early. +# +# We wrap the ``train_cifar`` function with ``functools.partial`` to set the constant +# ``data_dir`` parameter. We can also tell Ray Tune what resources should be +# available for each trial: +# +# .. code-block:: python +# +# gpus_per_trial = 2 +# # ... +# result = tune.run( +# partial(train_cifar, data_dir=data_dir), +# resources_per_trial={"cpu": 8, "gpu": gpus_per_trial}, +# config=config, +# num_samples=num_samples, +# scheduler=scheduler, +# progress_reporter=reporter, +# checkpoint_at_end=True) +# +# You can specify the number of CPUs, which are then available e.g. +# to increase the ``num_workers`` of the PyTorch ``DataLoader`` instances. The selected +# number of GPUs are made visible to PyTorch in each trial. Trials do not have access to +# GPUs that haven't been requested for them - so you don't have to care about two trials +# using the same set of resources. +# +# Here we can also specify fractional GPUs, so something like ``gpus_per_trial=0.5`` is +# completely valid. The trials will then share GPUs among each other. +# You just have to make sure that the models still fit in the GPU memory. +# +# After training the models, we will find the best performing one and load the trained +# network from the checkpoint file. We then obtain the test set accuracy and report +# everything by printing. 
+# +# The full main function looks like this: + + +def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2): + data_dir = os.path.abspath("./data") + load_data(data_dir) + config = { + "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)), + "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)), + "lr": tune.loguniform(1e-4, 1e-1), + "batch_size": tune.choice([2, 4, 8, 16]) + } + scheduler = ASHAScheduler( + metric="loss", + mode="min", + max_t=max_num_epochs, + grace_period=1, + reduction_factor=2) + reporter = CLIReporter( + # parameter_columns=["l1", "l2", "lr", "batch_size"], + metric_columns=["loss", "accuracy", "training_iteration"]) + result = tune.run( + partial(train_cifar, data_dir=data_dir), + resources_per_trial={"cpu": 2, "gpu": gpus_per_trial}, + config=config, + num_samples=num_samples, + scheduler=scheduler, + progress_reporter=reporter) + + best_trial = result.get_best_trial("loss", "min", "last") + print("Best trial config: {}".format(best_trial.config)) + print("Best trial final validation loss: {}".format( + best_trial.last_result["loss"])) + print("Best trial final validation accuracy: {}".format( + best_trial.last_result["accuracy"])) + + best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"]) + device = "cpu" + if torch.cuda.is_available(): + device = "cuda:0" + if gpus_per_trial > 1: + best_trained_model = nn.DataParallel(best_trained_model) + best_trained_model.to(device) + + best_checkpoint_dir = best_trial.checkpoint.value + model_state, optimizer_state = torch.load(os.path.join( + best_checkpoint_dir, "checkpoint")) + best_trained_model.load_state_dict(model_state) + + test_acc = test_accuracy(best_trained_model, device) + print("Best trial test set accuracy: {}".format(test_acc)) + + +if __name__ == "__main__": + # You can change the number of GPUs per trial here: + main(num_samples=10, max_num_epochs=10, gpus_per_trial=0) + + +###################################################################### +# If you run the code, an example output could look like this: +# +# .. code-block:: +# +# Number of trials: 10 (10 TERMINATED) +# +-----+------+------+-------------+--------------+---------+------------+--------------------+ +# | ... | l1 | l2 | lr | batch_size | loss | accuracy | training_iteration | +# |-----+------+------+-------------+--------------+---------+------------+--------------------| +# | ... | 64 | 4 | 0.00011629 | 2 | 1.87273 | 0.244 | 2 | +# | ... | 32 | 64 | 0.000339763 | 8 | 1.23603 | 0.567 | 8 | +# | ... | 8 | 16 | 0.00276249 | 16 | 1.1815 | 0.5836 | 10 | +# | ... | 4 | 64 | 0.000648721 | 4 | 1.31131 | 0.5224 | 8 | +# | ... | 32 | 16 | 0.000340753 | 8 | 1.26454 | 0.5444 | 8 | +# | ... | 8 | 4 | 0.000699775 | 8 | 1.99594 | 0.1983 | 2 | +# | ... | 256 | 8 | 0.0839654 | 16 | 2.3119 | 0.0993 | 1 | +# | ... | 16 | 128 | 0.0758154 | 16 | 2.33575 | 0.1327 | 1 | +# | ... | 16 | 8 | 0.0763312 | 16 | 2.31129 | 0.1042 | 4 | +# | ... | 128 | 16 | 0.000124903 | 4 | 2.26917 | 0.1945 | 1 | +# +-----+------+------+-------------+--------------+---------+------------+--------------------+ +# +# +# Best trial config: {'l1': 8, 'l2': 16, 'lr': 0.00276249, 'batch_size': 16, 'data_dir': '...'} +# Best trial final validation loss: 1.181501 +# Best trial final validation accuracy: 0.5836 +# Best trial test set accuracy: 0.5806 +# +# Most trials have been stopped early in order to avoid wasting resources. +# The best performing trial achieved a validation accuracy of about 58%, which could +# be confirmed on the test set. 
+# +# So that's it! You can now tune the parameters of your PyTorch models. diff --git a/beginner_source/nlp/pytorch_tutorial.py b/beginner_source/nlp/pytorch_tutorial.py index d61496d38..62249bf2d 100644 --- a/beginner_source/nlp/pytorch_tutorial.py +++ b/beginner_source/nlp/pytorch_tutorial.py @@ -14,10 +14,6 @@ # Author: Robert Guthrie import torch -import torch.autograd as autograd -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim torch.manual_seed(1) @@ -274,7 +270,7 @@ ############################################################### # You can also stop autograd from tracking history on Tensors -# with ``.requires_grad``=True by wrapping the code block in +# with ``.requires_grad=True`` by wrapping the code block in # ``with torch.no_grad():`` print(x.requires_grad) print((x ** 2).requires_grad) diff --git a/beginner_source/nlp/sequence_models_tutorial.py b/beginner_source/nlp/sequence_models_tutorial.py index 4db036195..92512e88d 100644 --- a/beginner_source/nlp/sequence_models_tutorial.py +++ b/beginner_source/nlp/sequence_models_tutorial.py @@ -21,14 +21,14 @@ part-of-speech tags, and a myriad of other things. -LSTM's in Pytorch +LSTMs in Pytorch ~~~~~~~~~~~~~~~~~ Before getting to the example, note a few things. Pytorch's LSTM expects all of its inputs to be 3D tensors. The semantics of the axes of these tensors is important. The first axis is the sequence itself, the second indexes instances in the mini-batch, and the third indexes elements of -the input. We haven't discussed mini-batching, so lets just ignore that +the input. We haven't discussed mini-batching, so let's just ignore that and assume we will always have just 1 dimension on the second axis. If we want to run the sequence model over the sentence "The cow jumped", our input should look like @@ -95,7 +95,9 @@ # In this section, we will use an LSTM to get part of speech tags. We will # not use Viterbi or Forward-Backward or anything like that, but as a # (challenging) exercise to the reader, think about how Viterbi could be -# used after you have seen what is going on. +# used after you have seen what is going on. In this example, we also refer +# to embeddings. If you are unfamiliar with embeddings, you can read up +# about them `here `__. # # The model is as follows: let our input sentence be # :math:`w_1, \dots, w_M`, where :math:`w_i \in V`, our vocab. Also, let @@ -127,16 +129,19 @@ def prepare_sequence(seq, to_ix): training_data = [ + # Tags are: DET - determiner; NN - noun; V - verb + # For example, the word "The" is a determiner ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]), ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]) ] word_to_ix = {} +# For each words-list (sentence) and tags-list in each tuple of training_data for sent, tags in training_data: for word in sent: - if word not in word_to_ix: - word_to_ix[word] = len(word_to_ix) + if word not in word_to_ix: # word has not been assigned an index yet + word_to_ix[word] = len(word_to_ix) # Assign each word with a unique index print(word_to_ix) -tag_to_ix = {"DET": 0, "NN": 1, "V": 2} +tag_to_ix = {"DET": 0, "NN": 1, "V": 2} # Assign each tag with a unique index # These will usually be more like 32 or 64 dimensional. # We will keep them small, so we can see how the weights change as we train. 
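As a quick illustration of what the indexing loop in the hunk above produces (the
values follow directly from the two training sentences):

.. code-block:: python

    print(word_to_ix)
    # {'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4,
    #  'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}
    print(tag_to_ix)
    # {'DET': 0, 'NN': 1, 'V': 2}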
diff --git a/beginner_source/profiler.py b/beginner_source/profiler.py new file mode 100644 index 000000000..450e450b0 --- /dev/null +++ b/beginner_source/profiler.py @@ -0,0 +1,320 @@ +""" +Profiling your PyTorch Module +------------ +**Author:** `Suraj Subramanian `_ + +PyTorch includes a profiler API that is useful to identify the time and +memory costs of various PyTorch operations in your code. Profiler can be +easily integrated in your code, and the results can be printed as a table +or retured in a JSON trace file. + +.. note:: + Profiler supports multithreaded models. Profiler runs in the + same thread as the operation but it will also profile child operators + that might run in another thread. Concurrently-running profilers will be + scoped to their own thread to prevent mixing of results. + +.. note:: + PyTorch 1.8 introduces the new API that will replace the older profiler API + in the future releases. Check the new API at `this page `__. + +Head on over to `this +recipe `__ +for a quicker walkthrough of Profiler API usage. + + +-------------- +""" + +import torch +import numpy as np +from torch import nn +import torch.autograd.profiler as profiler + + +###################################################################### +# Performance debugging using Profiler +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Profiler can be useful to identify performance bottlenecks in your +# models. In this example, we build a custom module that performs two +# sub-tasks: +# +# - a linear transformation on the input, and +# - use the transformation result to get indices on a mask tensor. +# +# We wrap the code for each sub-task in separate labelled context managers using +# ``profiler.record_function("label")``. In the profiler output, the +# aggregate performance metrics of all operations in the sub-task will +# show up under its corresponding label. +# +# +# Note that using Profiler incurs some overhead, and is best used only for investigating +# code. Remember to remove it if you are benchmarking runtimes. +# + +class MyModule(nn.Module): + def __init__(self, in_features: int, out_features: int, bias: bool = True): + super(MyModule, self).__init__() + self.linear = nn.Linear(in_features, out_features, bias) + + def forward(self, input, mask): + with profiler.record_function("LINEAR PASS"): + out = self.linear(input) + + with profiler.record_function("MASK INDICES"): + threshold = out.sum(axis=1).mean().item() + hi_idx = np.argwhere(mask.cpu().numpy() > threshold) + hi_idx = torch.from_numpy(hi_idx).cuda() + + return out, hi_idx + + +###################################################################### +# Profile the forward pass +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# We initialize random input and mask tensors, and the model. +# +# Before we run the profiler, we warm-up CUDA to ensure accurate +# performance benchmarking. We wrap the forward pass of our module in the +# ``profiler.profile`` context manager. The ``with_stack=True`` parameter appends the +# file and line number of the operation in the trace. +# +# .. WARNING:: +# ``with_stack=True`` incurs an additional overhead, and is better suited for investigating code. +# Remember to remove it if you are benchmarking performance. 
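#
# .. note::
#     The profiles below report CPU time only. As an aside (not shown in this
#     tutorial), the same context manager can also record CUDA kernel times via
#     ``profiler.profile(use_cuda=True, with_stack=True, profile_memory=True)``;
#     the examples that follow keep the default CPU-only timing.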
+# + +model = MyModule(500, 10).cuda() +input = torch.rand(128, 500).cuda() +mask = torch.rand((500, 500, 500), dtype=torch.double).cuda() + +# warm-up +model(input, mask) + +with profiler.profile(with_stack=True, profile_memory=True) as prof: + out, idx = model(input, mask) + + +###################################################################### +# Print profiler results +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Finally, we print the profiler results. ``profiler.key_averages`` +# aggregates the results by operator name, and optionally by input +# shapes and/or stack trace events. +# Grouping by input shapes is useful to identify which tensor shapes +# are utilized by the model. +# +# Here, we use ``group_by_stack_n=5`` which aggregates runtimes by the +# operation and its traceback (truncated to the most recent 5 events), and +# display the events in the order they are registered. The table can also +# be sorted by passing a ``sort_by`` argument (refer to the +# `docs `__ for +# valid sorting keys). +# +# .. Note:: +# When running profiler in a notebook, you might see entries like ``(13): forward`` +# instead of filenames in the stacktrace. These correspond to ``(line number): calling-function``. + +print(prof.key_averages(group_by_stack_n=5).table(sort_by='self_cpu_time_total', row_limit=5)) + +""" +(Some columns are omitted) + +------------- ------------ ------------ ------------ --------------------------------- + Name Self CPU % Self CPU Self CPU Mem Source Location +------------- ------------ ------------ ------------ --------------------------------- + MASK INDICES 87.88% 5.212s -953.67 Mb /mnt/xarfuse/.../torch/au + (10): forward + /mnt/xarfuse/.../torch/nn + (9): + /mnt/xarfuse/.../IPython/ + + aten::copy_ 12.07% 715.848ms 0 b (12): forward + /mnt/xarfuse/.../torch/nn + (9): + /mnt/xarfuse/.../IPython/ + /mnt/xarfuse/.../IPython/ + + LINEAR PASS 0.01% 350.151us -20 b /mnt/xarfuse/.../torch/au + (7): forward + /mnt/xarfuse/.../torch/nn + (9): + /mnt/xarfuse/.../IPython/ + + aten::addmm 0.00% 293.342us 0 b /mnt/xarfuse/.../torch/nn + /mnt/xarfuse/.../torch/nn + /mnt/xarfuse/.../torch/nn + (8): forward + /mnt/xarfuse/.../torch/nn + + aten::mean 0.00% 235.095us 0 b (11): forward + /mnt/xarfuse/.../torch/nn + (9): + /mnt/xarfuse/.../IPython/ + /mnt/xarfuse/.../IPython/ + +----------------------------- ------------ ---------- ---------------------------------- +Self CPU time total: 5.931s + +""" + +###################################################################### +# Improve memory performance +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Note that the most expensive operations - in terms of memory and time - +# are at ``forward (10)`` representing the operations within MASK INDICES. Let’s try to +# tackle the memory consumption first. We can see that the ``.to()`` +# operation at line 12 consumes 953.67 Mb. This operation copies ``mask`` to the CPU. +# ``mask`` is initialized with a ``torch.double`` datatype. Can we reduce the memory footprint by casting +# it to ``torch.float`` instead? 
+# + +model = MyModule(500, 10).cuda() +input = torch.rand(128, 500).cuda() +mask = torch.rand((500, 500, 500), dtype=torch.float).cuda() + +# warm-up +model(input, mask) + +with profiler.profile(with_stack=True, profile_memory=True) as prof: + out, idx = model(input, mask) + +print(prof.key_averages(group_by_stack_n=5).table(sort_by='self_cpu_time_total', row_limit=5)) + +""" +(Some columns are omitted) + +----------------- ------------ ------------ ------------ -------------------------------- + Name Self CPU % Self CPU Self CPU Mem Source Location +----------------- ------------ ------------ ------------ -------------------------------- + MASK INDICES 93.61% 5.006s -476.84 Mb /mnt/xarfuse/.../torch/au + (10): forward + /mnt/xarfuse/ /torch/nn + (9): + /mnt/xarfuse/.../IPython/ + + aten::copy_ 6.34% 338.759ms 0 b (12): forward + /mnt/xarfuse/.../torch/nn + (9): + /mnt/xarfuse/.../IPython/ + /mnt/xarfuse/.../IPython/ + + aten::as_strided 0.01% 281.808us 0 b (11): forward + /mnt/xarfuse/.../torch/nn + (9): + /mnt/xarfuse/.../IPython/ + /mnt/xarfuse/.../IPython/ + + aten::addmm 0.01% 275.721us 0 b /mnt/xarfuse/.../torch/nn + /mnt/xarfuse/.../torch/nn + /mnt/xarfuse/.../torch/nn + (8): forward + /mnt/xarfuse/.../torch/nn + + aten::_local 0.01% 268.650us 0 b (11): forward + _scalar_dense /mnt/xarfuse/.../torch/nn + (9): + /mnt/xarfuse/.../IPython/ + /mnt/xarfuse/.../IPython/ + +----------------- ------------ ------------ ------------ -------------------------------- +Self CPU time total: 5.347s + +""" + +###################################################################### +# +# The CPU memory footprint for this operation has halved. +# +# Improve time performance +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# While the time consumed has also reduced a bit, it’s still too high. +# Turns out copying a matrix from CUDA to CPU is pretty expensive! +# The ``aten::copy_`` operator in ``forward (12)`` copies ``mask`` to CPU +# so that it can use the NumPy ``argwhere`` function. ``aten::copy_`` at ``forward(13)`` +# copies the array back to CUDA as a tensor. We could eliminate both of these if we use a +# ``torch`` function ``nonzero()`` here instead. 
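#
# One caveat worth noting: ``np.argwhere`` returns a single ``(N, ndim)`` array of
# coordinates, whereas ``Tensor.nonzero(as_tuple=True)`` returns a tuple of 1-D
# index tensors, one per dimension (the same layout as ``torch.where(condition)``).
# A tiny illustration:
#
# .. code-block:: python
#
#     t = torch.tensor([[0.0, 1.0], [2.0, 0.0]])
#     t.nonzero()               # tensor([[0, 1], [1, 0]]), i.e. argwhere-style coordinates
#     t.nonzero(as_tuple=True)  # (tensor([0, 1]), tensor([1, 0])), one index tensor per dim
#
# Keep this difference in mind if downstream code indexed into the ``argwhere`` result.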
+# + +class MyModule(nn.Module): + def __init__(self, in_features: int, out_features: int, bias: bool = True): + super(MyModule, self).__init__() + self.linear = nn.Linear(in_features, out_features, bias) + + def forward(self, input, mask): + with profiler.record_function("LINEAR PASS"): + out = self.linear(input) + + with profiler.record_function("MASK INDICES"): + threshold = out.sum(axis=1).mean() + hi_idx = (mask > threshold).nonzero(as_tuple=True) + + return out, hi_idx + + +model = MyModule(500, 10).cuda() +input = torch.rand(128, 500).cuda() +mask = torch.rand((500, 500, 500), dtype=torch.float).cuda() + +# warm-up +model(input, mask) + +with profiler.profile(with_stack=True, profile_memory=True) as prof: + out, idx = model(input, mask) + +print(prof.key_averages(group_by_stack_n=5).table(sort_by='self_cpu_time_total', row_limit=5)) + +""" +(Some columns are omitted) + +-------------- ------------ ------------ ------------ --------------------------------- + Name Self CPU % Self CPU Self CPU Mem Source Location +-------------- ------------ ------------ ------------ --------------------------------- + aten::gt 57.17% 129.089ms 0 b (12): forward + /mnt/xarfuse/.../torch/nn + (25): + /mnt/xarfuse/.../IPython/ + /mnt/xarfuse/.../IPython/ + + aten::nonzero 37.38% 84.402ms 0 b (12): forward + /mnt/xarfuse/.../torch/nn + (25): + /mnt/xarfuse/.../IPython/ + /mnt/xarfuse/.../IPython/ + + INDEX SCORE 3.32% 7.491ms -119.21 Mb /mnt/xarfuse/.../torch/au + (10): forward + /mnt/xarfuse/.../torch/nn + (25): + /mnt/xarfuse/.../IPython/ + +aten::as_strided 0.20% 441.587us 0 b (12): forward + /mnt/xarfuse/.../torch/nn + (25): + /mnt/xarfuse/.../IPython/ + /mnt/xarfuse/.../IPython/ + + aten::nonzero + _numpy 0.18% 395.602us 0 b (12): forward + /mnt/xarfuse/.../torch/nn + (25): + /mnt/xarfuse/.../IPython/ + /mnt/xarfuse/.../IPython/ +-------------- ------------ ------------ ------------ --------------------------------- +Self CPU time total: 225.801ms + +""" + + +###################################################################### +# Further Reading +# ~~~~~~~~~~~~~~~~~ +# We have seen how Profiler can be used to investigate time and memory bottlenecks in PyTorch models. +# Read more about Profiler here: +# +# - `Profiler Usage Recipe `__ +# - `Profiling RPC-Based Workloads `__ +# - `Profiler API Docs `__ diff --git a/beginner_source/ptcheat.rst b/beginner_source/ptcheat.rst index 4bcf289f2..dda57fc8c 100644 --- a/beginner_source/ptcheat.rst +++ b/beginner_source/ptcheat.rst @@ -5,7 +5,7 @@ Imports ========= General ---------- +------- .. code-block:: python @@ -13,7 +13,7 @@ General from torch.utils.data import Dataset, Dataloader # dataset representation and loading Neural Network API --------------------- +------------------ .. code-block:: python @@ -30,44 +30,44 @@ See `autograd `__, and `optim `__ Torchscript and JIT ---------------------- +--------------- .. code-block:: python - torch.jit.trace() # takes your module or function and an example - # data input, and traces the computational steps + torch.jit.trace() # takes your module or function and an example + # data input, and traces the computational steps # that the data encounters as it progresses through the model - @script # decorator used to indicate data-dependent + @script # decorator used to indicate data-dependent # control flow within the code being traced See `Torchscript `__ ONNX ------- +---- .. 
code-block:: python - torch.onnx.export(model, dummy data, xxxx.proto) # exports an ONNX formatted + torch.onnx.export(model, dummy data, xxxx.proto) # exports an ONNX formatted # model using a trained model, dummy # data and the desired file name model = onnx.load("alexnet.proto") # load an ONNX model - onnx.checker.check_model(model) # check that the model - # IR is well formed - - onnx.helper.printable_graph(model.graph) # print a human readable + onnx.checker.check_model(model) # check that the model + # IR is well formed + + onnx.helper.printable_graph(model.graph) # print a human readable # representation of the graph See `onnx `__ Vision --------- +------ .. code-block:: python - from torchvision import datasets, models, transforms # vision datasets, - # architectures & + from torchvision import datasets, models, transforms # vision datasets, + # architectures & # transforms import torchvision.transforms as transforms # composable transforms @@ -76,12 +76,12 @@ See `torchvision `__ Distributed Training ----------------------- +-------------------- .. code-block:: python - import torch.distributed as dist # distributed communication - from multiprocessing import Process # memory sharing processes + import torch.distributed as dist # distributed communication + from torch.multiprocessing import Process # memory sharing processes See `distributed `__ and @@ -91,17 +91,17 @@ Tensors ========= Creation ----------- +-------- .. code-block:: python - torch.randn(*size) # tensor with independent N(0,1) entries - torch.[ones|zeros](*size) # tensor with all 1's [or 0's] - torch.Tensor(L) # create tensor from [nested] list or ndarray L - x.clone() # clone of x - with torch.no_grad(): # code wrap that stops autograd from tracking tensor history - requires_grad=True # arg, when set to True, tracks computation - # history for future derivative calculations + x = torch.randn(*size) # tensor with independent N(0,1) entries + x = torch.[ones|zeros](*size) # tensor with all 1's [or 0's] + x = torch.tensor(L) # create tensor from [nested] list or ndarray L + y = x.clone() # clone of x + with torch.no_grad(): # code wrap that stops autograd from tracking tensor history + requires_grad=True # arg, when set to True, tracks computation + # history for future derivative calculations See `tensor `__ @@ -110,14 +110,16 @@ Dimensionality .. code-block:: python - x.size() # return tuple-like object of dimensions - torch.cat(tensor_seq, dim=0) # concatenates tensors along dim - x.view(a,b,...) # reshapes x into size (a,b,...) - x.view(-1,a) # reshapes x into size (b,a) for some b - x.transpose(a,b) # swaps dimensions a and b - x.permute(*dims) # permutes dimensions - x.unsqueeze(dim) # tensor with added axis - x.unsqueeze(dim=2) # (a,b,c) tensor -> (a,b,1,c) tensor + x.size() # return tuple-like object of dimensions + x = torch.cat(tensor_seq, dim=0) # concatenates tensors along dim + y = x.view(a,b,...) # reshapes x into size (a,b,...) + y = x.view(-1,a) # reshapes x into size (b,a) for some b + y = x.transpose(a,b) # swaps dimensions a and b + y = x.permute(*dims) # permutes dimensions + y = x.unsqueeze(dim) # tensor with added axis + y = x.unsqueeze(dim=2) # (a,b,c) tensor -> (a,b,1,c) tensor + y = x.squeeze() # removes all dimensions of size 1 (a,1,b,1) -> (a,b) + y = x.squeeze(dim=1) # removes specified dimension of size 1 (a,1,b,1) -> (a,b,1) See `tensor `__ @@ -127,9 +129,9 @@ Algebra .. 
code-block:: python - A.mm(B) # matrix multiplication - A.mv(x) # matrix-vector multiplication - x.t() # matrix transpose + ret = A.mm(B) # matrix multiplication + ret = A.mv(x) # matrix-vector multiplication + x = x.t() # matrix transpose See `math operations `__ @@ -139,24 +141,24 @@ GPU Usage .. code-block:: python - torch.cuda.is_available # check for cuda - x.cuda() # move x's data from - # CPU to GPU and return new object + torch.cuda.is_available # check for cuda + x = x.cuda() # move x's data from + # CPU to GPU and return new object - x.cpu() # move x's data from GPU to CPU - # and return new object + x = x.cpu() # move x's data from GPU to CPU + # and return new object - if not args.disable_cuda and torch.cuda.is_available(): # device agnostic code - args.device = torch.device('cuda') # and modularity - else: # - args.device = torch.device('cpu') # + if not args.disable_cuda and torch.cuda.is_available(): # device agnostic code + args.device = torch.device('cuda') # and modularity + else: # + args.device = torch.device('cpu') # - net.to(device) # recursively convert their - # parameters and buffers to - # device specific tensors + net.to(device) # recursively convert their + # parameters and buffers to + # device specific tensors - mytensor.to(device) # copy your tensors to a device - # (gpu, cpu) + x = x.to(device) # copy your tensors to a device + # (gpu, cpu) See `cuda `__ @@ -165,21 +167,21 @@ Deep Learning .. code-block:: python - nn.Linear(m,n) # fully connected layer from + nn.Linear(m,n) # fully connected layer from # m to n units - nn.ConvXd(m,n,s) # X dimensional conv layer from - # m to n channels where X⍷{1,2,3} + nn.ConvXd(m,n,s) # X dimensional conv layer from + # m to n channels where X⍷{1,2,3} # and the kernel size is s - nn.MaxPoolXd(s) # X dimension pooling layer + nn.MaxPoolXd(s) # X dimension pooling layer # (notation as above) - nn.BatchNorm # batch norm layer + nn.BatchNormXd # batch norm layer nn.RNN/LSTM/GRU # recurrent layers nn.Dropout(p=0.5, inplace=False) # dropout layer for any dimensional input nn.Dropout2d(p=0.5, inplace=False) # 2-dimensional channel-wise dropout - nn.Embedding(num_embeddings, embedding_dim) # (tensor-wise) mapping from + nn.Embedding(num_embeddings, embedding_dim) # (tensor-wise) mapping from # indices to embedding vectors See `nn `__ @@ -189,12 +191,16 @@ Loss Functions .. code-block:: python - nn.X # where X is BCELoss, CrossEntropyLoss, - # L1Loss, MSELoss, NLLLoss, SoftMarginLoss, - # MultiLabelSoftMarginLoss, CosineEmbeddingLoss, - # KLDivLoss, MarginRankingLoss, HingeEmbeddingLoss - # or CosineEmbeddingLoss - + nn.X # where X is L1Loss, MSELoss, CrossEntropyLoss + # CTCLoss, NLLLoss, PoissonNLLLoss, + # KLDivLoss, BCELoss, BCEWithLogitsLoss, + # MarginRankingLoss, HingeEmbeddingLoss, + # MultiLabelMarginLoss, SmoothL1Loss, + # SoftMarginLoss, MultiLabelSoftMarginLoss, + # CosineEmbeddingLoss, MultiMarginLoss, + # or TripletMarginLoss + + See `loss functions `__ @@ -203,11 +209,11 @@ Activation Functions .. 
code-block:: python - nn.X # where X is ReLU, ReLU6, ELU, SELU, PReLU, LeakyReLU, - # Threshold, HardTanh, Sigmoid, Tanh, - # LogSigmoid, Softplus, SoftShrink, - # Softsign, TanhShrink, Softmin, Softmax, - # Softmax2d or LogSoftmax + nn.X # where X is ReLU, ReLU6, ELU, SELU, PReLU, LeakyReLU, + # RReLu, CELU, GELU, Threshold, Hardshrink, HardTanh, + # Sigmoid, LogSigmoid, Softplus, SoftShrink, + # Softsign, Tanh, TanhShrink, Softmin, Softmax, + # Softmax2d, LogSoftmax or AdaptiveSoftmaxWithLoss See `activation functions `__ @@ -219,9 +225,9 @@ Optimizers opt = optim.x(model.parameters(), ...) # create optimizer opt.step() # update weights - optim.X # where X is SGD, Adadelta, Adagrad, Adam, - # SparseAdam, Adamax, ASGD, - # LBFGS, RMSProp or Rprop + optim.X # where X is SGD, Adadelta, Adagrad, Adam, + # AdamW, SparseAdam, Adamax, ASGD, + # LBFGS, RMSprop or Rprop See `optimizers `__ @@ -232,8 +238,10 @@ Learning rate scheduling scheduler = optim.X(optimizer,...) # create lr scheduler scheduler.step() # update lr at start of epoch - optim.lr_scheduler.X # where X is LambdaLR, StepLR, MultiStepLR, - # ExponentialLR or ReduceLROnPLateau + optim.lr_scheduler.X # where X is LambdaLR, MultiplicativeLR, + # StepLR, MultiStepLR, ExponentialLR, + # CosineAnnealingLR, ReduceLROnPlateau, CyclicLR, + # OneCycleLR, CosineAnnealingWarmRestarts, See `learning rate scheduler `__ @@ -258,14 +266,14 @@ Dataloaders and DataSamplers .. code-block:: python - DataLoader(dataset, batch_size=1, ...) # loads data batches agnostic + DataLoader(dataset, batch_size=1, ...) # loads data batches agnostic # of structure of individual data points - sampler.Sampler(dataset,...) # abstract class dealing with + sampler.Sampler(dataset,...) # abstract class dealing with # ways to sample from dataset - sampler.XSampler where ... # Sequential, Random, Subset, - # WeightedRandom or Distributed + sampler.XSampler where ... # Sequential, Random, SubsetRandom, + # WeightedRandom, Batch, Distributed See `dataloader `__ diff --git a/beginner_source/pytorch_with_examples.rst b/beginner_source/pytorch_with_examples.rst index 00fcc7b9e..2ebc0b04c 100644 --- a/beginner_source/pytorch_with_examples.rst +++ b/beginner_source/pytorch_with_examples.rst @@ -1,210 +1,242 @@ -예제로 배우는 파이토치(PyTorch) -******************************** +Learning PyTorch with Examples +****************************** **Author**: `Justin Johnson `_ - **번역**: `박정환 `_ -이 튜토리얼에서는 `PyTorch `__ 의 핵심적인 -개념을 예제를 통해 소개합니다. +.. Note:: + This is one of our older PyTorch tutorials. You can view our latest + beginner content in + `Learn the Basics `_. + +This tutorial introduces the fundamental concepts of +`PyTorch `__ through self-contained +examples. -본질적으로, PyTorch에는 2가지 주요한 특징이 있습니다: +At its core, PyTorch provides two main features: -- NumPy와 유사하지만 GPU 상에서 실행 가능한 N차원 Tensor -- 신경망을 구성하고 학습하는 과정에서의 자동 미분 +- An n-dimensional Tensor, similar to numpy but can run on GPUs +- Automatic differentiation for building and training neural networks -완전히 연결된 ReLU 신경망을 예제로 사용할 것입니다. 이 신경망은 하나의 은닉층 -(hidden layer)을 갖고 있으며, 신경망의 출력과 정답 사이의 유클리드 거리 -(Euclidean distance)를 최소화하는 식으로 경사하강법(gradient descent)을 사용하여 -무작위의 데이터를 맞추도록 학습할 것입니다. +We will use a problem of fitting :math:`y=\sin(x)` with a third order polynomial +as our running example. The network will have four parameters, and will be trained with +gradient descent to fit random data by minimizing the Euclidean distance +between the network output and the true output. .. Note:: - 각각의 예제들은 :ref:`이 페이지의 마지막 부분 ` 에서 - 살펴볼 수 있습니다. 
+ You can browse the individual examples at the + :ref:`end of this page `. .. contents:: Table of Contents - :local: + :local: Tensors ======= -준비 운동: NumPy ------------------ +Warm-up: numpy +-------------- -PyTorch를 소개하기 전에, 먼저 NumPy를 사용하여 신경망을 구성해보겠습니다. +Before introducing PyTorch, we will first implement the network using +numpy. -NumPy는 N차원 배열 객체와 함께 이러한 배열들을 조작하기 위한 다양한 함수들을 -제공합니다. NumPy는 과학적 분야의 연산을 위한 포괄적인 프레임워크 (Framework)입니다; -NumPy는 연산 그래프(computation graph)나 딥러닝, 변화도(gradient)에 대해서는 알지 -못합니다. 하지만 NumPy 연산을 사용하여 순전파 단계와 역전파 단계를 직접 구현함으로써, -2계층(two-layer)을 갖는 신경망이 무작위의 데이터를 맞추도록 할 수 있습니다: +Numpy provides an n-dimensional array object, and many functions for +manipulating these arrays. Numpy is a generic framework for scientific +computing; it does not know anything about computation graphs, or deep +learning, or gradients. However we can easily use numpy to fit a +third order polynomial to sine function by manually implementing the forward +and backward passes through the network using numpy operations: -.. includenodoc:: /beginner/examples_tensor/two_layer_net_numpy.py +.. includenodoc:: /beginner/examples_tensor/polynomial_numpy.py PyTorch: Tensors ---------------- -NumPy는 훌륭한 프레임워크지만, GPU를 사용하여 수치 연산을 가속화할 수는 없습니다. -현대의 심층 신경망에서 GPU는 종종 `50배 또는 그 이상 `__ -의 속도 향상을 제공하기 때문에, 안타깝게도 NumPy는 현대의 딥러닝에는 충분치 않습니다. +Numpy is a great framework, but it cannot utilize GPUs to accelerate its +numerical computations. For modern deep neural networks, GPUs often +provide speedups of `50x or +greater `__, so +unfortunately numpy won't be enough for modern deep learning. -이번에는 PyTorch의 기본적인 개념인 **Tensor** 에 대해서 알아보겠습니다. -PyTorch Tensor는 개념적으로 NumPy 배열과 동일합니다: Tensor는 N차원 배열이며, -PyTorch는 Tensor 연산을 위한 다양한 함수들을 제공합니다. NumPy 배열처럼 PyTorch -Tensor는 딥러닝이나 연산 그래프, 변화도는 알지 못하며, 과학적 분야의 연산을 위한 -포괄적인 도구입니다. +Here we introduce the most fundamental PyTorch concept: the **Tensor**. +A PyTorch Tensor is conceptually identical to a numpy array: a Tensor is +an n-dimensional array, and PyTorch provides many functions for +operating on these Tensors. Behind the scenes, Tensors can keep track of +a computational graph and gradients, but they're also useful as a +generic tool for scientific computing. -그러나 NumPy와는 달리, PyTorch Tensor는 GPU를 활용하여 수치 연산을 가속화할 수 -있습니다. GPU에서 PyTorch Tensor를 실행하기 위해서는 단지 새로운 자료형으로 -변환(Cast)해주기만 하면 됩니다. +Also unlike numpy, PyTorch Tensors can utilize GPUs to accelerate +their numeric computations. To run a PyTorch Tensor on GPU, you simply +need to specify the correct device. -여기에서는 PyTorch Tensor를 사용하여 2계층의 신경망이 무작위 데이터를 맞추도록 -할 것입니다. 위의 NumPy 예제에서와 같이 신경망의 순전파 단계와 역전파 단계는 직접 -구현하겠습니다. +Here we use PyTorch Tensors to fit a third order polynomial to sine function. +Like the numpy example above we need to manually implement the forward +and backward passes through the network: -.. includenodoc:: /beginner/examples_tensor/two_layer_net_tensor.py +.. includenodoc:: /beginner/examples_tensor/polynomial_tensor.py Autograd ======== -PyTorch: Tensor와 autograd +PyTorch: Tensors and autograd ------------------------------- -위의 예제들에서 우리는 신경망의 순전파 단계와 역전파 단계를 직접 구현하였습니다. -작은 2계층 신경망에서 역전파 단계를 직접 구현하는 것은 큰 일이 아니지만, -대규모의 복잡한 신경망에서는 매우 아슬아슬한 일일 것입니다. - -다행히도, `자동 미분 `__ -을 사용하여 신경망에서 역전파 단계의 연산을 자동화할 수 있습니다. PyTorch의 -**autograd** 패키지는 정확히 이런 기능을 제공합니다. -Autograd를 사용할 때, 신경망의 순전파 단계는 **연산 그래프** 를 정의하게 됩니다; -이 그래프의 노드(node)는 Tensor, 엣지(edge)는 입력 Tensor로부터 출력 Tensor를 -만들어내는 함수가 됩니다. 이 그래프를 통해 역전파를 하게 되면 변화도를 쉽게 계산할 -수 있습니다. - -이는 복잡하게 들리지만, 실제로 사용하는 것은 매우 간단합니다. 각 Tensor는 연산 -그래프에서 노드로 표현됩니다. 
만약 ``x`` 가 ``x.requires_grad=True`` 인 Tensor면 -``x.grad`` 는 어떤 스칼라 값에 대한 ``x`` 의 변화도를 갖는 또 다른 Tensor입니다. - -여기에서는 PyTorch Tensor와 autograd를 사용하여 2계층 신경망을 구현합니다; -이제 더 이상 신경망의 역전파 단계를 직접 구현할 필요가 없습니다: - -.. includenodoc:: /beginner/examples_autograd/two_layer_net_autograd.py - -PyTorch: 새 autograd 함수 정의하기 --------------------------------------- - -내부적으로, autograd의 기본(primitive) 연산자는 실제로 Tensor를 조작하는 2개의 -함수입니다. **forward** 함수는 입력 Tensor로부터 출력 Tensor를 계산합니다. -**backward** 함수는 어떤 스칼라 값에 대한 출력 Tensor의 변화도를 전달받고, -동일한 스칼라 값에 대한 입력 Tensor의 변화도를 계산합니다. - -PyTorch에서 ``torch.autograd.Function`` 의 서브클래스(subclass)를 정의하고 -``forward`` 와 ``backward`` 함수를 구현함으로써 사용자 정의 autograd 연산자를 -손쉽게 정의할 수 있습니다. 그 후, 인스턴스(instance)를 생성하고 이를 함수처럼 -호출하여 입력 데이터를 갖는 Tensor를 전달하는 식으로 새로운 autograd 연산자를 -사용할 수 있습니다. - -이 예제에서는 ReLU로 비선형적(nonlinearity)으로 동작하는 사용자 정의 autograd -함수를 정의하고, 2-계층 신경망에 이를 적용해보도록 하겠습니다: - -.. includenodoc:: /beginner/examples_autograd/two_layer_net_custom_function.py - -`nn` 모듈 +In the above examples, we had to manually implement both the forward and +backward passes of our neural network. Manually implementing the +backward pass is not a big deal for a small two-layer network, but can +quickly get very hairy for large complex networks. + +Thankfully, we can use `automatic +differentiation `__ +to automate the computation of backward passes in neural networks. The +**autograd** package in PyTorch provides exactly this functionality. +When using autograd, the forward pass of your network will define a +**computational graph**; nodes in the graph will be Tensors, and edges +will be functions that produce output Tensors from input Tensors. +Backpropagating through this graph then allows you to easily compute +gradients. + +This sounds complicated, it's pretty simple to use in practice. Each Tensor +represents a node in a computational graph. If ``x`` is a Tensor that has +``x.requires_grad=True`` then ``x.grad`` is another Tensor holding the +gradient of ``x`` with respect to some scalar value. + +Here we use PyTorch Tensors and autograd to implement our fitting sine wave +with third order polynomial example; now we no longer need to manually +implement the backward pass through the network: + +.. includenodoc:: /beginner/examples_autograd/polynomial_autograd.py + +PyTorch: Defining new autograd functions +---------------------------------------- + +Under the hood, each primitive autograd operator is really two functions +that operate on Tensors. The **forward** function computes output +Tensors from input Tensors. The **backward** function receives the +gradient of the output Tensors with respect to some scalar value, and +computes the gradient of the input Tensors with respect to that same +scalar value. + +In PyTorch we can easily define our own autograd operator by defining a +subclass of ``torch.autograd.Function`` and implementing the ``forward`` +and ``backward`` functions. We can then use our new autograd operator by +constructing an instance and calling it like a function, passing +Tensors containing input data. + +In this example we define our model as :math:`y=a+b P_3(c+dx)` instead of +:math:`y=a+bx+cx^2+dx^3`, where :math:`P_3(x)=\frac{1}{2}\left(5x^3-3x\right)` +is the `Legendre polynomial`_ of degree three. We write our own custom autograd +function for computing forward and backward of :math:`P_3`, and use it to implement +our model: + +.. _Legendre polynomial: + https://en.wikipedia.org/wiki/Legendre_polynomials + +.. 
includenodoc:: /beginner/examples_autograd/polynomial_custom_function.py + +`nn` module =========== PyTorch: nn ----------- -연산 그래프와 autograd는 복잡한 연산자를 정의하고 도함수(derivative)를 자동으로 -계산하는 매우 강력한 패러다임입니다; 하지만 규모가 큰 신경망에서는 -autograd 그 자체만으로는 너무 낮은 수준(low-level)일 수 있습니다. +Computational graphs and autograd are a very powerful paradigm for +defining complex operators and automatically taking derivatives; however +for large neural networks raw autograd can be a bit too low-level. -신경망을 구성할 때 종종 연산을 여러 **계층** 에 배열(arrange)하는 것으로 -생각하는데, 이 중 일부는 학습 도중 최적화가 될 **학습 가능한 매개변수** 를 갖고 -있습니다. +When building neural networks we frequently think of arranging the +computation into **layers**, some of which have **learnable parameters** +which will be optimized during learning. -Tensorflow는 `Keras `__, +In TensorFlow, packages like +`Keras `__, `TensorFlow-Slim `__, -나 `TFLearn `__ 같은 패키지들이 연산 그래프를 더 높은 수준으로 -추상화(higher-level abstraction)하여 제공하므로 신경망을 구축하는데 있어 유용합니다. +and `TFLearn `__ provide higher-level abstractions +over raw computational graphs that are useful for building neural +networks. -PyTorch에서는 ``nn`` 패키지가 동일한 목적으로 제공됩니다. ``nn`` 패키지는 -신경망 계층(layer)들과 거의 동일한 **Module** 의 집합을 정의합니다. -Module은 입력 Tensor를 받고 출력 Tensor를 계산하는 한편, 학습 가능한 매개변수를 -갖는 Tensor 같은 내부 상태(internal state)를 갖습니다. ``nn`` 패키지는 또한 -신경망을 학습시킬 때 주로 사용하는 유용한 손실 함수들도 정의하고 있습니다. +In PyTorch, the ``nn`` package serves this same purpose. The ``nn`` +package defines a set of **Modules**, which are roughly equivalent to +neural network layers. A Module receives input Tensors and computes +output Tensors, but may also hold internal state such as Tensors +containing learnable parameters. The ``nn`` package also defines a set +of useful loss functions that are commonly used when training neural +networks. -이번 예제에서는 ``nn`` 패키지를 사용하여 2계층 신경망을 구성해보겠습니다: +In this example we use the ``nn`` package to implement our polynomial model +network: -.. includenodoc:: /beginner/examples_nn/two_layer_net_nn.py +.. includenodoc:: /beginner/examples_nn/polynomial_nn.py PyTorch: optim -------------- -지금까지는 (autograd의 추적 기록을 피하기 위해 ``torch.no_grad ()`` 또는 ``.data`` -를 사용하는 식으로) 학습 가능한 매개변수를 갖는 Tensor를 직접 조작하며 모델의 -가중치를 갱신하였습니다. 이것은 확률적 경사 하강법(SGD)과 같은 간단한 최적화 -알고리즘에서는 크게 부담이 되지는 않지만, 실제로 신경망을 학습할 때는 주로 AdaGrad, -RMSProp, Adam 등과 같은 좀 더 정교한 Optimizer를 사용하곤 합니다. +Up to this point we have updated the weights of our models by manually +mutating the Tensors holding learnable parameters with ``torch.no_grad()``. +This is not a huge burden for simple optimization algorithms like stochastic +gradient descent, but in practice we often train neural networks using more +sophisticated optimizers like AdaGrad, RMSProp, Adam, etc. -PyTorch의 ``optim`` 패키지는 최적화 알고리즘에 대한 아이디어를 추상화하고 일반적으로 -사용하는 최적화 알고리즘의 구현체(implementation)를 제공합니다. +The ``optim`` package in PyTorch abstracts the idea of an optimization +algorithm and provides implementations of commonly used optimization +algorithms. -이 에제에서는 지금까지와 같이 ``nn`` 패키지를 사용하여 모델을 정의하지만, ``optim`` -패키지가 제공하는 Adam 알고리즘을 이용하여 모델을 최적화하겠습니다: +In this example we will use the ``nn`` package to define our model as +before, but we will optimize the model using the RMSprop algorithm provided +by the ``optim`` package: -.. includenodoc:: /beginner/examples_nn/two_layer_net_optim.py +.. 
includenodoc:: /beginner/examples_nn/polynomial_optim.py -PyTorch: 사용자 정의 nn.Module -------------------------------- +PyTorch: Custom nn Modules +-------------------------- -때때로 기존 모듈의 구성(sequence)보다 더 복잡한 모델을 구성해야 할 때가 있습니다; -이럴 때는 ``nn.Module`` 의 서브클래스로 새 모듈을 정의하고, 입력 Tensor를 받아 -다른 모듈 또는 Tensor의 autograd 연산을 사용하여 출력 Tensor를 만드는 -``forward`` 를 정의합니다. +Sometimes you will want to specify models that are more complex than a +sequence of existing Modules; for these cases you can define your own +Modules by subclassing ``nn.Module`` and defining a ``forward`` which +receives input Tensors and produces output Tensors using other +modules or other autograd operations on Tensors. -이 예제에서는 2계층 신경망을 직접 정의한 nn.Module 서브클래스로 구현해보겠습니다: +In this example we implement our third order polynomial as a custom Module +subclass: -.. includenodoc:: /beginner/examples_nn/two_layer_net_module.py +.. includenodoc:: /beginner/examples_nn/polynomial_module.py -PyTorch: 제어 흐름(Control Flow) + 가중치 공유(Weight Sharing) ---------------------------------------------------------------- +PyTorch: Control Flow + Weight Sharing +-------------------------------------- -동적 그래프와 가중치 공유의 예로, 매우 이상한 모델을 구현해보겠습니다: -각 순전파 단계에서 많은 은닉 계층을 갖는 완전히 연결(fully-connected)된 ReLU -신경망이 무작위로 0 ~ 3 사이의 숫자를 선택하고, 가장 안쪽(innermost)의 은닉층들을 -계산하기 위해 동일한 가중치를 여러 번 재사용합니다. +As an example of dynamic graphs and weight sharing, we implement a very +strange model: a third-fifth order polynomial that on each forward pass +chooses a random number between 3 and 5 and uses that many orders, reusing +the same weights multiple times to compute the fourth and fifth order. -이 모델에서는 일반적인 Python 제어 흐름을 사용하여 반복(loop)을 구현할 수 있으며, -순전파 단계를 정의할 때 단지 동일한 Module을 여러번 재사용함으로써 내부(innermost) -계층들 간의 가중치 공유를 구현할 수 있습니다. +For this model we can use normal Python flow control to implement the loop, +and we can implement weight sharing by simply reusing the same parameter multiple +times when defining the forward pass. -이러한 모델을 Module을 상속받는 서브클래스로 간단히 구현해보겠습니다: +We can easily implement this model as a Module subclass: .. includenodoc:: /beginner/examples_nn/dynamic_net.py .. _examples-download: -예제 코드 -========= +Examples +======== -이상의 예제들을 여기에서 찾아볼 수 있습니다. +You can browse the above examples here. -Tensor +Tensors ------- .. toctree:: :maxdepth: 2 :hidden: - /beginner/examples_tensor/two_layer_net_numpy - /beginner/examples_tensor/two_layer_net_tensor + /beginner/examples_tensor/polynomial_numpy + /beginner/examples_tensor/polynomial_tensor -.. galleryitem:: /beginner/examples_tensor/two_layer_net_numpy.py +.. galleryitem:: /beginner/examples_tensor/polynomial_numpy.py -.. galleryitem:: /beginner/examples_tensor/two_layer_net_tensor.py +.. galleryitem:: /beginner/examples_tensor/polynomial_tensor.py .. raw:: html @@ -217,39 +249,36 @@ Autograd :maxdepth: 2 :hidden: - /beginner/examples_autograd/two_layer_net_autograd - /beginner/examples_autograd/two_layer_net_custom_function - /beginner/examples_autograd/tf_two_layer_net - + /beginner/examples_autograd/polynomial_autograd + /beginner/examples_autograd/polynomial_custom_function -.. galleryitem:: /beginner/examples_autograd/two_layer_net_autograd.py -.. galleryitem:: /beginner/examples_autograd/two_layer_net_custom_function.py +.. galleryitem:: /beginner/examples_autograd/polynomial_autograd.py -.. galleryitem:: /beginner/examples_autograd/tf_two_layer_net.py +.. galleryitem:: /beginner/examples_autograd/polynomial_custom_function.py .. raw:: html
-`nn` 모듈 +`nn` module ----------- .. toctree:: :maxdepth: 2 :hidden: - /beginner/examples_nn/two_layer_net_nn - /beginner/examples_nn/two_layer_net_optim - /beginner/examples_nn/two_layer_net_module + /beginner/examples_nn/polynomial_nn + /beginner/examples_nn/polynomial_optim + /beginner/examples_nn/polynomial_module /beginner/examples_nn/dynamic_net -.. galleryitem:: /beginner/examples_nn/two_layer_net_nn.py +.. galleryitem:: /beginner/examples_nn/polynomial_nn.py -.. galleryitem:: /beginner/examples_nn/two_layer_net_optim.py +.. galleryitem:: /beginner/examples_nn/polynomial_optim.py -.. galleryitem:: /beginner/examples_nn/two_layer_net_module.py +.. galleryitem:: /beginner/examples_nn/polynomial_module.py .. galleryitem:: /beginner/examples_nn/dynamic_net.py diff --git a/beginner_source/saving_loading_models.py b/beginner_source/saving_loading_models.py index d7c350071..4fe456d4e 100644 --- a/beginner_source/saving_loading_models.py +++ b/beginner_source/saving_loading_models.py @@ -169,6 +169,15 @@ # 함수에 전달하기 전에 반드시 역직렬화를 해야 합니다. 예를 들어, # ``model.load_state_dict(PATH)`` 과 같은 식으로는 사용하면 안됩니다. # +# .. Note :: +# +# 만약 (검증 손실(validation loss) 결과에 따라) 가장 성능이 좋은 모델만 유지할 +# 계획이라면, ``best_model_state = model.state_dict()`` 은 모델의 복사본이 아닌 +# 모델의 현재 상태에 대한 참조(reference)만 반환한다는 사실을 잊으시면 안됩니다! +# 따라서 ``best_model_state``` 을 직렬화(serialize)하거나, +# ``best_model_state = deepcopy(model.state_dict())`` 을 사용해야 합니다. +# 그렇지 않으면, 제일 좋은 성능을 내는 ``best_model_state`` 은 계속되는 학습 단계에서 +# 갱신될 것입니다. 결과적으로, 최종 모델의 상태는 과적합(overfit)된 상태가 됩니다. # # 전체 모델 저장하기/불러오기 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -245,7 +254,8 @@ # *state_dict* 보다 많은 것들을 저장해야 합니다. 모델이 학습을 하며 갱신되는 # 버퍼와 매개변수가 포함된 옵티마이저의 *state_dict* 도 함께 저장하는 것이 # 중요합니다. 그 외에도 마지막 에폭(epoch), 최근에 기록된 학습 손실, 외부 -# ``torch.nn.Embedding`` 계층 등도 함께 저장합니다. +# ``torch.nn.Embedding`` 계층 등도 함께 저장합니다. 결과적으로, 이런 체크포인트는 +# 종종 모델만 저장하는 것보다 2~3배 정도 커지게 됩니다. # # 여러가지를 함께 저장하려면, 사전(dictionary) 자료형으로 만든 후 # ``torch.save()`` 를 사용하여 직렬화합니다. PyTorch가 이러한 체크포인트를 저장할 diff --git a/beginner_source/text_sentiment_ngrams_tutorial.py b/beginner_source/text_sentiment_ngrams_tutorial.py index ba4066090..ddc9af80b 100644 --- a/beginner_source/text_sentiment_ngrams_tutorial.py +++ b/beginner_source/text_sentiment_ngrams_tutorial.py @@ -1,68 +1,151 @@ """ -TorchText로 텍스트 분류하기 -================================== +torchtext 라이브러리로 텍스트 분류하기 +======================================== **번역**: `김강민 `_ , `김진현 `_ -이 튜토리얼에서는 ``torchtext`` 에 포함되어 있는 텍스트 분류 -데이터셋의 사용 방법을 살펴 봅니다. 데이터셋은 다음을 포함합니다. +이 튜토리얼에서는 torchtext 라이브러리를 사용하여 어떻게 텍스트 분류 분석을 위한 데이터셋을 만드는지를 살펴보겠습니다. +다음과 같은 내용들을 알게 됩니다: -:: + - 반복자(iterator)로 가공되지 않은 데이터(raw data)에 접근하기 + - 가공되지 않은 텍스트 문장들을 모델 학습에 사용할 수 있는 ``torch.Tensor`` 로 변환하는 데이터 처리 파이프라인 만들기 + - `torch.utils.data.DataLoader `__ 를 사용하여 데이터를 섞고 반복하기(shuffle and iterate) +""" - - AG_NEWS, - - SogouNews, - - DBpedia, - - YelpReviewPolarity, - - YelpReviewFull, - - YahooAnswers, - - AmazonReviewPolarity, - - AmazonReviewFull +###################################################################### +# 기초 데이터셋 반복자(raw data iterator)에 접근하기 +# ------------------------------------------------------------- +# +# torchtext 라이브러리는 가공되지 않은 텍스트 문장들을 만드는(yield) 몇 가지 기초 데이터셋 반복자(raw dataset iterator)를 제공합니다. +# 예를 들어, ``AG_NEWS`` 데이터셋 반복자는 레이블(label)과 문장의 튜플(tuple) 형태로 가공되지 않은 데이터를 만듭니다. -이 예제에서는 ``TextClassification`` 의 데이터셋들 중 하나를 이용해 분류를 위한 - 지도 학습 알고리즘을 훈련하는 방법을 보여줍니다. 
+import torch +from torchtext.datasets import AG_NEWS +train_iter = AG_NEWS(split='train') -ngrams를 이용하여 데이터 불러오기 ------------------------------------ +###################################################################### +# :: +# +# next(train_iter) +# >>> (3, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - +# Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green +# again.") +# +# next(train_iter) +# >>> (3, 'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private +# investment firm Carlyle Group,\\which has a reputation for making well-timed +# and occasionally\\controversial plays in the defense industry, has quietly +# placed\\its bets on another part of the market.') +# +# next(train_iter) +# >>> (3, "Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring +# crude prices plus worries\\about the economy and the outlook for earnings are +# expected to\\hang over the stock market next week during the depth of +# the\\summer doldrums.") +# -Bag of ngrams 피쳐는 지역(local) 단어 순서에 대한 부분적인 정보를 포착하기 위해 적용합니다. -실제 상황에서는 bi-gram이나 tri-gram은 단 하나의 단어를 이용하는 것보다 더 많은 이익을 주기 때문에 적용됩니다. -예를 들면 다음과 같습니다. +###################################################################### +# 데이터 처리 파이프라인 준비하기 +# --------------------------------- +# +# 어휘집(vocab), 단어 벡터(word vector), 토크나이저(tokenizer)를 포함하여 torchtext 라이브러리의 가장 기본적인 구성요소를 재검토했습니다. +# 이들은 가공되지 않은 텍스트 문자열에 대한 기본적인 데이터 처리 빌딩 블록(data processing building block)입니다. +# +# 다음은 토크나이저 및 어휘집을 사용한 일반적인 NLP 데이터 처리의 예입니다. +# 첫번째 단계는 가공되지 않은 학습 데이터셋으로 어휘집을 만드는 것입니다. +# 사용자는 Vocab 클래스의 생성자에 인자를 설정하여 사용자 정의된 어휘집(customized vocab)을 만들 수 있습니다. +# 토큰(token)들의 최소 빈도 ``min_freq`` 에 대한 예시는 아래와 같습니다. -:: +""" - "load data with ngrams" - Bi-grams 결과: "load data", "data with", "with ngrams" - Tri-grams 결과: "load data with", "data with ngrams" +from torchtext.data.utils import get_tokenizer +from collections import Counter +from torchtext.vocab import Vocab -``TextClassification`` 데이터셋은 ngrams method을 지원합니다. ngrams을 2로 설정하면, -데이터셋 안의 예제 텍스트는 각각의(single) 단어들에 bi-grams 문자열이 더해진 리스트가 될 것입니다. +tokenizer = get_tokenizer('basic_english') +train_iter = AG_NEWS(split='train') +counter = Counter() +for (label, line) in train_iter: + counter.update(tokenizer(line)) +vocab = Vocab(counter, min_freq=1) -""" -import torch -import torchtext -from torchtext.datasets import text_classification -NGRAMS = 2 -import os -if not os.path.isdir('./.data'): - os.mkdir('./.data') -train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS']( - root='./.data', ngrams=NGRAMS, vocab=None) -BATCH_SIZE = 16 +###################################################################### +# 어휘집 블록(vocabulary block)은 토큰 목록을 정수로 변환합니다. +# +# :: +# +# [vocab[token] for token in ['here', 'is', 'an', 'example']] +# >>> [476, 22, 31, 5298] +# +# 토크나이저와 어휘집을 갖춘 텍스트 처리 파이프라인을 준비합니다. +# 텍스트 파이프라인과 레이블(label) 파이프라인은 데이터셋 반복자로부터 얻어온 가공되지 않은 문장 데이터를 처리하기 위해 사용됩니다. + +text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)] +label_pipeline = lambda x: int(x) - 1 + + +###################################################################### +# 텍스트 파이프라인은 어휘집에 정의된 룩업 테이블(순람표; lookup table)에 기반하여 텍스트 문장을 정수 목록으로 변환합니다. +# 레이블(label) 파이프라인은 레이블을 정수로 변환합니다. 
예를 들어, +# +# :: +# +# text_pipeline('here is the an example') +# >>> [475, 21, 2, 30, 5286] +# label_pipeline('10') +# >>> 9 +# + + + +###################################################################### +# 데이터 배치(batch)와 반복자 생성하기 +# ---------------------------------------- +# +# `torch.utils.data.DataLoader `__ 를 +# 권장합니다. (튜토리얼은 `여기 `__ 있습니다.) +# 이는 ``getitem()`` 과 ``len()`` 프로토콜을 구현한 맵 형태(map-style)의 데이터셋으로 동작하며, 맵(map)처럼 인덱스/키로 데이터 샘플을 얻어옵니다. +# 또한, 셔플(shuffle) 인자를 ``False`` 로 설정하면 반복 가능한(iteratable) 데이터셋처럼 동작합니다. +# +# 모델로 보내기 전, ``collate_fn`` 함수는 ``DataLoader`` 로부터 생성된 샘플 배치로 동작합니다. +# ``collate_fn`` 의 입력은 ``DataLoader`` 에 배치 크기(batch size)가 있는 배치(batch) 데이터이며, +# ``collate_fn`` 은 이를 미리 선언된 데이터 처리 파이프라인에 따라 처리합니다. +# ``collate_fn`` 이 최상위 수준으로 정의(top level def)되었는지 확인합니다. 이렇게 하면 모든 워커에서 이 함수를 사용할 수 있습니다. +# +# 아래 예제에서, 주어진(original) 데이터 배치의 텍스트 항목들은 리스트(list)에 담긴(pack) 뒤 ``nn.EmbeddingBag`` 의 입력을 위한 하나의 텐서(tensor)로 합쳐(concatenate)집니다. +# 오프셋(offset)은 텍스트 텐서(text tensor)에서 개별 시퀀스 시작 인덱스를 표현하기 위한 구분자(delimiter) 텐서입니다. +# 레이블(label)은 개별 텍스트 항목의 레이블을 저장하는 텐서입니다. + + +from torch.utils.data import DataLoader device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +def collate_batch(batch): + label_list, text_list, offsets = [], [], [0] + for (_label, _text) in batch: + label_list.append(label_pipeline(_label)) + processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64) + text_list.append(processed_text) + offsets.append(processed_text.size(0)) + label_list = torch.tensor(label_list, dtype=torch.int64) + offsets = torch.tensor(offsets[:-1]).cumsum(dim=0) + text_list = torch.cat(text_list) + return label_list.to(device), text_list.to(device), offsets.to(device) + +train_iter = AG_NEWS(split='train') +dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch) ###################################################################### # 모델 정의하기 -# ------------- +# --------------- # -# 우리의 모델은 -# `EmbeddingBag `__ -# 레이어와 선형 레이어로 구성됩니다 (아래 그림 참고). -# ``nn.EmbeddingBag``는 임베딩들로 구성된 '가방'의 평균을 계산합니다. -# 이때 텍스트(text)의 각 원소는 그 길이가 다를 수 있습니다. 텍스트의 -# 길이는 오프셋(offset)에 저장되어 있으므로 여기서 ``nn.EmbeddingBag`` -# 에 패딩을 사용할 필요는 없습니다. +# 모델은 +# `nn.EmbeddingBag `__ +# 레이어와 분류(classification) 목적을 위한 선형 레이어로 구성됩니다. +# 기본 모드가 "평균(mean)"인 ``nn.EmbeddingBag`` 은 임베딩들의 "가방(bag)"의 평균 값을 계산합니다. +# 이때 텍스트(text) 항목들은 각기 그 길이가 다를 수 있지만, ``nn.EmbeddingBag`` 모듈은 텍스트의 길이를 +# 오프셋(offset)으로 저장하고 있으므로 패딩(padding)이 필요하지는 않습니다. # # 덧붙여서, ``nn.EmbeddingBag`` 은 임베딩의 평균을 즉시 계산하기 때문에, # 텐서들의 시퀀스를 처리할 때 성능 및 메모리 효율성 측면에서의 장점도 @@ -72,11 +155,12 @@ # -import torch.nn as nn -import torch.nn.functional as F -class TextSentiment(nn.Module): +from torch import nn + +class TextClassificationModel(nn.Module): + def __init__(self, vocab_size, embed_dim, num_class): - super().__init__() + super(TextClassificationModel, self).__init__() self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True) self.fc = nn.Linear(embed_dim, num_class) self.init_weights() @@ -96,7 +180,7 @@ def forward(self, text, offsets): # 인스턴스 생성하기 # ----------------- # -# AG_NEWS 데이터셋에는 4 종류의 레이블이 달려 있으며, 따라서 클래스의 개수도 4개 입니다. +# ``AG_NEWS`` 데이터셋에는 4종류의 레이블이 존재하므로 클래스의 개수도 4개입니다. # # :: # @@ -105,51 +189,16 @@ def forward(self, text, offsets): # 3 : Business (경제) # 4 : Sci/Tec (과학/기술) # -# 어휘집의 크기(Vocab size)는 어휘집(vocab)의 길이와 같습니다 (여기에는 -# 각각의 단어와 ngrame이 모두 포함됩니다). 클래스의 개수는 레이블의 종류 -# 수와 같으며, AG_NEWS의 경우에는 4개 입니다. +# 임베딩 차원이 64인 모델을 만듭니다. +# 어휘집의 크기(Vocab size)는 어휘집(vocab)의 길이와 같습니다. +# 클래스의 개수는 레이블의 개수와 같습니다. 
# -VOCAB_SIZE = len(train_dataset.get_vocab()) -EMBED_DIM = 32 -NUN_CLASS = len(train_dataset.get_labels()) -model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device) - - -###################################################################### -# 배치 생성을 위한 함수들 -# ----------------------- -# - - -###################################################################### -# 텍스트 원소의 길이가 다를 수 있으므로, 데이터 배치와 오프셋을 생성하기 -# 위한 사용자 함수 generate_batch()를 사용하려 합니다. 이 함수는 -# ``torch.utils.data.DataLoader`` 의 ``collate_fn`` 인자로 넘겨줍니다. -# -# ``collate_fn`` 의 입력은 그 크기가 batch_size인 텐서들의 리스트이며, -# ``collate_fn`` 은 이들을 미니배치로 묶는 역할을 합니다. 여러분이 -# 주의해야 할 점은, ``collate_fn`` 를 선언할 때 최상위 레벨에서 정의해야 -# 한다는 점입니다. 그래야 이 함수를 각각의 워커에서 사용할 수 있음이 -# 보장됩니다. -# -# 원본 데이터 배치 입력의 텍스트 원소들은 리스트 형태이며, 이들을 하나의 -# 텐서가 되도록 이어 붙인 것이 ``nn.EmbeddingBag`` 의 입력이 됩니다. -# 오프셋은 텍스트의 경계를 나타내는 텐서이며, 각 원소가 텍스트 텐서의 -# 어느 인덱스에서 시작하는지를 나타냅니다. 레이블은 각 텍스트 원소의 -# 레이블을 담고 있는 텐서입니다. -# - -def generate_batch(batch): - label = torch.tensor([entry[0] for entry in batch]) - text = [entry[1] for entry in batch] - offsets = [0] + [len(entry) for entry in text] - # torch.Tensor.cumsum은 dim 차원의 요소들의 누적 합계를 반환합니다. - # torch.Tensor([1.0, 2.0, 3.0]).cumsum(dim=0) - - offsets = torch.tensor(offsets[:-1]).cumsum(dim=0) - text = torch.cat(text) - return text, offsets, label +train_iter = AG_NEWS(split='train') +num_class = len(set([label for (label, text) in train_iter])) +vocab_size = len(vocab) +emsize = 64 +model = TextClassificationModel(vocab_size, emsize, num_class).to(device) ###################################################################### @@ -158,60 +207,48 @@ def generate_batch(batch): # -###################################################################### -# PyTorch 사용자라면 -# `torch.utils.data.DataLoader `__ -# 를 활용하는 것을 추천합니다. 또한 이를 사용하면 데이터를 쉽게 병렬적으로 -# 읽어올 수 있습니다 (이에 대한 튜토리얼은 `이 문서 `__ -# 를 참고하시기 바랍니다). 우리는 여기서 ``DataLoader`` 를 이용하여 -# AG_NEWS 데이터셋을 읽어오고, 이를 모델로 넘겨 학습과 검증을 진행합니다. 
-# - -from torch.utils.data import DataLoader - -def train_func(sub_train_): +import time - # Train the model - # 모델을 학습합니다 - train_loss = 0 - train_acc = 0 - data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True, - collate_fn=generate_batch) - for i, (text, offsets, cls) in enumerate(data): +def train(dataloader): + model.train() + total_acc, total_count = 0, 0 + log_interval = 500 + start_time = time.time() + for idx, (label, text, offsets) in enumerate(dataloader): optimizer.zero_grad() - text, offsets, cls = text.to(device), offsets.to(device), cls.to(device) - output = model(text, offsets) - loss = criterion(output, cls) - train_loss += loss.item() + predited_label = model(text, offsets) + loss = criterion(predited_label, label) loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1) optimizer.step() - train_acc += (output.argmax(1) == cls).sum().item() + total_acc += (predited_label.argmax(1) == label).sum().item() + total_count += label.size(0) + if idx % log_interval == 0 and idx > 0: + elapsed = time.time() - start_time + print('| epoch {:3d} | {:5d}/{:5d} batches ' + '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader), + total_acc/total_count)) + total_acc, total_count = 0, 0 + start_time = time.time() + +def evaluate(dataloader): + model.eval() + total_acc, total_count = 0, 0 - # 학습률을 조절합니다 - scheduler.step() - - return train_loss / len(sub_train_), train_acc / len(sub_train_) - -def test(data_): - loss = 0 - acc = 0 - data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch) - for text, offsets, cls in data: - text, offsets, cls = text.to(device), offsets.to(device), cls.to(device) - with torch.no_grad(): - output = model(text, offsets) - loss = criterion(output, cls) - loss += loss.item() - acc += (output.argmax(1) == cls).sum().item() - - return loss / len(data_), acc / len(data_) + with torch.no_grad(): + for idx, (label, text, offsets) in enumerate(dataloader): + predited_label = model(text, offsets) + loss = criterion(predited_label, label) + total_acc += (predited_label.argmax(1) == label).sum().item() + total_count += label.size(0) + return total_acc/total_count ###################################################################### # 데이터셋을 분할하고 모델 수행하기 # --------------------------------- # -# 원본 AG_NEWS에는 검증용 데이터가 포함되어 있지 않기 때문에, 우리는 학습 +# 원본 ``AG_NEWS`` 에는 검증용 데이터가 포함되어 있지 않기 때문에, 우리는 학습 # 데이터를 학습 및 검증 데이터로 분할하려 합니다. 이때 데이터를 분할하는 # 비율은 0.95(학습)와 0.05(검증) 입니다. 우리는 여기서 PyTorch의 # 핵심 라이브러리 중 하나인 @@ -219,85 +256,118 @@ def test(data_): # 함수를 사용합니다. # # `CrossEntropyLoss `__ -# 기준(criterion)은 각 클래스에 대해 nn.LogSoftmax()와 nn.NLLLoss()를 -# 합쳐 놓은 방식입니다. +# 기준(criterion)은 각 클래스에 대해 ``nn.LogSoftmax()`` 와 ``nn.NLLLoss()`` 를 +# 합쳐놓은 방식입니다. # `SGD `__ # optimizer는 확률적 경사 하강법를 구현해놓은 것입니다. 처음의 학습률은 -# 4.0으로 두었습니다. 매 에폭을 진행하면서 학습률을 조절할 때는 +# 5.0으로 두었습니다. 매 에폭을 진행하면서 학습률을 조절할 때는 # `StepLR `__ # 을 사용합니다. 
# -import time from torch.utils.data.dataset import random_split -N_EPOCHS = 5 -min_valid_loss = float('inf') - -criterion = torch.nn.CrossEntropyLoss().to(device) -optimizer = torch.optim.SGD(model.parameters(), lr=4.0) -scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9) - -train_len = int(len(train_dataset) * 0.95) -sub_train_, sub_valid_ = \ - random_split(train_dataset, [train_len, len(train_dataset) - train_len]) - -for epoch in range(N_EPOCHS): - - start_time = time.time() - train_loss, train_acc = train_func(sub_train_) - valid_loss, valid_acc = test(sub_valid_) - - secs = int(time.time() - start_time) - mins = secs / 60 - secs = secs % 60 - - print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs)) - print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)') - print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)') +# Hyperparameters +EPOCHS = 10 # epoch +LR = 5 # learning rate +BATCH_SIZE = 64 # batch size for training + +criterion = torch.nn.CrossEntropyLoss() +optimizer = torch.optim.SGD(model.parameters(), lr=LR) +scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1) +total_accu = None +train_iter, test_iter = AG_NEWS() +train_dataset = list(train_iter) +test_dataset = list(test_iter) +num_train = int(len(train_dataset) * 0.95) +split_train_, split_valid_ = \ + random_split(train_dataset, [num_train, len(train_dataset) - num_train]) + +train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE, + shuffle=True, collate_fn=collate_batch) +valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE, + shuffle=True, collate_fn=collate_batch) +test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, + shuffle=True, collate_fn=collate_batch) + +for epoch in range(1, EPOCHS + 1): + epoch_start_time = time.time() + train(train_dataloader) + accu_val = evaluate(valid_dataloader) + if total_accu is not None and total_accu > accu_val: + scheduler.step() + else: + total_accu = accu_val + print('-' * 59) + print('| end of epoch {:3d} | time: {:5.2f}s | ' + 'valid accuracy {:8.3f} '.format(epoch, + time.time() - epoch_start_time, + accu_val)) + print('-' * 59) ###################################################################### -# 이 모델을 GPU 상에서 수행했을 때 다음과 같은 결과를 얻었습니다. 
-# -# Epoch: 1 \| time in 0 minutes, 11 seconds (에폭 1, 수행 시간 0분 11초) -# -# :: -# -# Loss: 0.0263(train) | Acc: 84.5%(train) -# Loss: 0.0001(valid) | Acc: 89.0%(valid) -# -# -# Epoch: 2 \| time in 0 minutes, 10 seconds (에폭 2, 수행 시간 0분 10초) -# -# :: -# -# Loss: 0.0119(train) | Acc: 93.6%(train) -# Loss: 0.0000(valid) | Acc: 89.6%(valid) -# -# -# Epoch: 3 \| time in 0 minutes, 9 seconds (에폭 3, 수행 시간 0분 9초) -# -# :: -# -# Loss: 0.0069(train) | Acc: 96.4%(train) -# Loss: 0.0000(valid) | Acc: 90.5%(valid) -# -# -# Epoch: 4 \| time in 0 minutes, 11 seconds (에폭 4, 수행 시간 0분 11초) -# -# :: -# -# Loss: 0.0038(train) | Acc: 98.2%(train) -# Loss: 0.0000(valid) | Acc: 90.4%(valid) -# -# -# Epoch: 5 \| time in 0 minutes, 11 seconds (에폭 5, 수행 시간 0분 11초) -# -# :: -# -# Loss: 0.0022(train) | Acc: 99.0%(train) -# Loss: 0.0000(valid) | Acc: 91.0%(valid) -# +# 이 모델을 GPU 상에서 수행하고 다음과 같은 결과를 얻었습니다: +# +# | epoch 1 | 500/ 1782 batches | accuracy 0.684 +# | epoch 1 | 1000/ 1782 batches | accuracy 0.852 +# | epoch 1 | 1500/ 1782 batches | accuracy 0.877 +# ----------------------------------------------------------- +# | end of epoch 1 | time: 8.33s | valid accuracy 0.867 +# ----------------------------------------------------------- +# | epoch 2 | 500/ 1782 batches | accuracy 0.895 +# | epoch 2 | 1000/ 1782 batches | accuracy 0.900 +# | epoch 2 | 1500/ 1782 batches | accuracy 0.903 +# ----------------------------------------------------------- +# | end of epoch 2 | time: 8.18s | valid accuracy 0.890 +# ----------------------------------------------------------- +# | epoch 3 | 500/ 1782 batches | accuracy 0.914 +# | epoch 3 | 1000/ 1782 batches | accuracy 0.914 +# | epoch 3 | 1500/ 1782 batches | accuracy 0.916 +# ----------------------------------------------------------- +# | end of epoch 3 | time: 8.20s | valid accuracy 0.897 +# ----------------------------------------------------------- +# | epoch 4 | 500/ 1782 batches | accuracy 0.926 +# | epoch 4 | 1000/ 1782 batches | accuracy 0.924 +# | epoch 4 | 1500/ 1782 batches | accuracy 0.921 +# ----------------------------------------------------------- +# | end of epoch 4 | time: 8.18s | valid accuracy 0.895 +# ----------------------------------------------------------- +# | epoch 5 | 500/ 1782 batches | accuracy 0.938 +# | epoch 5 | 1000/ 1782 batches | accuracy 0.935 +# | epoch 5 | 1500/ 1782 batches | accuracy 0.937 +# ----------------------------------------------------------- +# | end of epoch 5 | time: 8.16s | valid accuracy 0.902 +# ----------------------------------------------------------- +# | epoch 6 | 500/ 1782 batches | accuracy 0.939 +# | epoch 6 | 1000/ 1782 batches | accuracy 0.939 +# | epoch 6 | 1500/ 1782 batches | accuracy 0.938 +# ----------------------------------------------------------- +# | end of epoch 6 | time: 8.16s | valid accuracy 0.906 +# ----------------------------------------------------------- +# | epoch 7 | 500/ 1782 batches | accuracy 0.941 +# | epoch 7 | 1000/ 1782 batches | accuracy 0.939 +# | epoch 7 | 1500/ 1782 batches | accuracy 0.939 +# ----------------------------------------------------------- +# | end of epoch 7 | time: 8.19s | valid accuracy 0.903 +# ----------------------------------------------------------- +# | epoch 8 | 500/ 1782 batches | accuracy 0.942 +# | epoch 8 | 1000/ 1782 batches | accuracy 0.941 +# | epoch 8 | 1500/ 1782 batches | accuracy 0.942 +# ----------------------------------------------------------- +# | end of epoch 8 | time: 8.16s | valid accuracy 0.904 +# 
----------------------------------------------------------- +# | epoch 9 | 500/ 1782 batches | accuracy 0.942 +# | epoch 9 | 1000/ 1782 batches | accuracy 0.941 +# | epoch 9 | 1500/ 1782 batches | accuracy 0.942 +# ----------------------------------------------------------- +# end of epoch 9 | time: 8.16s | valid accuracy 0.904 +# ----------------------------------------------------------- +# | epoch 10 | 500/ 1782 batches | accuracy 0.940 +# | epoch 10 | 1000/ 1782 batches | accuracy 0.942 +# | epoch 10 | 1500/ 1782 batches | accuracy 0.942 +# ----------------------------------------------------------- +# | end of epoch 10 | time: 8.15s | valid accuracy 0.904 +# ----------------------------------------------------------- ###################################################################### @@ -305,18 +375,20 @@ def test(data_): # --------------------------- # -print('Checking the results of test dataset...') -test_loss, test_acc = test(test_dataset) -print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)') ###################################################################### # 평가 데이터셋을 통한 결과를 확인합니다... +print('Checking the results of test dataset.') +accu_test = evaluate(test_dataloader) +print('test accuracy {:8.3f}'.format(accu_test)) + +################################################ # # :: # -# Loss: 0.0237(test) | Acc: 90.5%(test) +# test accuracy 0.906 # @@ -324,26 +396,17 @@ def test(data_): # 임의의 뉴스로 평가하기 # ---------------------- # -# 현재까지 구한 최고의 모델로 골프 뉴스를 테스트해보려 합니다. 레이블에 -# 대한 정보는 -# `여기에 `__ -# 나와 있습니다. +# 현재까지 최고의 모델로 골프 뉴스를 테스트해보겠습니다. # -import re -from torchtext.data.utils import ngrams_iterator -from torchtext.data.utils import get_tokenizer - -ag_news_label = {1 : "World", - 2 : "Sports", - 3 : "Business", - 4 : "Sci/Tec"} +ag_news_label = {1: "World", + 2: "Sports", + 3: "Business", + 4: "Sci/Tec"} -def predict(text, model, vocab, ngrams): - tokenizer = get_tokenizer("basic_english") +def predict(text, text_pipeline): with torch.no_grad(): - text = torch.tensor([vocab[token] - for token in ngrams_iterator(tokenizer(text), ngrams)]) + text = torch.tensor(text_pipeline(text)) output = model(text, torch.tensor([0])) return output.argmax(1).item() + 1 @@ -359,17 +422,13 @@ def predict(text, model, vocab, ngrams): was even more impressive considering he’d never played the \ front nine at TPC Southwind." -vocab = train_dataset.get_vocab() model = model.to("cpu") print("This is a %s news" %ag_news_label[predict(ex_text_str, model, vocab, 2)]) -###################################################################### -# This is a Sports news (스포츠 뉴스) +################################################ # - - -###################################################################### -# 이 튜토리얼에서 사용한 예제 코드는 -# `여기에서 `__ -# 확인하실 수 있습니다. +# :: +# +# This is a Sports news +# \ No newline at end of file diff --git a/beginner_source/torchtext_translation_tutorial.py b/beginner_source/torchtext_translation.py similarity index 68% rename from beginner_source/torchtext_translation_tutorial.py rename to beginner_source/torchtext_translation.py index 68b64682e..5da1bbe7a 100644 --- a/beginner_source/torchtext_translation_tutorial.py +++ b/beginner_source/torchtext_translation.py @@ -2,32 +2,28 @@ TorchText로 언어 번역하기 =================================== -이 튜토리얼에서는 ``torchtext`` 의 유용한 여러 클래스들과 시퀀스 투 시퀀스(sequence-to-sequence, seq2seq)모델을 통해 -영어와 독일어 문장들이 포함된 유명한 데이터 셋을 이용해서 독일어 문장을 영어로 번역해 볼 것입니다. 
+이 튜토리얼에서는 ``torchtext`` 를 사용하여 영어와 독일어 문장들이 포함된 잘 알려진 데이터셋을 전처리(preprocess)하고 +이를 사용하여 독일어 문장을 영어로 번역하는 시퀀스-투-시퀀스(sequence-to-sequence, seq2seq) 모델을 학습하는 방법을 +살펴보겠습니다. 이 튜토리얼은 PyTorch 커뮤니티 멤버인 `Ben Trevett `__ 이 작성한 -`튜토리얼 `__ 에 기초하고 있으며 -`Seth Weidman `__ 이 Ben의 허락을 받고 만들었습니다. +`튜토리얼 `__ +에 기초하고 있으며 Ben의 허락을 받고 만들었습니다. 몇몇 기존 코드들을 제거하고 튜토리얼을 업데이트하였습니다. -이 튜토리얼을 통해 여러분은 다음과 같은 것을 할 수 있게 됩니다: - -- ``torchtext`` 의 아래와 같은 유용한 클래스들을 통해 문장들을 NLP모델링에 자주 사용되는 형태로 전처리할 수 있게 됩니다: - - `TranslationDataset `__ - - `Field `__ - - `BucketIterator `__ +이 튜토리얼을 통해 NLP 모델링을 위해 문장들을 텐서(tensor)로 전처리하고, 모델을 학습하고 검증하기 위해 +`torch.utils.data.DataLoader `__ +을 사용할 수 있게 됩니다. """ ###################################################################### -# `Field` 와 `TranslationDataset` +# 데이터 처리하기 # -------------------------------- # -# ``torchtext`` 에는 언어 변환 모델을 만들때 쉽게 사용할 수 있는 데이터셋을 만들기 적합한 다양한 도구가 있습니다. -# 그 중에서도 중요한 클래스 중 하나인 `Field `__ 는 -# 각 문장이 어떻게 전처리되어야 하는지 지정하며, 또 다른 중요한 클래스로는 `TranslationDataset` 이 있습니다. -# ``torchtext`` 에는 이 외에도 비슷한 데이터셋들이 있는데, 이번 튜토리얼에서는 `Multi30k dataset `__ 을 사용할 것입니다. -# 이 데이터 셋은 평균 약 13개의 단어로 구성된 약 삼만 개의 문장을 영어와 독일어 두 언어로 포함하고 있습니다. +# ``torchtext`` 에는 언어 변환 모델을 만들 때 쉽게 사용할 수 있는 데이터셋을 만들기 적합한 다양한 도구가 있습니다. +# 이 예제에서는 가공되지 않은 텍스트 문장(raw text sentence)을 토큰화(tokenize)하고, 어휘집(vocabulary)을 만들고, +# 토큰을 텐서로 숫자화(numericalize)하는 방법을 알아보겠습니다. # # 참고 : 이 튜토리얼에서의 토큰화(tokenization)에는 `Spacy `__ 가 필요합니다. # Spacy는 영어 이 외의 다른 언어에 대한 강력한 토큰화 기능을 제공하기 때문에 사용합니다. ``torchtext`` 는 @@ -42,72 +38,95 @@ # # python -m spacy download en # python -m spacy download de -# -# Spacy가 설치되어 있다면, 다음 코드는 ``TranslationDataset`` 에 있는 각 문장을 ``Field`` 에 정의된 -# 내용을 기반으로 토큰화할 것입니다. - -from torchtext.datasets import Multi30k -from torchtext.data import Field, BucketIterator - -SRC = Field(tokenize = "spacy", - tokenizer_language="de", - init_token = '', - eos_token = '', - lower = True) - -TRG = Field(tokenize = "spacy", - tokenizer_language="en", - init_token = '', - eos_token = '', - lower = True) - -train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), - fields = (SRC, TRG)) -###################################################################### -# 이제 ``train_data`` 를 정의했으니, ``torchtext`` 의 ``Field`` 에 있는 엄청나게 유용한 기능을 -# 보게 될 것입니다 : 바로 ``build_vovab`` 메소드(method)로 각 언어와 연관된 어휘들을 만들어 낼 것입니다. 
+import torchtext +import torch +from torchtext.data.utils import get_tokenizer +from collections import Counter +from torchtext.vocab import Vocab +from torchtext.utils import download_from_url, extract_archive +import io + +url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/' +train_urls = ('train.de.gz', 'train.en.gz') +val_urls = ('val.de.gz', 'val.en.gz') +test_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz') + +train_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls] +val_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls] +test_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls] + +de_tokenizer = get_tokenizer('spacy', language='de') +en_tokenizer = get_tokenizer('spacy', language='en') + +def build_vocab(filepath, tokenizer): + counter = Counter() + with io.open(filepath, encoding="utf8") as f: + for string_ in f: + counter.update(tokenizer(string_)) + return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>']) + +de_vocab = build_vocab(train_filepaths[0], de_tokenizer) +en_vocab = build_vocab(train_filepaths[1], en_tokenizer) + +def data_process(filepaths): + raw_de_iter = iter(io.open(filepaths[0], encoding="utf8")) + raw_en_iter = iter(io.open(filepaths[1], encoding="utf8")) + data = [] + for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter): + de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de)], + dtype=torch.long) + en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)], + dtype=torch.long) + data.append((de_tensor_, en_tensor_)) + return data + +train_data = data_process(train_filepaths) +val_data = data_process(val_filepaths) +test_data = data_process(test_filepaths) -SRC.build_vocab(train_data, min_freq = 2) -TRG.build_vocab(train_data, min_freq = 2) ###################################################################### -# 위 코드가 실행되면, ``SRC.vocab.stoi`` 는 어휘에 해당하는 토큰을 키로, 관련된 색인을 값으로 가지는 -# 사전(dict)이 됩니다. ``SRC.vocab.itos`` 역시 사전(dict)이지만, 키와 값이 서로 반대입니다. 이 튜토리얼에서는 -# 그다지 중요하지 않은 내용이지만, 이런 특성은 다른 자연어 처리 등에서 유용하게 사용할 수 있습니다. - -###################################################################### -# ``BucketIterator`` +# ``DataLoader`` # -------------------- -# 마지막으로 사용해 볼 ``torchtext`` 에 특화된 기능은 바로 ``BucketIterator`` 입니다. -# 첫 번째 인자로 ``TranslationDataset`` 을 전달받기 때문에 사용하기가 쉽습니다. 문서에서도 볼 수 있듯 -# 이 기능은 비슷한 길이의 예제들을 묶어주는 반복자(iterator)를 정의합니다. 각각의 새로운 에포크(epoch)마다 -# 새로 섞인 결과를 만드는데 필요한 패딩의 수를 최소화 합니다. 버케팅 과정에서 사용되는 저장 공간을 한번 살펴보시기 바랍니다. +# 마지막으로 사용해 볼 ``torch`` 에 특화된 기능은 바로 ``DataLoader`` 로, +# 첫 번째 인자로 데이터를 전달받기 때문에 사용하기가 쉽습니다. 문서에서도 볼 수 있듯이, +# ``DataLoader`` 는 데이터셋과 샘플러를 결합하고, 주어진 데이터셋에 반복 기능을 제공합니다. +# ``DataLoader`` 는 맵 형태(map-style)과 반복 가능한 형태(iterable-style) 데이터셋을 모두 지원하며, +# 단일 또는 다중 프로세스로 불러오거나, 불러오는 순서를 조정(customize)하거나 +# 선택적 자동 일괄 처리(optional automatic batching), 메모리 피닝(memory pinning)을 지원합니다. +# +# 샘플 목록을 병합(merge)하여 Tensor의 미니배치를 구성하는 ``collate_fn`` (선택 사항)을 살펴보십시오. +# 맵 형태(map-style) 데이터셋을 일괄로 불러올 때 사용됩니다.
import torch device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') BATCH_SIZE = 128 +PAD_IDX = de_vocab['<pad>'] +BOS_IDX = de_vocab['<bos>'] +EOS_IDX = de_vocab['<eos>'] + +from torch.nn.utils.rnn import pad_sequence +from torch.utils.data import DataLoader + +def generate_batch(data_batch): + de_batch, en_batch = [], [] + for (de_item, en_item) in data_batch: + de_batch.append(torch.cat([torch.tensor([BOS_IDX]), de_item, torch.tensor([EOS_IDX])], dim=0)) + en_batch.append(torch.cat([torch.tensor([BOS_IDX]), en_item, torch.tensor([EOS_IDX])], dim=0)) + de_batch = pad_sequence(de_batch, padding_value=PAD_IDX) + en_batch = pad_sequence(en_batch, padding_value=PAD_IDX) + return de_batch, en_batch + +train_iter = DataLoader(train_data, batch_size=BATCH_SIZE, + shuffle=True, collate_fn=generate_batch) +valid_iter = DataLoader(val_data, batch_size=BATCH_SIZE, + shuffle=True, collate_fn=generate_batch) +test_iter = DataLoader(test_data, batch_size=BATCH_SIZE, + shuffle=True, collate_fn=generate_batch) -train_iterator, valid_iterator, test_iterator = BucketIterator.splits( - (train_data, valid_data, test_data), - batch_size = BATCH_SIZE, - device = device) - -###################################################################### -# 이 반복자들은 ``DataLoader`` 와 마찬가지로 호출할 수 있습니다. 아래 ``train`` 과 -# ``evaluation`` 함수에서 보면, 다음과 같이 간단히 호출할 수 있음을 알 수 있습니다 : -# :: -# -# for i, batch in enumerate(iterator): -# -# 각 ``batch`` 는 ``src`` 와 ``trg`` 속성을 가지게 됩니다. -# -# :: -# -# src = batch.src -# trg = batch.trg ###################################################################### # ``nn.Module`` 과 ``Optimizer`` 정의하기 @@ -313,8 +332,8 @@ def forward(self, return outputs -INPUT_DIM = len(SRC.vocab) -OUTPUT_DIM = len(TRG.vocab) +INPUT_DIM = len(de_vocab) +OUTPUT_DIM = len(en_vocab) # ENC_EMB_DIM = 256 # DEC_EMB_DIM = 256 # ENC_HID_DIM = 512 @@ -363,7 +382,7 @@ def count_parameters(model: nn.Module): # 참고 : 언어 번역의 성능 점수를 기록하려면, ``nn.CrossEntropyLoss`` 함수가 단순한 # 패딩을 추가하는 부분을 무시할 수 있도록 해당 색인들을 알려줘야 합니다.
-PAD_IDX = TRG.vocab.stoi['<pad>'] +PAD_IDX = en_vocab.stoi['<pad>'] criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX) @@ -375,7 +394,7 @@ def count_parameters(model: nn.Module): def train(model: nn.Module, - iterator: BucketIterator, + iterator: torch.utils.data.DataLoader, optimizer: optim.Optimizer, criterion: nn.Module, clip: float): @@ -384,10 +403,8 @@ def train(model: nn.Module, epoch_loss = 0 - for _, batch in enumerate(iterator): - - src = batch.src - trg = batch.trg + for _, (src, trg) in enumerate(iterator): + src, trg = src.to(device), trg.to(device) optimizer.zero_grad() @@ -410,7 +427,7 @@ def train(model: nn.Module, def evaluate(model: nn.Module, - iterator: BucketIterator, + iterator: torch.utils.data.DataLoader, criterion: nn.Module): model.eval() @@ -419,10 +436,8 @@ def evaluate(model: nn.Module, with torch.no_grad(): - for _, batch in enumerate(iterator): - - src = batch.src - trg = batch.trg + for _, (src, trg) in enumerate(iterator): + src, trg = src.to(device), trg.to(device) output = model(src, trg, 0) #turn off teacher forcing @@ -453,8 +468,8 @@ def epoch_time(start_time: int, start_time = time.time() - train_loss = train(model, train_iterator, optimizer, criterion, CLIP) - valid_loss = evaluate(model, valid_iterator, criterion) + train_loss = train(model, train_iter, optimizer, criterion, CLIP) + valid_loss = evaluate(model, valid_iter, criterion) end_time = time.time() @@ -464,7 +479,7 @@ def epoch_time(start_time: int, print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}') print(f'\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}') -test_loss = evaluate(model, test_iterator, criterion) +test_loss = evaluate(model, test_iter, criterion) print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |') diff --git a/beginner_source/transformer_tutorial.py b/beginner_source/transformer_tutorial.py index 8fbe8946d..4133b6208 100644 --- a/beginner_source/transformer_tutorial.py +++ b/beginner_source/transformer_tutorial.py @@ -53,7 +53,6 @@ def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): super(TransformerModel, self).__init__() from torch.nn import TransformerEncoder, TransformerEncoderLayer self.model_type = 'Transformer' - self.src_mask = None self.pos_encoder = PositionalEncoding(ninp, dropout) encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout) self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers) @@ -63,7 +62,7 @@ def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): self.init_weights() - def _generate_square_subsequent_mask(self, sz): + def generate_square_subsequent_mask(self, sz): mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) return mask @@ -74,15 +73,10 @@ def init_weights(self): self.decoder.bias.data.zero_() self.decoder.weight.data.uniform_(-initrange, initrange) - def forward(self, src): - if self.src_mask is None or self.src_mask.size(0) != len(src): - device = src.device - mask = self._generate_square_subsequent_mask(len(src)).to(device) - self.src_mask = mask - + def forward(self, src, src_mask): src = self.encoder(src) * math.sqrt(self.ninp) src = self.pos_encoder(src) - output = self.transformer_encoder(src, self.src_mask) + output = self.transformer_encoder(src, src_mask) output = self.decoder(output) return output @@ -119,7 +113,7 @@ def forward(self, x):
###################################################################### -# 학습 과정에서는 ``torchtext`` 의 Wikitext-2 데이터셋을 이용합니다. +# 이 튜토리얼에서는 ``torchtext`` 를 사용하여 Wikitext-2 데이터셋을 생성합니다. # 단어 오브젝트는 훈련 데이터셋(train dataset) 에 의하여 만들어지고, 토큰을 텐서(tensor)로 수치화하는데 사용됩니다. # 시퀀스 데이터로부터 시작하여, ``batchify()`` 함수는 데이터셋을 컬럼들로 배열하고, ``batch_size`` 사이즈의 배치들로 나눈 후에 남은 모든 토큰을 버립니다. # 예를 들어, 알파벳을 시퀀스(총 길이 26) 로 생각하고 배치 사이즈를 4라고 한다면, 우리는 알파벳을 길이가 6인 4개의 시퀀스로 나눌 수 있습니다. @@ -139,18 +133,33 @@ def forward(self, x): # 이 컬럼들은 모델에 의해서 독립적으로 취급되며, 이것은 더 효율적인 배치 프로세싱(batch processing) 이 가능하지만, ``G`` 와 ``F`` 의 의존성이 학습될 수 없다는 것을 의미합니다. # -import torchtext +import io +import torch +from torchtext.datasets import WikiText2 from torchtext.data.utils import get_tokenizer -TEXT = torchtext.data.Field(tokenize=get_tokenizer("basic_english"), - init_token='', - eos_token='', - lower=True) -train_txt, val_txt, test_txt = torchtext.datasets.WikiText2.splits(TEXT) -TEXT.build_vocab(train_txt) +from collections import Counter +from torchtext.vocab import Vocab + +train_iter = WikiText2(split='train') +tokenizer = get_tokenizer('basic_english') +counter = Counter() +for line in train_iter: + counter.update(tokenizer(line)) +vocab = Vocab(counter) + +def data_process(raw_text_iter): + data = [torch.tensor([vocab[token] for token in tokenizer(item)], + dtype=torch.long) for item in raw_text_iter] + return torch.cat(tuple(filter(lambda t: t.numel() > 0, data))) + +train_iter, val_iter, test_iter = WikiText2() +train_data = data_process(train_iter) +val_data = data_process(val_iter) +test_data = data_process(test_iter) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") def batchify(data, bsz): - data = TEXT.numericalize([data.examples[0].text]) # 데이터셋을 bsz 파트들로 나눕니다. nbatch = data.size(0) // bsz # 깔끔하게 나누어 떨어지지 않는 추가적인 부분(나머지들) 은 잘라냅니다. @@ -161,9 +170,9 @@ def batchify(data, bsz): batch_size = 20 eval_batch_size = 10 -train_data = batchify(train_txt, batch_size) -val_data = batchify(val_txt, eval_batch_size) -test_data = batchify(test_txt, eval_batch_size) +train_data = batchify(train_data, batch_size) +val_data = batchify(val_data, eval_batch_size) +test_data = batchify(test_data, eval_batch_size) ###################################################################### @@ -188,7 +197,7 @@ def batchify(data, bsz): def get_batch(source, i): seq_len = min(bptt, len(source) - 1 - i) data = source[i:i+seq_len] - target = source[i+1:i+1+seq_len].view(-1) + target = source[i+1:i+1+seq_len].reshape(-1) return data, target @@ -203,7 +212,7 @@ def get_batch(source, i): # 단어 사이즈는 단어 오브젝트의 길이와 일치 합니다. # -ntokens = len(TEXT.vocab.stoi) # 단어 사전의 크기 +ntokens = len(vocab.stoi) # 단어 사전(어휘집)의 크기 emsize = 200 # 임베딩 차원 nhid = 200 # nn.TransformerEncoder 에서 피드포워드 네트워크(feedforward network) 모델의 차원 nlayers = 2 # nn.TransformerEncoder 내부의 nn.TransformerEncoderLayer 개수 @@ -242,11 +251,13 @@ def train(): model.train() # 학습 모드를 시작합니다. total_loss = 0. 
start_time = time.time() - ntokens = len(TEXT.vocab.stoi) + src_mask = model.generate_square_subsequent_mask(bptt).to(device) for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)): data, targets = get_batch(train_data, i) optimizer.zero_grad() - output = model(data) + if data.size(0) != bptt: + src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device) + output = model(data, src_mask) loss = criterion(output.view(-1, ntokens), targets) loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5) @@ -260,7 +271,7 @@ def train(): print('| epoch {:3d} | {:5d}/{:5d} batches | ' 'lr {:02.2f} | ms/batch {:5.2f} | ' 'loss {:5.2f} | ppl {:8.2f}'.format( - epoch, batch, len(train_data) // bptt, scheduler.get_lr()[0], + epoch, batch, len(train_data) // bptt, scheduler.get_last_lr()[0], elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss))) total_loss = 0 @@ -269,11 +280,13 @@ def train(): def evaluate(eval_model, data_source): eval_model.eval() # 평가 모드를 시작합니다. total_loss = 0. - ntokens = len(TEXT.vocab.stoi) + src_mask = model.generate_square_subsequent_mask(bptt).to(device) with torch.no_grad(): for i in range(0, data_source.size(0) - 1, bptt): data, targets = get_batch(data_source, i) - output = eval_model(data) + if data.size(0) != bptt: + src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device) + output = eval_model(data, src_mask) output_flat = output.view(-1, ntokens) total_loss += len(data) * criterion(output_flat, targets).item() return total_loss / (len(data_source) - 1) diff --git a/beginner_source/vt_tutorial.py b/beginner_source/vt_tutorial.py new file mode 100644 index 000000000..2c218ae8a --- /dev/null +++ b/beginner_source/vt_tutorial.py @@ -0,0 +1,287 @@ +""" +Optimizing Vision Transformer Model for Deployment +================================================== + +Vision Transformer models apply the cutting-edge attention-based +transformer models, introduced in Natural Language Processing to achieve +all kinds of state-of-the-art (SOTA) results, to Computer Vision +tasks. Facebook Data-efficient Image Transformers `DeiT `_ +is a Vision Transformer model trained on ImageNet for image +classification. + +In this tutorial, we will first cover what DeiT is and how to use it, +then go through the complete steps of scripting, quantizing, optimizing, +and using the model in iOS and Android apps. We will also compare the +performance of quantized, optimized and non-quantized, non-optimized +models, and show the benefits of applying quantization and optimization +to the model along the steps. + +""" + + + +############################################################################### +# What is DeiT +# --------------------- +# +# Convolutional Neural Networks (CNNs) have been the main models for image +# classification since deep learning took off in 2012, but CNNs typically +# require hundreds of millions of images for training to achieve the +# SOTA results. DeiT is a vision transformer model that requires a lot less +# data and computing resources for training to compete with the leading +# CNNs in performing image classification, which is made possible by two +# key components of DeiT: +# +# - Data augmentation that simulates training on a much larger dataset; +# - Native distillation that allows the transformer network to learn from +# a CNN’s output. +# +# DeiT shows that Transformers can be successfully applied to computer +# vision tasks, with limited access to data and resources.
For more +# details on DeiT, see the `repo `_ +# and `paper `_. +# + + +###################################################################### +# Classifying Images with DeiT +# ------------------------------- +# +# Follow the README at the DeiT repo for detailed information on how to +# classify images using DeiT, or for a quick test, first install the +# required packages: +# +# :: +# +# pip install torch torchvision +# pip install timm +# pip install pandas +# pip install requests +# +# then run the script below: +# + + +from PIL import Image +import torch +import timm +import requests +import torchvision.transforms as transforms +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD + +print(torch.__version__) +# should be 1.8.0 + + +model = torch.hub.load('facebookresearch/deit:main', 'deit_base_patch16_224', pretrained=True) +model.eval() + +transform = transforms.Compose([ + transforms.Resize(256, interpolation=3), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), +]) + +img = Image.open(requests.get("https://raw.githubusercontent.com/pytorch/ios-demo-app/master/HelloWorld/HelloWorld/HelloWorld/image.png", stream=True).raw) +img = transform(img)[None,] +out = model(img) +clsidx = torch.argmax(out) +print(clsidx.item()) + + +###################################################################### +# The output should be 269, which, according to the ImageNet list of class +# index to `labels file `_, maps to ‘timber +# wolf, grey wolf, gray wolf, Canis lupus’. +# +# Now that we have verified that we can use the DeiT model to classify +# images, let’s see how to modify the model so it can run on iOS and +# Android apps. +# + + +###################################################################### +# Scripting DeiT +# ---------------------- +# To use the model on mobile, we first need to script the +# model. See the `Script and Optimize recipe `_ for a +# quick overview. Run the code below to convert the DeiT model used in the +# previous step to the TorchScript format that can run on mobile. +# + + +model = torch.hub.load('facebookresearch/deit:main', 'deit_base_patch16_224', pretrained=True) +model.eval() +scripted_model = torch.jit.script(model) +scripted_model.save("fbdeit_scripted.pt") + + +###################################################################### +# The scripted model file fbdeit_scripted.pt of size about 346MB is +# generated. +# + + +###################################################################### +# Quantizing DeiT +# --------------------- +# To reduce the trained model size significantly while +# keeping the inference accuracy about the same, quantization can be +# applied to the model. Thanks to the transformer model used in DeiT, we +# can easily apply dynamic-quantization to the model, because dynamic +# quantization works best for LSTM and transformer models (see `here `_ +# for more details). 
+ +# +# Now run the code below: +# + +# Use 'fbgemm' for server inference and 'qnnpack' for mobile inference +backend = "fbgemm" # replacing this with 'qnnpack' caused much worse inference speed for the quantized model in this notebook +model.qconfig = torch.quantization.get_default_qconfig(backend) +torch.backends.quantized.engine = backend + +quantized_model = torch.quantization.quantize_dynamic(model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8) +scripted_quantized_model = torch.jit.script(quantized_model) +scripted_quantized_model.save("fbdeit_scripted_quantized.pt") + + +###################################################################### +# This generates the scripted and quantized version of the model +# fbdeit_scripted_quantized.pt, with size about 89MB, a 74% reduction of +# the non-quantized model size of 346MB! +# + +###################################################################### +# You can use the ``scripted_quantized_model`` to generate the same +# inference result: +# + +out = scripted_quantized_model(img) +clsidx = torch.argmax(out) +print(clsidx.item()) +# The same output 269 should be printed + +###################################################################### +# Optimizing DeiT +# --------------------- +# The final step before using the quantized and scripted +# model on mobile is to optimize it: +# + +from torch.utils.mobile_optimizer import optimize_for_mobile +optimized_scripted_quantized_model = optimize_for_mobile(scripted_quantized_model) +optimized_scripted_quantized_model.save("fbdeit_optimized_scripted_quantized.pt") + + +###################################################################### +# The generated fbdeit_optimized_scripted_quantized.pt file has about the +# same size as the quantized, scripted, but non-optimized model. The +# inference result remains the same. +# + + + +out = optimized_scripted_quantized_model(img) +clsidx = torch.argmax(out) +print(clsidx.item()) +# Again, the same output 269 should be printed + + +###################################################################### +# Using Lite Interpreter +# ------------------------ +# +# To see how much model size reduction and inference speed-up the Lite +# Interpreter can deliver, let’s create the lite version of the model. +# + +optimized_scripted_quantized_model._save_for_lite_interpreter("fbdeit_optimized_scripted_quantized_lite.ptl") +ptl = torch.jit.load("fbdeit_optimized_scripted_quantized_lite.ptl") + + +###################################################################### +# Although the lite model size is comparable to the non-lite version, an +# inference speed-up is expected when running the lite version on mobile.
+# + + +###################################################################### +# Comparing Inference Speed +# --------------------------- +# +# To see how the inference speed differs for the four models - the +# original model, the scripted model, the quantized-and-scripted model, +# the optimized-quantized-and-scripted model - run the code below: +# + +with torch.autograd.profiler.profile(use_cuda=False) as prof1: + out = model(img) +with torch.autograd.profiler.profile(use_cuda=False) as prof2: + out = scripted_model(img) +with torch.autograd.profiler.profile(use_cuda=False) as prof3: + out = scripted_quantized_model(img) +with torch.autograd.profiler.profile(use_cuda=False) as prof4: + out = optimized_scripted_quantized_model(img) +with torch.autograd.profiler.profile(use_cuda=False) as prof5: + out = ptl(img) + +print("original model: {:.2f}ms".format(prof1.self_cpu_time_total/1000)) +print("scripted model: {:.2f}ms".format(prof2.self_cpu_time_total/1000)) +print("scripted & quantized model: {:.2f}ms".format(prof3.self_cpu_time_total/1000)) +print("scripted & quantized & optimized model: {:.2f}ms".format(prof4.self_cpu_time_total/1000)) +print("lite model: {:.2f}ms".format(prof5.self_cpu_time_total/1000)) + +###################################################################### +# The results running on a Google Colab are: +# +# :: +# +# original model: 1236.69ms +# scripted model: 1226.72ms +# scripted & quantized model: 593.19ms +# scripted & quantized & optimized model: 598.01ms +# lite model: 600.72ms +# + + +###################################################################### +# The following results summarize the inference time taken by each model +# and the percentage reduction of each model relative to the original +# model. +# + +import pandas as pd +import numpy as np + +df = pd.DataFrame({'Model': ['original model','scripted model', 'scripted & quantized model', 'scripted & quantized & optimized model', 'lite model']}) +df = pd.concat([df, pd.DataFrame([ + ["{:.2f}ms".format(prof1.self_cpu_time_total/1000), "0%"], + ["{:.2f}ms".format(prof2.self_cpu_time_total/1000), + "{:.2f}%".format((prof1.self_cpu_time_total-prof2.self_cpu_time_total)/prof1.self_cpu_time_total*100)], + ["{:.2f}ms".format(prof3.self_cpu_time_total/1000), + "{:.2f}%".format((prof1.self_cpu_time_total-prof3.self_cpu_time_total)/prof1.self_cpu_time_total*100)], + ["{:.2f}ms".format(prof4.self_cpu_time_total/1000), + "{:.2f}%".format((prof1.self_cpu_time_total-prof4.self_cpu_time_total)/prof1.self_cpu_time_total*100)], + ["{:.2f}ms".format(prof5.self_cpu_time_total/1000), + "{:.2f}%".format((prof1.self_cpu_time_total-prof5.self_cpu_time_total)/prof1.self_cpu_time_total*100)]], + columns=['Inference Time', 'Reduction'])], axis=1) + +print(df) + +""" + Model Inference Time Reduction +0 original model 1236.69ms 0% +1 scripted model 1226.72ms 0.81% +2 scripted & quantized model 593.19ms 52.03% +3 scripted & quantized & optimized model 598.01ms 51.64% +4 lite model 600.72ms 51.43% +""" + +###################################################################### +# Learn More +# ~~~~~~~~~~~~~~~~~ +# +# - `Facebook Data-efficient Image Transformers `__ +# - `Vision Transformer with ImageNet and MNIST on iOS `__ +# - `Vision Transformer with ImageNet and MNIST on Android `__ diff --git a/conf.py b/conf.py index d21ec1416..98f3dd5d1 100644 --- a/conf.py +++ b/conf.py @@ -57,7 +57,7 @@ # Add any Sphinx extension module names here, as strings. 
They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = ['sphinx.ext.mathjax', +extensions = ['sphinx.ext.mathjax', 'sphinx_copybutton', 'sphinx_gallery.gen_gallery', 'sphinx_sitemap'] diff --git a/index.rst b/index.rst index 3cf443f62..743d83f29 100644 --- a/index.rst +++ b/index.rst @@ -1,6 +1,11 @@ 파이토치(PyTorch) 튜토리얼에 오신 것을 환영합니다 ================================================== +.. raw:: html + +

여러분의 생각을 들려주세요. (비공식) 한국어 PyTorch 튜토리얼 독자 설문에 참여 부탁드립니다.

+ + .. raw:: html
@@ -9,10 +14,10 @@ .. Add callout items below this line .. customcalloutitem:: - :description: 60분 만에 끝장내기는 PyTorch를 어떻게 사용하는지 대략적으로 알아볼 수 있는 일반적인 시작점입니다. 심층신경망 모델 구성에 대한 기본적인 내용을 다룹니다. - :header: PyTorch가 처음이신가요? - :button_link: beginner/deep_learning_60min_blitz.html - :button_text: 60분 만에 끝장내기 시작 + :description: Familiarize yourself with PyTorch concepts and modules. Learn how to load data, build deep neural networks, train and save your models in this quickstart guide. + :header: Learn the Basics + :button_link: beginner/basics/intro.html + :button_text: Get started with PyTorch .. customcalloutitem:: :description: 한 입 크기의, 바로 사용할 수 있는 PyTorch 코드 예제들을 확인해보세요. @@ -51,8 +56,8 @@ .. Learning PyTorch .. customcarditem:: - :header: 파이토치(PyTorch)로 딥러닝하기: 60분만에 끝장내기 - :card_description: 높은 수준에서 PyTorch의 텐서 라이브러리와 신경망을 이해합니다. + :header: Learn the Basics + :card_description: A step-by-step guide to building a complete ML workflow with PyTorch. :image: _static/img/thumbnails/cropped/60-min-blitz.png :link: beginner/deep_learning_60min_blitz.html :tags: Getting-Started @@ -94,6 +99,13 @@ :link: beginner/transfer_learning_tutorial.html :tags: Image/Video +.. customcarditem:: + :header: Optimizing Vision Transformer Model + :card_description: Apply cutting-edge, attention-based transformer models to computer vision tasks. + :image: _static/img/thumbnails/cropped/60-min-blitz.png + :link: beginner/vt_tutorial.html + :tags: Image/Video + .. customcarditem:: :header: 적대적 예제 생성(Adversarial Example Generation) :card_description: 가장 많이 사용되는 공격 방법 중 하나인 FGSM (Fast Gradient Sign Attack)을 이용해 MNIST 분류기를 속이는 방법을 배웁니다. @@ -117,6 +129,13 @@ :link: beginner/audio_preprocessing_tutorial.html :tags: Audio + .. customcarditem:: + :header: Speech Command Recognition + :card_description: Learn how to correctly format an audio dataset and then train/test an audio classifier network on the dataset. + :image: _static/img/thumbnails/cropped/torchaudio-speech.png + :link: intermediate/speech_command_recognition_with_torchaudio.html + :tags: Audio + .. Text .. customcarditem:: @@ -158,7 +177,7 @@ :header: TorchText로 언어 번역하기 :card_description: 영어와 독어가 포함된 잘 알려진 데이터셋을 torchtext를 사용하여 전처리한 뒤, 시퀀스-투-시퀀스(Seq-to-Seq) 모델을 사용하여 학습합니다. :image: _static/img/thumbnails/cropped/Language-Translation-with-TorchText.png - :link: beginner/torchtext_translation_tutorial.html + :link: beginner/torchtext_translation.html :tags: Text .. Reinforcement Learning @@ -170,6 +189,14 @@ :link: intermediate/reinforcement_q_learning.html :tags: Reinforcement-Learning +.. customcarditem:: + :header: Train a Mario-playing RL Agent + :card_description: Use PyTorch to train a Double Q-learning agent to play Mario. + :image: _static/img/mario.gif + :link: intermediate/mario_rl_tutorial.html + :tags: Reinforcement-Learning + + .. Deploying PyTorch Models in Production .. customcarditem:: @@ -184,14 +211,14 @@ :card_description: C++과 같은 고성능 환경에서 실행할 수 있도록 (nn.Module의 하위 클래스인) PyTorch 모델의 중간 표현(intermediate representation)을 제공하는 TorchScript를 소개합니다. :image: _static/img/thumbnails/cropped/Introduction-to-TorchScript.png :link: beginner/Intro_to_TorchScript_tutorial.html - :tags: Production + :tags: Production,TorchScript .. customcarditem:: :header: C++에서 TorchScript 모델 로딩하기 :card_description: PyTorch가 어떻게 기존의 Python 모델을 직렬화된 표현으로 변환하여 Python 의존성 없이 순수하게 C++에서 불러올 수 있는지 배웁니다. :image: _static/img/thumbnails/cropped/Loading-a-TorchScript-Model-in-Cpp.png :link: advanced/cpp_export.html - :tags: Production + :tags: Production,TorchScript .. 
customcarditem:: :header: (optional) Exporting a Model from PyTorch to ONNX and Running it using ONNX Runtime @@ -200,21 +227,30 @@ :link: advanced/super_resolution_with_onnxruntime.html :tags: Production -.. Frontend APIs +.. Code Transformations with FX .. customcarditem:: - :header: (prototype) Introduction to Named Tensors in PyTorch - :card_description: Learn how to use PyTorch to train a Deep Q Learning (DQN) agent on the CartPole-v0 task from the OpenAI Gym. - :image: _static/img/thumbnails/cropped/experimental-Introduction-to-Named-Tensors-in-PyTorch.png - :link: intermediate/memory_format_tutorial.html - :tags: Frontend-APIs,Named-Tensor,Best-Practice + :header: Building a Convolution/Batch Norm fuser in FX + :card_description: Build a simple FX pass that fuses batch norm into convolution to improve performance during inference. + :image: _static/img/thumbnails/cropped/Deploying-PyTorch-in-Python-via-a-REST-API-with-Flask.png + :link: intermediate/fx_conv_bn_fuser.html + :tags: FX + +.. customcarditem:: + :header: Building a Simple Performance Profiler with FX + :card_description: Build a simple FX interpreter to record the runtime of op, module, and function calls and report statistics + :image: _static/img/thumbnails/cropped/Deploying-PyTorch-in-Python-via-a-REST-API-with-Flask.png + :link: intermediate/fx_profiling_tutorial.html + :tags: FX + +.. Frontend APIs .. customcarditem:: :header: (beta) Channels Last Memory Format in PyTorch :card_description: Get an overview of Channels Last memory format and understand how it is used to order NCHW tensors in memory preserving dimensions. :image: _static/img/thumbnails/cropped/experimental-Channels-Last-Memory-Format-in-PyTorch.png :link: intermediate/memory_format_tutorial.html - :tags: Memory-Format,Best-Practice + :tags: Memory-Format,Best-Practice,Frontend-APIs .. customcarditem:: :header: Using the PyTorch C++ Frontend @@ -228,21 +264,21 @@ :card_description: Create a neural network layer with no parameters using numpy. Then use scipy to create a neural network layer that has learnable weights. :image: _static/img/thumbnails/cropped/Custom-Cpp-and-CUDA-Extensions.png :link: advanced/cpp_extension.html - :tags: Frontend-APIs,C++,CUDA + :tags: Extending-PyTorch,Frontend-APIs,C++,CUDA .. customcarditem:: :header: Extending TorchScript with Custom C++ Operators :card_description: Implement a custom TorchScript operator in C++, how to build it into a shared library, how to use it in Python to define TorchScript models and lastly how to load it into a C++ application for inference workloads. :image: _static/img/thumbnails/cropped/Extending-TorchScript-with-Custom-Cpp-Operators.png :link: advanced/torch_script_custom_ops.html - :tags: Frontend-APIs,TorchScript,C++ + :tags: Extending-PyTorch,Frontend-APIs,TorchScript,C++ .. customcarditem:: :header: Extending TorchScript with Custom C++ Classes :card_description: This is a continuation of the custom operator tutorial, and introduces the API we’ve built for binding C++ classes into TorchScript and Python simultaneously. :image: _static/img/thumbnails/cropped/Extending-TorchScript-with-Custom-Cpp-Classes.png :link: advanced/torch_script_custom_classes.html - :tags: Frontend-APIs,TorchScript,C++ + :tags: Extending-PyTorch,Frontend-APIs,TorchScript,C++ .. customcarditem:: :header: Dynamic Parallelism in TorchScript @@ -258,8 +294,36 @@ :link: advanced/cpp_autograd.html :tags: Frontend-APIs,C++ +.. 
customcarditem:: + :header: Registering a Dispatched Operator in C++ + :card_description: The dispatcher is an internal component of PyTorch which is responsible for figuring out what code should actually get run when you call a function like torch::add. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.PNG + :link: advanced/dispatcher.html + :tags: Extending-PyTorch,Frontend-APIs,C++ + +.. customcarditem:: + :header: Extending Dispatcher For a New Backend in C++ + :card_description: Learn how to extend the dispatcher to add a new device living outside of the pytorch/pytorch repo and maintain it to keep in sync with native PyTorch devices. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.PNG + :link: advanced/extend_dispatcher.html + :tags: Extending-PyTorch,Frontend-APIs,C++ + .. Model Optimization +.. customcarditem:: + :header: Performance Profiling in PyTorch + :card_description: Learn how to use the PyTorch Profiler to benchmark your module's performance. + :image: _static/img/thumbnails/cropped/profiler.png + :link: beginner/profiler.html + :tags: Model-Optimization,Best-Practice,Profiling + +.. customcarditem:: + :header: Hyperparameter Tuning Tutorial + :card_description: Learn how to use Ray Tune to find the best performing set of hyperparameters for your model. + :image: _static/img/ray-tune.png + :link: beginner/hyperparameter_tuning_tutorial.html + :tags: Model-Optimization,Best-Practice + .. customcarditem:: :header: Pruning Tutorial :card_description: Learn how to use torch.nn.utils.prune to sparsify your neural networks, and how to extend it to implement your own custom pruning technique. @@ -281,18 +345,11 @@ :link: intermediate/dynamic_quantization_bert_tutorial.html :tags: Text,Quantization,Model-Optimization -.. customcarditem:: - :header: (beta) Static Quantization with Eager Mode in PyTorch - :card_description: Learn techniques to impove a model's accuracy = post-training static quantization, per-channel quantization, and quantization-aware training. - :image: _static/img/thumbnails/cropped/experimental-Static-Quantization-with-Eager-Mode-in-PyTorch.png - :link: advanced/static_quantization_tutorial.html - :tags: Image/Video,Quantization,Model-Optimization - .. customcarditem:: :header: (beta) Quantized Transfer Learning for Computer Vision Tutorial - :card_description: Learn techniques to impove a model's accuracy - post-training static quantization, per-channel quantization, and quantization-aware training. - :image: _static/img/thumbnails/cropped/experimental-Quantized-Transfer-Learning-for-Computer-Vision-Tutorial.png - :link: advanced/static_quantization_tutorial.html + :card_description: Extends the Transfer Learning for Computer Vision Tutorial using a quantized model. + :image: _static/img/thumbnails/cropped/60-min-blitz.png + :link: intermediate/quantized_transfer_learning_tutorial.html :tags: Image/Video,Quantization,Model-Optimization .. Parallel-and-Distributed-Training @@ -318,13 +375,6 @@ :link: intermediate/ddp_tutorial.html :tags: Parallel-and-Distributed-Training -.. customcarditem:: - :header: (advanced) PyTorch 1.0 Distributed Trainer with Amazon AWS - :card_description: Set up the distributed package of PyTorch, use the different communication strategies, and go over some the internals of the package. - :image: _static/img/thumbnails/cropped/advanced-PyTorch-1point0-Distributed-Trainer-with-Amazon-AWS.png - :link: beginner/aws_distributed_training_tutorial.html - :tags: Parallel-and-Distributed-Training - .. 
customcarditem:: :header: PyTorch로 분산 어플리케이션 개발하기 :card_description: PyTorch의 분산 패키지를 설정하고, 서로 다른 통신 전략을 사용하고, 내부를 살펴봅니다. @@ -367,6 +417,36 @@ :link: advanced/rpc_ddp_tutorial.html :tags: Parallel-and-Distributed-Training +.. customcarditem:: + :header: Training Transformer models using Pipeline Parallelism + :card_description: Walk through a through a simple example of how to train a transformer model using pipeline parallelism. + :image: _static/img/thumbnails/cropped/Training-Transformer-models-using-Pipeline-Parallelism.png + :link: intermediate/pipeline_tutorial.html + :tags: Parallel-and-Distributed-Training + +.. customcarditem:: + :header: Training Transformer models using Distributed Data Parallel and Pipeline Parallelism + :card_description: Walk through a through a simple example of how to train a transformer model using Distributed Data Parallel and Pipeline Parallelism + :image: _static/img/thumbnails/cropped/Training-Transformer-Models-using-Distributed-Data-Parallel-and-Pipeline-Parallelism.png + :link: advanced/ddp_pipeline.html + :tags: Parallel-and-Distributed-Training + +.. Mobile + +.. customcarditem:: + :header: Image Segmentation DeepLabV3 on iOS + :card_description: A comprehensive step-by-step tutorial on how to prepare and run the PyTorch DeepLabV3 image segmentation model on iOS. + :image: _static/img/thumbnails/cropped/ios.png + :link: beginner/deeplabv3_on_ios.html + :tags: Mobile + +.. customcarditem:: + :header: Image Segmentation DeepLabV3 on Android + :card_description: A comprehensive step-by-step tutorial on how to prepare and run the PyTorch DeepLabV3 image segmentation model on Android. + :image: _static/img/thumbnails/cropped/android.png + :link: beginner/deeplabv3_on_android.html + :tags: Mobile + .. End of tutorial card section .. raw:: html @@ -436,6 +516,7 @@ :caption: 파이토치(PyTorch) 레시피 모든 레시피 보기 + 모든 프로토타입 레시피 보기 .. toctree:: :maxdepth: 2 @@ -443,6 +524,7 @@ :includehidden: :caption: 파이토치(PyTorch) 배우기 + beginner/basics/intro beginner/deep_learning_60min_blitz beginner/pytorch_with_examples beginner/nn_tutorial @@ -458,6 +540,7 @@ beginner/transfer_learning_tutorial beginner/fgsm_tutorial beginner/dcgan_faces_tutorial + beginner/vt_tutorial .. toctree:: :maxdepth: 2 @@ -466,6 +549,7 @@ :caption: 오디오 beginner/audio_preprocessing_tutorial + intermediate/speech_command_recognition_with_torchaudio .. toctree:: :maxdepth: 2 @@ -478,7 +562,7 @@ intermediate/char_rnn_generation_tutorial intermediate/seq2seq_translation_tutorial beginner/text_sentiment_ngrams_tutorial - beginner/torchtext_translation_tutorial + beginner/torchtext_translation .. toctree:: @@ -488,6 +572,7 @@ :caption: 강화학습 intermediate/reinforcement_q_learning + intermediate/mario_rl_tutorial .. toctree:: :maxdepth: 2 @@ -500,21 +585,37 @@ advanced/cpp_export advanced/super_resolution_with_onnxruntime +.. toctree:: + :maxdepth: 2 + :includehidden: + :hidden: + :caption: Code Transforms with FX + + intermediate/fx_conv_bn_fuser + intermediate/fx_profiling_tutorial + .. toctree:: :maxdepth: 2 :includehidden: :hidden: :caption: 프론트엔드 API - intermediate/named_tensor_tutorial intermediate/memory_format_tutorial advanced/cpp_frontend + advanced/torch-script-parallelism + advanced/cpp_autograd + +.. toctree:: + :maxdepth: 2 + :includehidden: + :hidden: + :caption: PyTorch 확장하기 + advanced/cpp_extension advanced/torch_script_custom_ops advanced/torch_script_custom_classes - advanced/torch-script-parallelism - advanced/cpp_autograd advanced/dispatcher + advanced/extend_dispatcher .. 
toctree:: :maxdepth: 2 @@ -522,10 +623,11 @@ :hidden: :caption: 모델 최적화 + beginner/profiler + beginner/hyperparameter_tuning_tutorial intermediate/pruning_tutorial advanced/dynamic_quantization_tutorial intermediate/dynamic_quantization_bert_tutorial - advanced/static_quantization_tutorial intermediate/quantized_transfer_learning_tutorial .. toctree:: @@ -539,8 +641,18 @@ intermediate/ddp_tutorial intermediate/dist_tuto intermediate/rpc_tutorial - beginner/aws_distributed_training_tutorial intermediate/rpc_param_server_tutorial intermediate/dist_pipeline_parallel_tutorial intermediate/rpc_async_execution - advanced/rpc_ddp_tutorial \ No newline at end of file + advanced/rpc_ddp_tutorial + intermediate/pipeline_tutorial + advanced/ddp_pipeline + +.. toctree:: + :maxdepth: 2 + :includehidden: + :hidden: + :caption: Mobile + + beginner/deeplabv3_on_ios + beginner/deeplabv3_on_android \ No newline at end of file diff --git a/intermediate_source/README.txt b/intermediate_source/README.txt index 27eebcd6d..3d88f8ea9 100644 --- a/intermediate_source/README.txt +++ b/intermediate_source/README.txt @@ -31,4 +31,4 @@ Intermediate tutorials 8. flask_rest_api_tutorial.py Deploying PyTorch and Building a REST API using Flask - https://pytorch.org/tutorials/beginner/flask_rest_api_tutorial.html + https://pytorch.org/tutorials/intermediate/flask_rest_api_tutorial.html diff --git a/intermediate_source/char_rnn_classification_tutorial.py b/intermediate_source/char_rnn_classification_tutorial.py index da8d22535..0c805dad1 100644 --- a/intermediate_source/char_rnn_classification_tutorial.py +++ b/intermediate_source/char_rnn_classification_tutorial.py @@ -227,7 +227,7 @@ def initHidden(self): # input = letterToTensor('A') -hidden =torch.zeros(1, n_hidden) +hidden = torch.zeros(1, n_hidden) output, next_hidden = rnn(input, hidden) diff --git a/intermediate_source/ddp_tutorial.rst b/intermediate_source/ddp_tutorial.rst index 68409e4da..dfc040444 100644 --- a/intermediate_source/ddp_tutorial.rst +++ b/intermediate_source/ddp_tutorial.rst @@ -2,6 +2,8 @@ Getting Started with Distributed Data Parallel ================================================= **Author**: `Shen Li `_ +**Edited by**: `Joe Zhu `_ + Prerequisites: - `PyTorch Distributed Overview <../beginner/dist_overview.html>`__ @@ -68,6 +70,7 @@ be found in .. code:: python import os + import sys import tempfile import torch import torch.distributed as dist @@ -77,6 +80,17 @@ be found in from torch.nn.parallel import DistributedDataParallel as DDP + # On Windows platform, the torch.distributed package only + # supports Gloo backend, FileStore and TcpStore. + # For FileStore, set init_method parameter in init_process_group + # to a local file. Example as follow: + # init_method="file:///f:/libtmp/some_file" + # dist.init_process_group( + # "gloo", + # rank=rank, + # init_method=init_method, + # world_size=world_size) + # For TcpStore, same way as on Linux. 
def setup(rank, world_size): os.environ['MASTER_ADDR'] = 'localhost' @@ -85,7 +99,6 @@ be found in # initialize the process group dist.init_process_group("gloo", rank=rank, world_size=world_size) - def cleanup(): dist.destroy_process_group() diff --git a/intermediate_source/dist_pipeline_parallel_tutorial.rst b/intermediate_source/dist_pipeline_parallel_tutorial.rst index 693043478..416ee8018 100644 --- a/intermediate_source/dist_pipeline_parallel_tutorial.rst +++ b/intermediate_source/dist_pipeline_parallel_tutorial.rst @@ -316,7 +316,7 @@ where the ``shutdown`` by default will block until all RPC participants finish. def run_worker(rank, world_size, num_split): os.environ['MASTER_ADDR'] = 'localhost' os.environ['MASTER_PORT'] = '29500' - options = rpc.ProcessGroupRpcBackendOptions(num_send_recv_threads=128) + options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=128) if rank == 0: rpc.init_rpc( @@ -348,25 +348,3 @@ where the ``shutdown`` by default will block until all RPC participants finish. print(f"number of splits = {num_split}, execution time = {tok - tik}") -The output below shows the speedup attained by increasing the number of splits -in each batch. - -:: - - $ python main.py - Processing batch 0 - Processing batch 1 - Processing batch 2 - number of splits = 1, execution time = 16.45062756538391 - Processing batch 0 - Processing batch 1 - Processing batch 2 - number of splits = 2, execution time = 12.329529762268066 - Processing batch 0 - Processing batch 1 - Processing batch 2 - number of splits = 4, execution time = 10.164430618286133 - Processing batch 0 - Processing batch 1 - Processing batch 2 - number of splits = 8, execution time = 9.076049566268921 diff --git a/intermediate_source/dist_tuto.rst b/intermediate_source/dist_tuto.rst index 3303ea2fa..8f2c4e3d1 100644 --- a/intermediate_source/dist_tuto.rst +++ b/intermediate_source/dist_tuto.rst @@ -42,7 +42,7 @@ PyTorch에 포함된 분산 패키지(예. ``torch.distributed``)는 연구자 import os import torch import torch.distributed as dist - from torch.multiprocessing import Process + import torch.multiprocessing as mp def run(rank, size): """ Distributed function to be implemented later. """ @@ -192,18 +192,18 @@ PyTorch에 포함된 분산 패키지(예. ``torch.distributed``)는 연구자 """ 간단한 점-대-점 간 통신 """ group = dist.new_group([0, 1]) tensor = torch.ones(1) - dist.all_reduce(tensor, op=dist.reduce_op.SUM, group=group) + dist.all_reduce(tensor, op=dist.ReduceOp.SUM, group=group) print('Rank ', rank, ' has data ', tensor[0]) -그룹 내의 모든 Tensor들의 합이 필요하기 떄문에, ``dist.reduce_op.SUM`` 을 +그룹 내의 모든 Tensor들의 합이 필요하기 떄문에, ``dist.ReduceOp.SUM`` 을 리듀스(reduce) 연산자로 사용하였습니다. 일반적으로, 교환 법칙이 허용되는(commutative) 모든 수학 연산을 연산자로 사용할 수 있습니다. PyTorch는 요소별(element-wise)로 동작하는 기본적으로 4개의 연산자를 제공합니다. -- ``dist.reduce_op.SUM``, -- ``dist.reduce_op.PRODUCT``, -- ``dist.reduce_op.MAX``, -- ``dist.reduce_op.MIN``. +- ``dist.ReduceOp.SUM``, +- ``dist.ReduceOp.PRODUCT``, +- ``dist.ReduceOp.MAX``, +- ``dist.ReduceOp.MIN``. PyTorch에는 현재 ``dist.all_reduce(tensor, op, group)`` 외에도 6개의 집합 통신이 구현되어 있습니다. @@ -350,7 +350,7 @@ PyTorch에는 현재 ``dist.all_reduce(tensor, op, group)`` 외에도 6개의 def average_gradients(model): size = float(dist.get_world_size()) for param in model.parameters(): - dist.all_reduce(param.grad.data, op=dist.reduce_op.SUM) + dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM) param.grad.data /= size *완성(Et voilà)*! 
분산 동기(synchronous) SGD를 성공적으로 구현했으며 어떤 모델도 diff --git a/intermediate_source/dynamic_quantization_bert_tutorial.rst b/intermediate_source/dynamic_quantization_bert_tutorial.rst index 0cb9d2f6d..0d096ad4d 100644 --- a/intermediate_source/dynamic_quantization_bert_tutorial.rst +++ b/intermediate_source/dynamic_quantization_bert_tutorial.rst @@ -38,7 +38,7 @@ 온라인 뉴스로부터 자동으로 추출된 두 개의 문장들과 그 두 문장이 같은 뜻인지 사람이 평가한 정답으로 이루어져 있습니다. 클래스의 비중이 같지 않아(같음 68%, 다름 32%), 많이 쓰이는 `F1 점수 `_ 를 - 기록합니다. MRPC는 아래에 나온 것처럼 문장 쌍을 분류하는 자연어처리 문제로 많이 쓰입니다. + 기록합니다. MRPC는 아래에 나온 것처럼 문장 쌍을 분류하는 자연어처리 문제에 많이 쓰입니다. .. image:: /_static/img/bert.png @@ -485,11 +485,11 @@ HuggingFace BERT 모델에 동적 양자화를 적용하기 위해 | 정확도 | F1 점수 | 모델 크기 | 쓰레드 1개 | 쓰레드 4개 | | FP32 | 0.9019 | 438 MB | 160 초 | 85 초 | - | INT8 | 0.8953 | 181 MB | 90 초 | 46 초 | + | INT8 | 0.902 | 181 MB | 90 초 | 46 초 | MRPC 문제에 맞게 미세조정한 BERT 모델에 학습 후 동적 양자화를 적용한 -결과, F1 점수 0.6이 나왔습니다. 참고로, `최근 논문 `_ +결과, 0.6% 낮은 F1 점수가 나왔습니다. 참고로, `최근 논문 `_ (표 1)에서는 학습 후 동적 양자화를 적용했을 때, F1 점수 0.8788이 나왔고, 양자화 의식 학습을 적용했을 때는 0.8956이 나왔습니다. 우리는 Pytorch의 비대칭 양자화를 사용했지만, 참고한 논문에서는 대칭적 양자화만을 사용했다는 점이 주요한 @@ -510,14 +510,21 @@ MRPC 데이터셋을 평가하는데 약 46초가 소요됐습니다. 3.3 양자화된 모델 직렬화하기 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -나중에 다시 쓸 수 있도록 양자화된 모델을 직렬화하고 저장할 수 있습니다. +나중에 다시 쓸 수 있도록 `torch.jit.save` 을 사용하여 양자화된 모델을 직렬화하고 저장할 수 있습니다. .. code:: python - quantized_output_dir = configs.output_dir + "quantized/" - if not os.path.exists(quantized_output_dir): - os.makedirs(quantized_output_dir) - quantized_model.save_pretrained(quantized_output_dir) + input_ids = ids_tensor([8, 128], 2) + token_type_ids = ids_tensor([8, 128], 2) + attention_mask = ids_tensor([8, 128], vocab_size=2) + dummy_input = (input_ids, attention_mask, token_type_ids) + traced_model = torch.jit.trace(quantized_model, dummy_input) + torch.jit.save(traced_model, "bert_traced_eager_quant.pt") + +양자화된 모델을 불러올 때는 `torch.jit.load` 를 사용합니다. + +.. code:: python + loaded_quantized_model = torch.jit.load("bert_traced_eager_quant.pt") 마치며 diff --git a/intermediate_source/fx_conv_bn_fuser.py b/intermediate_source/fx_conv_bn_fuser.py new file mode 100644 index 000000000..c06f5f768 --- /dev/null +++ b/intermediate_source/fx_conv_bn_fuser.py @@ -0,0 +1,262 @@ +# -*- coding: utf-8 -*- +""" +(beta) Building a Convolution/Batch Norm fuser in FX +******************************************************* +**Author**: `Horace He `_ + +In this tutorial, we are going to use FX, a toolkit for composable function +transformations of PyTorch, to do the following: + +1) Find patterns of conv/batch norm in the data dependencies. +2) For the patterns found in 1), fold the batch norm statistics into the convolution weights. + +Note that this optimization only works for models in inference mode (i.e. `mode.eval()`) + +We will be building the fuser that exists here: +https://github.com/pytorch/pytorch/blob/orig/release/1.8/torch/fx/experimental/fuser.py + +""" + + +###################################################################### +# First, let's get some imports out of the way (we will be using all +# of these later in the code). + +from typing import Type, Dict, Any, Tuple, Iterable +import copy +import torch.fx as fx +import torch +import torch.nn as nn + +###################################################################### +# For this tutorial, we are going to create a model consisting of convolutions +# and batch norms. 
Note that this model has some tricky components - some of +# the conv/batch norm patterns are hidden within Sequentials and one of the +# BatchNorms is wrapped in another Module. + +class WrappedBatchNorm(nn.Module): + def __init__(self): + super().__init__() + self.mod = nn.BatchNorm2d(1) + def forward(self, x): + return self.mod(x) + +class M(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(1, 1, 1) + self.bn1 = nn.BatchNorm2d(1) + self.conv2 = nn.Conv2d(1, 1, 1) + self.nested = nn.Sequential( + nn.BatchNorm2d(1), + nn.Conv2d(1, 1, 1), + ) + self.wrapped = WrappedBatchNorm() + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.conv2(x) + x = self.nested(x) + x = self.wrapped(x) + return x + +model = M() + +model.eval() + +###################################################################### +# Fusing Convolution with Batch Norm +# ----------------------------------------- +# One of the primary challenges with trying to automatically fuse convolution +# and batch norm in PyTorch is that PyTorch does not provide an easy way of +# accessing the computational graph. FX resolves this problem by symbolically +# tracing the actual operations called, so that we can track the computations +# through the `forward` call, nested within Sequential modules, or wrapped in +# a user-defined module. + +traced_model = torch.fx.symbolic_trace(model) +print(traced_model.graph) + +###################################################################### +# This gives us a graph representation of our model. Note that both the modules +# hidden within the sequential as well as the wrapped Module have been inlined +# into the graph. This is the default level of abstraction, but it can be +# configured by the pass writer. More information can be found at the FX +# overview https://pytorch.org/docs/master/fx.html#module-torch.fx + + +#################################### +# Fusing Convolution with Batch Norm +# ---------------------------------- +# Unlike some other fusions, fusion of convolution with batch norm does not +# require any new operators. Instead, as batch norm during inference +# consists of a pointwise add and multiply, these operations can be "baked" +# into the preceding convolution's weights. This allows us to remove the batch +# norm entirely from our model! Read +# https://nenadmarkus.com/p/fusing-batchnorm-and-conv/ for further details. The +# code here is copied from +# https://github.com/pytorch/pytorch/blob/orig/release/1.8/torch/nn/utils/fusion.py +# for clarity purposes. +def fuse_conv_bn_eval(conv, bn): + """ + Given a conv Module `A` and a batch_norm module `B`, returns a conv + module `C` such that C(x) == B(A(x)) in inference mode. + """ + assert(not (conv.training or bn.training)), "Fusion only for eval!"
+ fused_conv = copy.deepcopy(conv) + + fused_conv.weight, fused_conv.bias = \ + fuse_conv_bn_weights(fused_conv.weight, fused_conv.bias, + bn.running_mean, bn.running_var, bn.eps, bn.weight, bn.bias) + + return fused_conv + +def fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b): + if conv_b is None: + conv_b = torch.zeros_like(bn_rm) + if bn_w is None: + bn_w = torch.ones_like(bn_rm) + if bn_b is None: + bn_b = torch.zeros_like(bn_rm) + bn_var_rsqrt = torch.rsqrt(bn_rv + bn_eps) + + conv_w = conv_w * (bn_w * bn_var_rsqrt).reshape([-1] + [1] * (len(conv_w.shape) - 1)) + conv_b = (conv_b - bn_rm) * bn_var_rsqrt * bn_w + bn_b + + return torch.nn.Parameter(conv_w), torch.nn.Parameter(conv_b) + + +#################################### +# FX Fusion Pass +# ---------------------------------- +# Now that we have our computational graph as well as a method for fusing +# convolution and batch norm, all that remains is to iterate over the FX graph +# and apply the desired fusions. + + +def _parent_name(target : str) -> Tuple[str, str]: + """ + Splits a qualname into parent path and last atom. + For example, `foo.bar.baz` -> (`foo.bar`, `baz`) + """ + *parent, name = target.rsplit('.', 1) + return parent[0] if parent else '', name + +def replace_node_module(node: fx.Node, modules: Dict[str, Any], new_module: torch.nn.Module): + assert(isinstance(node.target, str)) + parent_name, name = _parent_name(node.target) + setattr(modules[parent_name], name, new_module) + + +def fuse(model: torch.nn.Module) -> torch.nn.Module: + model = copy.deepcopy(model) + # The first step of most FX passes is to symbolically trace our model to + # obtain a `GraphModule`. This is a representation of our original model + # that is functionally identical to our original model, except that we now + # also have a graph representation of our forward pass. + fx_model: fx.GraphModule = fx.symbolic_trace(model) + modules = dict(fx_model.named_modules()) + + # The primary representation for working with FX are the `Graph` and the + # `Node`. Each `GraphModule` has a `Graph` associated with it - this + # `Graph` is also what generates `GraphModule.code`. + # The `Graph` itself is represented as a list of `Node` objects. Thus, to + # iterate through all of the operations in our graph, we iterate over each + # `Node` in our `Graph`. + for node in fx_model.graph.nodes: + # The FX IR contains several types of nodes, which generally represent + # call sites to modules, functions, or methods. The type of node is + # determined by `Node.op`. + if node.op != 'call_module': # If our current node isn't calling a Module then we can ignore it. + continue + # For call sites, `Node.target` represents the module/function/method + # that's being called. Here, we check `Node.target` to see if it's a + # batch norm module, and then check `Node.args[0].target` to see if the + # input `Node` is a convolution. + if type(modules[node.target]) is nn.BatchNorm2d and type(modules[node.args[0].target]) is nn.Conv2d: + if len(node.args[0].users) > 1: # Output of conv is used by other nodes + continue + conv = modules[node.args[0].target] + bn = modules[node.target] + fused_conv = fuse_conv_bn_eval(conv, bn) + replace_node_module(node.args[0], modules, fused_conv) + # As we've folded the batch nor into the conv, we need to replace all uses + # of the batch norm with the conv. + node.replace_all_uses_with(node.args[0]) + # Now that all uses of the batch norm have been replaced, we can + # safely remove the batch norm. 
+            fx_model.graph.erase_node(node)
+    fx_model.graph.lint()
+    # After we've modified our graph, we need to recompile our graph in order
+    # to keep the generated code in sync.
+    fx_model.recompile()
+    return fx_model
+
+
+######################################################################
+# .. note::
+#       We make some simplifications here for demonstration purposes, such as only
+#       matching 2D convolutions. View
+#       https://github.com/pytorch/pytorch/blob/master/torch/fx/experimental/fuser.py
+#       for a more usable pass.
+
+######################################################################
+# Testing out our Fusion Pass
+# ---------------------------
+# We can now run this fusion pass on our initial toy model and verify that our
+# results are identical. In addition, we can print out the code for our fused
+# model and verify that there are no more batch norms.
+
+
+fused_model = fuse(model)
+print(fused_model.code)
+inp = torch.randn(5, 1, 1, 1)
+torch.testing.assert_allclose(fused_model(inp), model(inp))
+
+
+######################################################################
+# Benchmarking our Fusion on ResNet18
+# -----------------------------------
+# We can test our fusion pass on a larger model like ResNet18 and see how much
+# this pass improves inference performance.
+import torchvision.models as models
+import time
+
+rn18 = models.resnet18()
+rn18.eval()
+
+inp = torch.randn(10, 3, 224, 224)
+output = rn18(inp)
+
+def benchmark(model, iters=20):
+    for _ in range(10):
+        model(inp)
+    begin = time.time()
+    for _ in range(iters):
+        model(inp)
+    return str(time.time()-begin)
+
+fused_rn18 = fuse(rn18)
+print("Unfused time: ", benchmark(rn18))
+print("Fused time: ", benchmark(fused_rn18))
+######################################################################
+# As we previously saw, the output of our FX transformation is
+# (TorchScript-compatible) PyTorch code, so we can easily `jit.script` the output to try
+# and increase our performance even more. In this way, our FX model
+# transformation composes with TorchScript with no issues.
+jit_rn18 = torch.jit.script(fused_rn18)
+print("jit time: ", benchmark(jit_rn18))
+
+
+############
+# Conclusion
+# ----------
+# As we can see, using FX we can easily write static graph transformations on
+# PyTorch code.
+#
+# Since FX is still in beta, we would be happy to hear any
+# feedback you have about using it. Please feel free to use the
+# PyTorch Forums (https://discuss.pytorch.org/) and the issue tracker
+# (https://github.com/pytorch/pytorch/issues) to provide any feedback
+# you might have.
diff --git a/intermediate_source/fx_profiling_tutorial.py b/intermediate_source/fx_profiling_tutorial.py
new file mode 100644
index 000000000..d54f3ccb6
--- /dev/null
+++ b/intermediate_source/fx_profiling_tutorial.py
@@ -0,0 +1,236 @@
+# -*- coding: utf-8 -*-
+"""
+(beta) Building a Simple CPU Performance Profiler with FX
+**********************************************************
+**Author**: `James Reed `_
+
+In this tutorial, we are going to use FX to do the following:
+
+1) Capture PyTorch Python code in a way that we can inspect and gather
+   statistics about the structure and execution of the code
+2) Build out a small class that will serve as a simple performance "profiler",
+   collecting runtime statistics about each part of the model from actual
+   runs.
+
+"""
+
+######################################################################
+# For this tutorial, we are going to use the torchvision ResNet18 model
+# for demonstration purposes.
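+# (Any ``nn.Module`` that FX can symbolically trace would work just as well
+# here; ResNet18 is simply a convenient, well-known example.)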
+
+import torch
+import torch.fx
+import torchvision.models as models
+
+rn18 = models.resnet18()
+rn18.eval()
+
+######################################################################
+# Now that we have our model, we want to inspect its performance more
+# deeply. That is, for the following invocation, which parts
+# of the model are taking the longest?
+input = torch.randn(5, 3, 224, 224)
+output = rn18(input)
+
+######################################################################
+# A common way of answering that question is to go through the program
+# source, add code that collects timestamps at various points in the
+# program, and compare the difference between those timestamps to see
+# how long the regions between the timestamps take.
+#
+# That technique is certainly applicable to PyTorch code; however, it
+# would be nicer if we didn't have to copy over model code and edit it,
+# especially code we haven't written (like this torchvision model).
+# Instead, we are going to use FX to automate this "instrumentation"
+# process without needing to modify any source.
+
+######################################################################
+# First, let's get some imports out of the way (we will be using all
+# of these later in the code).
+
+import statistics, tabulate, time
+from typing import Any, Dict, List
+from torch.fx import Interpreter
+
+######################################################################
+# .. note::
+#     ``tabulate`` is an external library that is not a dependency of PyTorch.
+#     We will be using it to more easily visualize performance data. Please
+#     make sure you've installed it from your favorite Python package source.
+
+######################################################################
+# Capturing the Model with Symbolic Tracing
+# -----------------------------------------
+# Next, we are going to use FX's symbolic tracing mechanism to capture
+# the definition of our model in a data structure we can manipulate
+# and examine.
+
+traced_rn18 = torch.fx.symbolic_trace(rn18)
+print(traced_rn18.graph)
+
+######################################################################
+# This gives us a Graph representation of the ResNet18 model. A Graph
+# consists of a series of Nodes connected to each other. Each Node
+# represents a call-site in the Python code (whether to a function,
+# a module, or a method) and the edges (represented as ``args`` and ``kwargs``
+# on each node) represent the values passed between these call-sites. More
+# information about the Graph representation and the rest of FX's APIs can
+# be found at the FX documentation https://pytorch.org/docs/master/fx.html.
+
+
+######################################################################
+# Creating a Profiling Interpreter
+# --------------------------------
+# Next, we are going to create a class that inherits from ``torch.fx.Interpreter``.
+# Though the ``GraphModule`` that ``symbolic_trace`` produces compiles Python code
+# that is run when you call a ``GraphModule``, an alternative way to run a
+# ``GraphModule`` is by executing each ``Node`` in the ``Graph`` one by one. That is
+# the functionality that ``Interpreter`` provides: It interprets the graph node-
+# by-node.
+#
+# By inheriting from ``Interpreter``, we can override various functionality and
+# install the profiling behavior we want.
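+# Concretely, we will override ``run()``, the top-level entry point, and
+# ``run_node()``, which is called once per ``Node``, and record wall-clock
+# timestamps around each of them.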
The goal is to have an object to which +# we can pass a model, invoke the model 1 or more times, then get statistics about +# how long the model and each part of the model took during those runs. +# +# Let's define our ``ProfilingInterpreter`` class: + +class ProfilingInterpreter(Interpreter): + def __init__(self, mod : torch.nn.Module): + # Rather than have the user symbolically trace their model, + # we're going to do it in the constructor. As a result, the + # user can pass in any ``Module`` without having to worry about + # symbolic tracing APIs + gm = torch.fx.symbolic_trace(mod) + super().__init__(gm) + + # We are going to store away two things here: + # + # 1. A list of total runtimes for ``mod``. In other words, we are + # storing away the time ``mod(...)`` took each time this + # interpreter is called. + self.total_runtime_sec : List[float] = [] + # 2. A map from ``Node`` to a list of times (in seconds) that + # node took to run. This can be seen as similar to (1) but + # for specific sub-parts of the model. + self.runtimes_sec : Dict[torch.fx.Node, List[float]] = {} + + ###################################################################### + # Next, let's override our first method: ``run()``. ``Interpreter``'s ``run`` + # method is the top-level entrypoint for execution of the model. We will + # want to intercept this so that we can record the total runtime of the + # model. + + def run(self, *args) -> Any: + # Record the time we started running the model + t_start = time.time() + # Run the model by delegating back into Interpreter.run() + return_val = super().run(*args) + # Record the time we finished running the model + t_end = time.time() + # Store the total elapsed time this model execution took in the + # ProfilingInterpreter + self.total_runtime_sec.append(t_end - t_start) + return return_val + + ###################################################################### + # Now, let's override ``run_node``. ``Interpreter`` calls ``run_node`` each + # time it executes a single node. We will intercept this so that we + # can measure and record the time taken for each individual call in + # the model. + + def run_node(self, n : torch.fx.Node) -> Any: + # Record the time we started running the op + t_start = time.time() + # Run the op by delegating back into Interpreter.run_node() + return_val = super().run_node(n) + # Record the time we finished running the op + t_end = time.time() + # If we don't have an entry for this node in our runtimes_sec + # data structure, add one with an empty list value. + self.runtimes_sec.setdefault(n, []) + # Record the total elapsed time for this single invocation + # in the runtimes_sec data structure + self.runtimes_sec[n].append(t_end - t_start) + return return_val + + ###################################################################### + # Finally, we are going to define a method (one which doesn't override + # any ``Interpreter`` method) that provides us a nice, organized view of + # the data we have collected. + + def summary(self, should_sort : bool = False) -> str: + # Build up a list of summary information for each node + node_summaries : List[List[Any]] = [] + # Calculate the mean runtime for the whole network. Because the + # network may have been called multiple times during profiling, + # we need to summarize the runtimes. We choose to use the + # arithmetic mean for this. 
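+        # (``self.total_runtime_sec`` is filled in by ``run()``, so the model must
+        # have been invoked at least once before ``summary()`` is called;
+        # ``statistics.mean`` raises ``StatisticsError`` on an empty list.)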
+        mean_total_runtime = statistics.mean(self.total_runtime_sec)
+
+        # For each node, record summary statistics
+        for node, runtimes in self.runtimes_sec.items():
+            # Similarly, compute the mean runtime for ``node``
+            mean_runtime = statistics.mean(runtimes)
+            # For easier understanding, we also compute the percentage
+            # time each node took with respect to the whole network.
+            pct_total = mean_runtime / mean_total_runtime * 100
+            # Record the node's type, name of the node, mean runtime, and
+            # percent runtime
+            node_summaries.append(
+                [node.op, str(node), mean_runtime, pct_total])
+
+        # One of the most important questions to answer when doing performance
+        # profiling is "Which op(s) took the longest?". We can make this easy
+        # to see by providing sorting functionality in our summary view
+        if should_sort:
+            node_summaries.sort(key=lambda s: s[2], reverse=True)
+
+        # Use the ``tabulate`` library to create a well-formatted table
+        # presenting our summary information
+        headers : List[str] = [
+            'Op type', 'Op', 'Average runtime (s)', 'Pct total runtime'
+        ]
+        return tabulate.tabulate(node_summaries, headers=headers)
+
+######################################################################
+# .. note::
+#       We use Python's ``time.time`` function to pull wall clock
+#       timestamps and compare them. This is not the most accurate
+#       way to measure performance, and will only give us a first-
+#       order approximation. We use this simple technique only for the
+#       purpose of demonstration in this tutorial.
+
+######################################################################
+# Investigating the Performance of ResNet18
+# -----------------------------------------
+# We can now use ``ProfilingInterpreter`` to inspect the performance
+# characteristics of our ResNet18 model:
+
+interp = ProfilingInterpreter(rn18)
+interp.run(input)
+print(interp.summary(True))
+
+######################################################################
+# There are two things we should call out here:
+#
+# * MaxPool2d takes up the most time. This is a known issue:
+#   https://github.com/pytorch/pytorch/issues/51393
+# * BatchNorm2d also takes up significant time. We can continue this
+#   line of thinking and optimize this in the Conv-BN Fusion with FX
+#   tutorial TODO: link
+#
+#
+# Conclusion
+# ----------
+# As we can see, using FX we can easily capture PyTorch programs (even
+# ones we don't have the source code for!) in a machine-interpretable
+# format and use that for analysis, such as the performance analysis
+# we've done here. FX opens up an exciting world of possibilities for
+# working with PyTorch programs.
+#
+# Finally, since FX is still in beta, we would be happy to hear any
+# feedback you have about using it. Please feel free to use the
+# PyTorch Forums (https://discuss.pytorch.org/) and the issue tracker
+# (https://github.com/pytorch/pytorch/issues) to provide any feedback
+# you might have.
diff --git a/intermediate_source/mario_rl_tutorial.py b/intermediate_source/mario_rl_tutorial.py
new file mode 100755
index 000000000..a2d856c41
--- /dev/null
+++ b/intermediate_source/mario_rl_tutorial.py
@@ -0,0 +1,786 @@
+# -*- coding: utf-8 -*-
+"""
+Train a Mario-playing RL Agent
+==============================
+
+Authors: `Yuansong Feng `__, `Suraj
+Subramanian `__, `Howard
+Wang `__, `Steven
+Guo `__.
+
+
+This tutorial walks you through the fundamentals of Deep Reinforcement
+Learning. At the end, you will implement an AI-powered Mario (using
+`Double Deep Q-Networks `__) that
+can play the game by itself.
+ +Although no prior knowledge of RL is necessary for this tutorial, you +can familiarize yourself with these RL +`concepts `__, +and have this handy +`cheatsheet `__ +as your companion. The full code is available +`here `__. + +.. figure:: /_static/img/mario.gif + :alt: mario + +""" + + +###################################################################### +# +# + +# !pip install gym-super-mario-bros==7.3.0 + +import torch +from torch import nn +from torchvision import transforms as T +from PIL import Image +import numpy as np +from pathlib import Path +from collections import deque +import random, datetime, os, copy + +# Gym is an OpenAI toolkit for RL +import gym +from gym.spaces import Box +from gym.wrappers import FrameStack + +# NES Emulator for OpenAI Gym +from nes_py.wrappers import JoypadSpace + +# Super Mario environment for OpenAI Gym +import gym_super_mario_bros + + +###################################################################### +# RL Definitions +# """""""""""""""""" +# +# **Environment** The world that an agent interacts with and learns from. +# +# **Action** :math:`a` : How the Agent responds to the Environment. The +# set of all possible Actions is called *action-space*. +# +# **State** :math:`s` : The current characteristic of the Environment. The +# set of all possible States the Environment can be in is called +# *state-space*. +# +# **Reward** :math:`r` : Reward is the key feedback from Environment to +# Agent. It is what drives the Agent to learn and to change its future +# action. An aggregation of rewards over multiple time steps is called +# **Return**. +# +# **Optimal Action-Value function** :math:`Q^*(s,a)` : Gives the expected +# return if you start in state :math:`s`, take an arbitrary action +# :math:`a`, and then for each future time step take the action that +# maximizes returns. :math:`Q` can be said to stand for the “quality” of +# the action in a state. We try to approximate this function. +# + + +###################################################################### +# Environment +# """""""""""""""" +# +# Initialize Environment +# ------------------------ +# +# In Mario, the environment consists of tubes, mushrooms and other +# components. +# +# When Mario makes an action, the environment responds with the changed +# (next) state, reward and other info. +# + +# Initialize Super Mario environment +env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0") + +# Limit the action-space to +# 0. walk right +# 1. jump right +env = JoypadSpace(env, [["right"], ["right", "A"]]) + +env.reset() +next_state, reward, done, info = env.step(action=0) +print(f"{next_state.shape},\n {reward},\n {done},\n {info}") + + +###################################################################### +# Preprocess Environment +# ------------------------ +# +# Environment data is returned to the agent in ``next_state``. As you saw +# above, each state is represented by a ``[3, 240, 256]`` size array. +# Often that is more information than our agent needs; for instance, +# Mario’s actions do not depend on the color of the pipes or the sky! +# +# We use **Wrappers** to preprocess environment data before sending it to +# the agent. +# +# ``GrayScaleObservation`` is a common wrapper to transform an RGB image +# to grayscale; doing so reduces the size of the state representation +# without losing useful information. Now the size of each state: +# ``[1, 240, 256]`` +# +# ``ResizeObservation`` downsamples each observation into a square image. 
+# New size: ``[1, 84, 84]`` +# +# ``SkipFrame`` is a custom wrapper that inherits from ``gym.Wrapper`` and +# implements the ``step()`` function. Because consecutive frames don’t +# vary much, we can skip n-intermediate frames without losing much +# information. The n-th frame aggregates rewards accumulated over each +# skipped frame. +# +# ``FrameStack`` is a wrapper that allows us to squash consecutive frames +# of the environment into a single observation point to feed to our +# learning model. This way, we can identify if Mario was landing or +# jumping based on the direction of his movement in the previous several +# frames. +# + + +class SkipFrame(gym.Wrapper): + def __init__(self, env, skip): + """Return only every `skip`-th frame""" + super().__init__(env) + self._skip = skip + + def step(self, action): + """Repeat action, and sum reward""" + total_reward = 0.0 + done = False + for i in range(self._skip): + # Accumulate reward and repeat the same action + obs, reward, done, info = self.env.step(action) + total_reward += reward + if done: + break + return obs, total_reward, done, info + + +class GrayScaleObservation(gym.ObservationWrapper): + def __init__(self, env): + super().__init__(env) + obs_shape = self.observation_space.shape[:2] + self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8) + + def permute_orientation(self, observation): + # permute [H, W, C] array to [C, H, W] tensor + observation = np.transpose(observation, (2, 0, 1)) + observation = torch.tensor(observation.copy(), dtype=torch.float) + return observation + + def observation(self, observation): + observation = self.permute_orientation(observation) + transform = T.Grayscale() + observation = transform(observation) + return observation + + +class ResizeObservation(gym.ObservationWrapper): + def __init__(self, env, shape): + super().__init__(env) + if isinstance(shape, int): + self.shape = (shape, shape) + else: + self.shape = tuple(shape) + + obs_shape = self.shape + self.observation_space.shape[2:] + self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8) + + def observation(self, observation): + transforms = T.Compose( + [T.Resize(self.shape), T.Normalize(0, 255)] + ) + observation = transforms(observation).squeeze(0) + return observation + + +# Apply Wrappers to environment +env = SkipFrame(env, skip=4) +env = GrayScaleObservation(env) +env = ResizeObservation(env, shape=84) +env = FrameStack(env, num_stack=4) + + +###################################################################### +# After applying the above wrappers to the environment, the final wrapped +# state consists of 4 gray-scaled consecutive frames stacked together, as +# shown above in the image on the left. Each time Mario makes an action, +# the environment responds with a state of this structure. The structure +# is represented by a 3-D array of size ``[4, 84, 84]``. +# +# .. figure:: /_static/img/mario_env.png +# :alt: picture +# +# + + +###################################################################### +# Agent +# """"""""" +# +# We create a class ``Mario`` to represent our agent in the game. Mario +# should be able to: +# +# - **Act** according to the optimal action policy based on the current +# state (of the environment). +# +# - **Remember** experiences. Experience = (current state, current +# action, reward, next state). Mario *caches* and later *recalls* his +# experiences to update his action policy. 
+# +# - **Learn** a better action policy over time +# + + +class Mario: + def __init__(): + pass + + def act(self, state): + """Given a state, choose an epsilon-greedy action""" + pass + + def cache(self, experience): + """Add the experience to memory""" + pass + + def recall(self): + """Sample experiences from memory""" + pass + + def learn(self): + """Update online action value (Q) function with a batch of experiences""" + pass + + +###################################################################### +# In the following sections, we will populate Mario’s parameters and +# define his functions. +# + + +###################################################################### +# Act +# -------------- +# +# For any given state, an agent can choose to do the most optimal action +# (**exploit**) or a random action (**explore**). +# +# Mario randomly explores with a chance of ``self.exploration_rate``; when +# he chooses to exploit, he relies on ``MarioNet`` (implemented in +# ``Learn`` section) to provide the most optimal action. +# + + +class Mario: + def __init__(self, state_dim, action_dim, save_dir): + self.state_dim = state_dim + self.action_dim = action_dim + self.save_dir = save_dir + + self.use_cuda = torch.cuda.is_available() + + # Mario's DNN to predict the most optimal action - we implement this in the Learn section + self.net = MarioNet(self.state_dim, self.action_dim).float() + if self.use_cuda: + self.net = self.net.to(device="cuda") + + self.exploration_rate = 1 + self.exploration_rate_decay = 0.99999975 + self.exploration_rate_min = 0.1 + self.curr_step = 0 + + self.save_every = 5e5 # no. of experiences between saving Mario Net + + def act(self, state): + """ + Given a state, choose an epsilon-greedy action and update value of step. + + Inputs: + state(LazyFrame): A single observation of the current state, dimension is (state_dim) + Outputs: + action_idx (int): An integer representing which action Mario will perform + """ + # EXPLORE + if np.random.rand() < self.exploration_rate: + action_idx = np.random.randint(self.action_dim) + + # EXPLOIT + else: + state = state.__array__() + if self.use_cuda: + state = torch.tensor(state).cuda() + else: + state = torch.tensor(state) + state = state.unsqueeze(0) + action_values = self.net(state, model="online") + action_idx = torch.argmax(action_values, axis=1).item() + + # decrease exploration_rate + self.exploration_rate *= self.exploration_rate_decay + self.exploration_rate = max(self.exploration_rate_min, self.exploration_rate) + + # increment step + self.curr_step += 1 + return action_idx + + +###################################################################### +# Cache and Recall +# ---------------------- +# +# These two functions serve as Mario’s “memory” process. +# +# ``cache()``: Each time Mario performs an action, he stores the +# ``experience`` to his memory. His experience includes the current +# *state*, *action* performed, *reward* from the action, the *next state*, +# and whether the game is *done*. +# +# ``recall()``: Mario randomly samples a batch of experiences from his +# memory, and uses that to learn the game. 
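+#
+# (In the training loop at the end of this tutorial these become
+# ``mario.cache(state, next_state, action, reward, done)`` after every
+# ``env.step()``, while ``learn()`` calls ``self.recall()`` to draw a random
+# training batch from the replay buffer.)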
+# + + +class Mario(Mario): # subclassing for continuity + def __init__(self, state_dim, action_dim, save_dir): + super().__init__(state_dim, action_dim, save_dir) + self.memory = deque(maxlen=100000) + self.batch_size = 32 + + def cache(self, state, next_state, action, reward, done): + """ + Store the experience to self.memory (replay buffer) + + Inputs: + state (LazyFrame), + next_state (LazyFrame), + action (int), + reward (float), + done(bool)) + """ + state = state.__array__() + next_state = next_state.__array__() + + if self.use_cuda: + state = torch.tensor(state).cuda() + next_state = torch.tensor(next_state).cuda() + action = torch.tensor([action]).cuda() + reward = torch.tensor([reward]).cuda() + done = torch.tensor([done]).cuda() + else: + state = torch.tensor(state) + next_state = torch.tensor(next_state) + action = torch.tensor([action]) + reward = torch.tensor([reward]) + done = torch.tensor([done]) + + self.memory.append((state, next_state, action, reward, done,)) + + def recall(self): + """ + Retrieve a batch of experiences from memory + """ + batch = random.sample(self.memory, self.batch_size) + state, next_state, action, reward, done = map(torch.stack, zip(*batch)) + return state, next_state, action.squeeze(), reward.squeeze(), done.squeeze() + + +###################################################################### +# Learn +# -------------- +# +# Mario uses the `DDQN algorithm `__ +# under the hood. DDQN uses two ConvNets - :math:`Q_{online}` and +# :math:`Q_{target}` - that independently approximate the optimal +# action-value function. +# +# In our implementation, we share feature generator ``features`` across +# :math:`Q_{online}` and :math:`Q_{target}`, but maintain separate FC +# classifiers for each. :math:`\theta_{target}` (the parameters of +# :math:`Q_{target}`) is frozen to prevent updation by backprop. Instead, +# it is periodically synced with :math:`\theta_{online}` (more on this +# later). +# +# Neural Network +# ~~~~~~~~~~~~~~~~~~ + + +class MarioNet(nn.Module): + """mini cnn structure + input -> (conv2d + relu) x 3 -> flatten -> (dense + relu) x 2 -> output + """ + + def __init__(self, input_dim, output_dim): + super().__init__() + c, h, w = input_dim + + if h != 84: + raise ValueError(f"Expecting input height: 84, got: {h}") + if w != 84: + raise ValueError(f"Expecting input width: 84, got: {w}") + + self.online = nn.Sequential( + nn.Conv2d(in_channels=c, out_channels=32, kernel_size=8, stride=4), + nn.ReLU(), + nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2), + nn.ReLU(), + nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1), + nn.ReLU(), + nn.Flatten(), + nn.Linear(3136, 512), + nn.ReLU(), + nn.Linear(512, output_dim), + ) + + self.target = copy.deepcopy(self.online) + + # Q_target parameters are frozen. + for p in self.target.parameters(): + p.requires_grad = False + + def forward(self, input, model): + if model == "online": + return self.online(input) + elif model == "target": + return self.target(input) + + +###################################################################### +# TD Estimate & TD Target +# ~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Two values are involved in learning: +# +# **TD Estimate** - the predicted optimal :math:`Q^*` for a given state +# :math:`s` +# +# .. math:: +# +# +# {TD}_e = Q_{online}^*(s,a) +# +# **TD Target** - aggregation of current reward and the estimated +# :math:`Q^*` in the next state :math:`s'` +# +# .. math:: +# +# +# a' = argmax_{a} Q_{online}(s', a) +# +# .. 
math:: +# +# +# {TD}_t = r + \gamma Q_{target}^*(s',a') +# +# Because we don’t know what next action :math:`a'` will be, we use the +# action :math:`a'` maximizes :math:`Q_{online}` in the next state +# :math:`s'`. +# +# Notice we use the +# `@torch.no_grad() `__ +# decorator on ``td_target()`` to disable gradient calculations here +# (because we don’t need to backpropagate on :math:`\theta_{target}`). +# + + +class Mario(Mario): + def __init__(self, state_dim, action_dim, save_dir): + super().__init__(state_dim, action_dim, save_dir) + self.gamma = 0.9 + + def td_estimate(self, state, action): + current_Q = self.net(state, model="online")[ + np.arange(0, self.batch_size), action + ] # Q_online(s,a) + return current_Q + + @torch.no_grad() + def td_target(self, reward, next_state, done): + next_state_Q = self.net(next_state, model="online") + best_action = torch.argmax(next_state_Q, axis=1) + next_Q = self.net(next_state, model="target")[ + np.arange(0, self.batch_size), best_action + ] + return (reward + (1 - done.float()) * self.gamma * next_Q).float() + + +###################################################################### +# Updating the model +# ~~~~~~~~~~~~~~~~~~~~~~ +# +# As Mario samples inputs from his replay buffer, we compute :math:`TD_t` +# and :math:`TD_e` and backpropagate this loss down :math:`Q_{online}` to +# update its parameters :math:`\theta_{online}` (:math:`\alpha` is the +# learning rate ``lr`` passed to the ``optimizer``) +# +# .. math:: +# +# +# \theta_{online} \leftarrow \theta_{online} + \alpha \nabla(TD_e - TD_t) +# +# :math:`\theta_{target}` does not update through backpropagation. +# Instead, we periodically copy :math:`\theta_{online}` to +# :math:`\theta_{target}` +# +# .. math:: +# +# +# \theta_{target} \leftarrow \theta_{online} +# +# + + +class Mario(Mario): + def __init__(self, state_dim, action_dim, save_dir): + super().__init__(state_dim, action_dim, save_dir) + self.optimizer = torch.optim.Adam(self.net.parameters(), lr=0.00025) + self.loss_fn = torch.nn.SmoothL1Loss() + + def update_Q_online(self, td_estimate, td_target): + loss = self.loss_fn(td_estimate, td_target) + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + return loss.item() + + def sync_Q_target(self): + self.net.target.load_state_dict(self.net.online.state_dict()) + + +###################################################################### +# Save checkpoint +# ~~~~~~~~~~~~~~~~~~ +# + + +class Mario(Mario): + def save(self): + save_path = ( + self.save_dir / f"mario_net_{int(self.curr_step // self.save_every)}.chkpt" + ) + torch.save( + dict(model=self.net.state_dict(), exploration_rate=self.exploration_rate), + save_path, + ) + print(f"MarioNet saved to {save_path} at step {self.curr_step}") + + +###################################################################### +# Putting it all together +# ~~~~~~~~~~~~~~~~~~~~~~~~~~ +# + + +class Mario(Mario): + def __init__(self, state_dim, action_dim, save_dir): + super().__init__(state_dim, action_dim, save_dir) + self.burnin = 1e4 # min. experiences before training + self.learn_every = 3 # no. of experiences between updates to Q_online + self.sync_every = 1e4 # no. 
of experiences between Q_target & Q_online sync + + def learn(self): + if self.curr_step % self.sync_every == 0: + self.sync_Q_target() + + if self.curr_step % self.save_every == 0: + self.save() + + if self.curr_step < self.burnin: + return None, None + + if self.curr_step % self.learn_every != 0: + return None, None + + # Sample from memory + state, next_state, action, reward, done = self.recall() + + # Get TD Estimate + td_est = self.td_estimate(state, action) + + # Get TD Target + td_tgt = self.td_target(reward, next_state, done) + + # Backpropagate loss through Q_online + loss = self.update_Q_online(td_est, td_tgt) + + return (td_est.mean().item(), loss) + + +###################################################################### +# Logging +# -------------- +# + +import numpy as np +import time, datetime +import matplotlib.pyplot as plt + + +class MetricLogger: + def __init__(self, save_dir): + self.save_log = save_dir / "log" + with open(self.save_log, "w") as f: + f.write( + f"{'Episode':>8}{'Step':>8}{'Epsilon':>10}{'MeanReward':>15}" + f"{'MeanLength':>15}{'MeanLoss':>15}{'MeanQValue':>15}" + f"{'TimeDelta':>15}{'Time':>20}\n" + ) + self.ep_rewards_plot = save_dir / "reward_plot.jpg" + self.ep_lengths_plot = save_dir / "length_plot.jpg" + self.ep_avg_losses_plot = save_dir / "loss_plot.jpg" + self.ep_avg_qs_plot = save_dir / "q_plot.jpg" + + # History metrics + self.ep_rewards = [] + self.ep_lengths = [] + self.ep_avg_losses = [] + self.ep_avg_qs = [] + + # Moving averages, added for every call to record() + self.moving_avg_ep_rewards = [] + self.moving_avg_ep_lengths = [] + self.moving_avg_ep_avg_losses = [] + self.moving_avg_ep_avg_qs = [] + + # Current episode metric + self.init_episode() + + # Timing + self.record_time = time.time() + + def log_step(self, reward, loss, q): + self.curr_ep_reward += reward + self.curr_ep_length += 1 + if loss: + self.curr_ep_loss += loss + self.curr_ep_q += q + self.curr_ep_loss_length += 1 + + def log_episode(self): + "Mark end of episode" + self.ep_rewards.append(self.curr_ep_reward) + self.ep_lengths.append(self.curr_ep_length) + if self.curr_ep_loss_length == 0: + ep_avg_loss = 0 + ep_avg_q = 0 + else: + ep_avg_loss = np.round(self.curr_ep_loss / self.curr_ep_loss_length, 5) + ep_avg_q = np.round(self.curr_ep_q / self.curr_ep_loss_length, 5) + self.ep_avg_losses.append(ep_avg_loss) + self.ep_avg_qs.append(ep_avg_q) + + self.init_episode() + + def init_episode(self): + self.curr_ep_reward = 0.0 + self.curr_ep_length = 0 + self.curr_ep_loss = 0.0 + self.curr_ep_q = 0.0 + self.curr_ep_loss_length = 0 + + def record(self, episode, epsilon, step): + mean_ep_reward = np.round(np.mean(self.ep_rewards[-100:]), 3) + mean_ep_length = np.round(np.mean(self.ep_lengths[-100:]), 3) + mean_ep_loss = np.round(np.mean(self.ep_avg_losses[-100:]), 3) + mean_ep_q = np.round(np.mean(self.ep_avg_qs[-100:]), 3) + self.moving_avg_ep_rewards.append(mean_ep_reward) + self.moving_avg_ep_lengths.append(mean_ep_length) + self.moving_avg_ep_avg_losses.append(mean_ep_loss) + self.moving_avg_ep_avg_qs.append(mean_ep_q) + + last_record_time = self.record_time + self.record_time = time.time() + time_since_last_record = np.round(self.record_time - last_record_time, 3) + + print( + f"Episode {episode} - " + f"Step {step} - " + f"Epsilon {epsilon} - " + f"Mean Reward {mean_ep_reward} - " + f"Mean Length {mean_ep_length} - " + f"Mean Loss {mean_ep_loss} - " + f"Mean Q Value {mean_ep_q} - " + f"Time Delta {time_since_last_record} - " + f"Time 
{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}" + ) + + with open(self.save_log, "a") as f: + f.write( + f"{episode:8d}{step:8d}{epsilon:10.3f}" + f"{mean_ep_reward:15.3f}{mean_ep_length:15.3f}{mean_ep_loss:15.3f}{mean_ep_q:15.3f}" + f"{time_since_last_record:15.3f}" + f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'):>20}\n" + ) + + for metric in ["ep_rewards", "ep_lengths", "ep_avg_losses", "ep_avg_qs"]: + plt.plot(getattr(self, f"moving_avg_{metric}")) + plt.savefig(getattr(self, f"{metric}_plot")) + plt.clf() + + +###################################################################### +# Let’s play! +# """"""""""""""" +# +# In this example we run the training loop for 10 episodes, but for Mario to truly learn the ways of +# his world, we suggest running the loop for at least 40,000 episodes! +# +use_cuda = torch.cuda.is_available() +print(f"Using CUDA: {use_cuda}") +print() + +save_dir = Path("checkpoints") / datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") +save_dir.mkdir(parents=True) + +mario = Mario(state_dim=(4, 84, 84), action_dim=env.action_space.n, save_dir=save_dir) + +logger = MetricLogger(save_dir) + +episodes = 10 +for e in range(episodes): + + state = env.reset() + + # Play the game! + while True: + + # Run agent on the state + action = mario.act(state) + + # Agent performs action + next_state, reward, done, info = env.step(action) + + # Remember + mario.cache(state, next_state, action, reward, done) + + # Learn + q, loss = mario.learn() + + # Logging + logger.log_step(reward, loss, q) + + # Update state + state = next_state + + # Check if end of game + if done or info["flag_get"]: + break + + logger.log_episode() + + if e % 20 == 0: + logger.record(episode=e, epsilon=mario.exploration_rate, step=mario.curr_step) + + +###################################################################### +# Conclusion +# """"""""""""""" +# +# In this tutorial, we saw how we can use PyTorch to train a game-playing AI. You can use the same methods +# to train an AI to play any of the games at the `OpenAI gym `__. Hope you enjoyed this tutorial, feel free to reach us at +# `our github `__! diff --git a/intermediate_source/memory_format_tutorial.py b/intermediate_source/memory_format_tutorial.py index 7bec8ef8e..7b1730ad2 100644 --- a/intermediate_source/memory_format_tutorial.py +++ b/intermediate_source/memory_format_tutorial.py @@ -4,37 +4,33 @@ ******************************************************* **Author**: `Vitaly Fedyunin `_ - **번역**: `Choi Yoonjeong `_ -Channels Last가 무엇인가요 +Channels last가 무엇인가요 ---------------------------- -Channels Last 메모리 형식(memory format)은 차원 순서를 유지하면서 메모리 상의 NCHW 텐서(tensor)를 정렬하는 또 다른 방식입니다. -Channels Last 텐서는 채널(Channel)이 가장 밀도가 높은(densest) 차원으로 정렬(예. 이미지를 픽셀x픽셀로 저장)됩니다. +Channels last 메모리 형식(memory format)은 차원 순서를 유지하면서 메모리 상의 NCHW 텐서(tensor)를 정렬하는 또 다른 방식입니다. +Channels last 텐서는 채널(Channel)이 가장 밀도가 높은(densest) 차원으로 정렬(예. 이미지를 픽셀x픽셀로 저장)됩니다. 예를 들어, (2개의 2 x 2 이미지에 3개의 채널이 존재하는 경우) 전형적인(연속적인) NCHW 텐서의 저장 방식은 다음과 같습니다: .. figure:: /_static/img/classic_memory_format.png :alt: classic_memory_format -Channels Last 메모리 형식은 데이터를 다르게 정렬합니다: +Channels last 메모리 형식은 데이터를 다르게 정렬합니다: .. figure:: /_static/img/channels_last_memory_format.png :alt: channels_last_memory_format PyTorch는 기존의 스트라이드(strides) 구조를 사용함으로써 메모리 형식을 지원(하며, eager, JIT 및 TorchScript를 포함한 -기존의 모델들과 하위 호환성을 제공)합니다. 예를 들어, Channels Last 형식에서 10x3x16x16 배치(batch)는 (768, 1, 48, 3)와 +기존의 모델들과 하위 호환성을 제공)합니다. 
예를 들어, Channels last 형식에서 10x3x16x16 배치(batch)는 (768, 1, 48, 3)와 같은 폭(strides)을 가지고 있게 됩니다. """ ###################################################################### -# Channels Last 메모리 형식은 오직 4D NCWH Tensors에서만 실행할 수 있습니다. +# Channels last 메모리 형식은 오직 4D NCWH Tensors에서만 실행할 수 있습니다. # -import torch -N, C, H, W = 10, 3, 32, 32 - ###################################################################### # 메모리 형식(Memory Format) API # --------------------------------- @@ -43,23 +39,25 @@ ###################################################################### # 전형적인 PyTorch의 연속적인 텐서(tensor) +import torch +N, C, H, W = 10, 3, 32, 32 x = torch.empty(N, C, H, W) print(x.stride()) # 결과: (3072, 1024, 32, 1) ###################################################################### # 변환 연산자 -x = x.contiguous(memory_format=torch.channels_last) +x = x.to(memory_format=torch.channels_last) print(x.shape) # 결과: (10, 3, 32, 32) 차원 순서는 보존함 print(x.stride()) # 결과: (3072, 1, 96, 3) ###################################################################### # 연속적인 형식으로 되돌리기 -x = x.contiguous(memory_format=torch.contiguous_format) +x = x.to(memory_format=torch.contiguous_format) print(x.stride()) # 결과: (3072, 1024, 32, 1) ###################################################################### # 다른 방식 -x = x.to(memory_format=torch.channels_last) +x = x.contiguous(memory_format=torch.channels_last) print(x.stride()) # 결과: (3072, 1, 96, 3) ###################################################################### @@ -67,7 +65,38 @@ print(x.is_contiguous(memory_format=torch.channels_last)) # 결과: True ###################################################################### -# Channels Last 방식으로 생성하기 +# ``to`` 와 ``contiguous`` 에는 작은 차이(minor difference)가 있습니다. +# 명시적으로 텐서(tensor)의 메모리 형식을 변환할 때는 ``to`` 를 사용하는 것을 +# 권장합니다. +# +# 대부분의 경우 두 API는 동일하게 동작합니다. 하지만 ``C==1`` 이거나 +# ``H == 1 && W == 1`` 인 ``NCHW`` 4D 텐서의 특수한 경우에는 ``to`` 만이 +# Channel last 메모리 형식으로 표현된 적절한 폭(stride)을 생성합니다. +# +# 이는 위의 두가지 경우에 텐서의 메모리 형식이 모호하기 때문입니다. +# 예를 들어, 크기가 ``N1HW`` 인 연속적인 텐서(contiguous tensor)는 +# ``연속적`` 이면서 Channel last 형식으로 메모리에 저장됩니다. +# 따라서, 주어진 메모리 형식에 대해 이미 ``is_contiguous`` 로 간주되어 +# ``contiguous`` 호출은 동작하지 않게(no-op) 되어, 폭(stride)을 갱신하지 +# 않게 됩니다. 반면에, ``to`` 는 의도한 메모리 형식으로 적절하게 표현하기 위해 +# 크기가 1인 차원에서 의미있는 폭(stride)으로 재배열(restride)합니다. +special_x = torch.empty(4, 1, 4, 4) +print(special_x.is_contiguous(memory_format=torch.channels_last)) # Ouputs: True +print(special_x.is_contiguous(memory_format=torch.contiguous_format)) # Ouputs: True + +###################################################################### +# 명시적 치환(permutation) API인 ``permute`` 에서도 동일하게 적용됩니다. +# 모호성이 발생할 수 있는 특별한 경우에, ``permute`` 는 의도한 메모리 +# 형식으로 전달되는 폭(stride)을 생성하는 것이 보장되지 않습니다. +# ``to`` 로 명시적으로 메모리 형식을 지정하여 의도치 않은 동작을 피할 +# 것을 권장합니다. +# +# 또한, 3개의 비-배치(non-batch) 차원이 모두 ``1`` 인 극단적인 경우 +# (``C==1 && H==1 && W==1``), 현재 구현은 텐서를 Channels last 메모리 +# 형식으로 표시할 수 없음을 알려드립니다. + +###################################################################### +# Channels last 방식으로 생성하기 x = torch.empty(N, C, H, W, memory_format=torch.channels_last) print(x.stride()) # 결과: (3072, 1, 96, 3) @@ -93,23 +122,35 @@ print(z.stride()) # 결과: (3072, 1, 96, 3) ###################################################################### -# Conv, Batchnorm 모듈은 Channels Last를 지원합니다. (단, CudNN >=7.6 에서만 동작) +# Conv, Batchnorm 모듈은 Channels last를 지원합니다. (단, CudNN >=7.6 에서만 동작) +# 합성곱(convolution) 모듈은 이진 p-wise 연산자(binary p-wise operator)와는 다르게 +# Channels last가 주된 메모리 형식입니다. 
모든 입력은 연속적인 메모리 형식이며, +# 연산자는 연속된 메모리 형식으로 출력을 생성합니다. 그렇지 않으면, 출력은 +# channels last 메모리 형식입니다. + if torch.backends.cudnn.version() >= 7603: - input = torch.randint(1, 10, (2, 8, 4, 4), dtype=torch.float32, device="cuda", requires_grad=True) - model = torch.nn.Conv2d(8, 4, 3).cuda().float() + model = torch.nn.Conv2d(8, 4, 3).cuda().half() + model = model.to(memory_format=torch.channels_last) # 모듈 인자들은 Channels last로 변환이 필요합니다 - input = input.contiguous(memory_format=torch.channels_last) - model = model.to(memory_format=torch.channels_last) # 모듈 인자들은 Channels Last로 변환이 필요합니다 + input = torch.randint(1, 10, (2, 8, 4, 4), dtype=torch.float32, requires_grad=True) + input = input.to(device="cuda", memory_format=torch.channels_last, dtype=torch.float16) out = model(input) print(out.is_contiguous(memory_format=torch.channels_last)) # 결과: True +###################################################################### +# 입력 텐서가 Channels last를 지원하지 않는 연산자를 만나면 +# 치환(permutation)이 커널에 자동으로 적용되어 입력 텐서를 연속적인 형식으로 +# 복원합니다. 이 경우 과부하가 발생하여 channel last 메모리 형식의 전파가 +# 중단됩니다. 그럼에도 불구하고, 올바른 출력은 보장됩니다. + ###################################################################### # 성능 향상 # ------------------------------------------------------------------------------------------- -# Tensor Cores를 지원하는 Nvidia의 하드웨어에서 가장 의미심장한 성능 향상을 보였습니다. -# Nvidia가 제공하는 AMP(Automated Mixed Precision) 학습 스크립트로 22% 이상의 성능 향상을 확인할 수 있었습니다. -# https://github.com/NVIDIA/apex +# 정밀도를 줄인(reduced precision ``torch.float16``) 상태에서 Tensor Cores를 지원하는 Nvidia의 하드웨어에서 +# 가장 의미심장한 성능 향상을 보였습니다. `AMP (Automated Mixed Precision)` 학습 스크립트를 활용하여 +# 연속적인 형식에 비해 Channels last 방식이 22% 이상의 성능 향승을 확인할 수 있었습니다. +# 이 때, Nvidia가 제공하는 AMP를 사용했습니다. https://github.com/NVIDIA/apex # # ``python main_amp.py -a resnet50 --b 200 --workers 16 --opt-level O2 ./data`` @@ -146,7 +187,7 @@ # Epoch: [0][80/125] Time 0.260 (0.335) Speed 770.324 (597.659) Loss 2.2505953312 (1.0879) Prec@1 50.500 (52.938) Prec@5 100.000 (100.000) ###################################################################### -# ``--channels-last true`` 인자를 전달하여 Channels Last 형식으로 모델을 실행하면 22%의 성능 향상을 보입니다. +# ``--channels-last true`` 인자를 전달하여 Channels last 형식으로 모델을 실행하면 22%의 성능 향상을 보입니다. # # ``python main_amp.py -a resnet50 --b 200 --workers 16 --opt-level O2 --channels-last true ./data`` @@ -187,7 +228,7 @@ # Epoch: [0][80/125] Time 0.198 (0.269) Speed 1011.827 (743.883) Loss 2.8196096420 (2.4011) Prec@1 47.500 (50.938) Prec@5 100.000 (100.000) ###################################################################### -# 아래 목록의 모델들은 Channels Last 형식을 전적으로 지원(full support)하며 Volta 장비에서 8%-35%의 성능 향상을 보입니다: +# 아래 목록의 모델들은 Channels last 형식을 전적으로 지원(full support)하며 Volta 장비에서 8%-35%의 성능 향상을 보입니다: # ``alexnet``, ``mnasnet0_5``, ``mnasnet0_75``, ``mnasnet1_0``, ``mnasnet1_3``, ``mobilenet_v2``, ``resnet101``, ``resnet152``, ``resnet18``, ``resnet34``, ``resnet50``, ``resnext50_32x4d``, ``shufflenet_v2_x0_5``, ``shufflenet_v2_x1_0``, ``shufflenet_v2_x1_5``, ``shufflenet_v2_x2_0``, ``squeezenet1_0``, ``squeezenet1_1``, ``vgg11``, ``vgg11_bn``, ``vgg13``, ``vgg13_bn``, ``vgg16``, ``vgg16_bn``, ``vgg19``, ``vgg19_bn``, ``wide_resnet101_2``, ``wide_resnet50_2`` # @@ -195,8 +236,9 @@ # 기존 모델들 변환하기 # -------------------------- # -# Channels Last 지원은 기존 모델이 무엇이냐에 따라 제한되지 않으며, 어떠한 모델도 Channels Last로 변환할 수 있으며 -# 입력(input)의 형식만 맞춰주면 (신경망) 그래프를 통해 바로 전파(propagate)할 수 있습니다. +# Channels last 지원은 기존 모델이 무엇이냐에 따라 제한되지 않습니다. +# 어떠한 모델도 Channels last로 변환할 수 있으며 +# 입력(또는 특정 가중치)의 형식만 맞춰주면 (신경망) 그래프를 통해 바로 전파(propagate)할 수 있습니다. 
# # 모델을 초기화한(또는 불러온) 이후, 한 번 실행이 필요합니다. @@ -207,8 +249,15 @@ output = model(input) ####################################################################### -# 그러나, 모든 연산자들이 Channels Last를 지원하도록 완전히 바뀐 것은 아닙니다(일반적으로는 연속적인 출력을 대신 반환합니다). -# 즉, Channel Last 지원 연산자 목록 https://github.com/pytorch/pytorch/wiki/Operators-with-Channels-Last-support 에서 사용한 연산자들이 존재하는지 확인하거나, +# 그러나, 모든 연산자들이 Channels last를 지원하도록 완전히 바뀐 것은 아닙니다(일반적으로는 연속적인 출력을 대신 반환합니다). +# 위의 예시들에서 Channels last를 지원하지 않는 계층(layer)은 메모리 형식 전파를 멈추게 됩니다. +# 그럼에도 불구하고, 모델을 channels last 형식으로 변환했으므로, Channels last 메모리 형식으로 4차원의 가중치를 갖는 +# 각 합성곱 계층(convolution layer)에서는 Channels last 형식으로 복원되고 더 빠른 커널(faster kernel)의 이점을 누릴 수 있게 됩니다. +# +# 하지만 Channels last를 지원하지 않는 연산자들은 치환(permutation)에 의해 과부하가 발생하게 됩니다. +# 선택적으로, 변환된 모델의 성능을 향상시키고 싶은 경우 모델의 연산자들 중 channel last를 지원하지 않는 연산자를 조사하고 식별할 수 있습니다. +# +# 이는 Channel Last 지원 연산자 목록 https://github.com/pytorch/pytorch/wiki/Operators-with-Channels-Last-support 에서 사용한 연산자들이 존재하는지 확인하거나, # eager 실행 모드에서 메모리 형식 검사를 도입하고 모델을 실행해야 합니다. # # 아래 코드에서, 연산자들의 출력이 입력의 메모리 형식과 일치하지 않으면 예외(exception)를 발생시킵니다. @@ -263,14 +312,17 @@ def check_cl(*args, **kwargs): return result return check_cl +old_attrs = dict() def attribute(m): + old_attrs[m] = dict() for i in dir(m): e = getattr(m, i) exclude_functions = ['is_cuda', 'has_names', 'numel', 'stride', 'Tensor', 'is_contiguous', '__class__'] if i not in exclude_functions and not i.startswith('_') and '__call__' in dir(e): try: + old_attrs[m][i] = e setattr(m, i, check_wrapper(e)) except Exception as e: print(i) @@ -283,11 +335,18 @@ def attribute(m): ###################################################################### -# 만약 Channels Last 텐서를 지원하지 않는 연산자를 발견하였고, 기여하기를 원한다면 +# 만약 Channels last 텐서를 지원하지 않는 연산자를 발견하였고, 기여하기를 원한다면 # 다음 개발 문서를 참고해주세요. # https://github.com/pytorch/pytorch/wiki/Writing-memory-format-aware-operators # +###################################################################### +# 아래 코드는 torch의 속성(attributes)를 복원합니다. + +for (m, attrs) in old_attrs.items(): + for (k,v) in attrs.items(): + setattr(m, k, v) + ###################################################################### # 해야할 일 # ---------- diff --git a/intermediate_source/model_parallel_tutorial.py b/intermediate_source/model_parallel_tutorial.py index a99563c8a..9ee3f2d47 100644 --- a/intermediate_source/model_parallel_tutorial.py +++ b/intermediate_source/model_parallel_tutorial.py @@ -73,8 +73,9 @@ def forward(self, x): # 기존에 존재하는 모듈에 모델 병렬 처리 적용해보기 # --------------------------------------------------- # -# 기존에 단일 GPU에 존재하는 모듈을 여러 GPU에 할당하는 것은 단지 몇 줄의 코드를 수정하는 것으로도 쉽게 가능합니다. -# 아래에 있는 코드들은 ResNet50 모델을 분할하는 방법입니다. 이 아이디어는, 기존에 존재하는 ResNet 모듈을 상속받아 설계할 때, 2개의 GPU에 층을 나누어 설계하는 방식으로 진행됩니다. +# 기존에 단일 GPU에 존재하는 모듈을 여러 GPU에 할당하는 것은 단지 몇 줄의 코드를 수정하는 것으로도 쉽게 가능합니다. +# 아래에 있는 코드들은 ``torchvision.models.reset50()`` 모델을 2개 GPU로 분할하는 방법입니다. +# 이 아이디어는, 기존에 존재하는 ResNet 모듈을 상속받아 설계할 때, 2개의 GPU에 층을 나누어 설계하는 방식으로 진행됩니다. # 그 후, 2개 GPU에서 계산되는 중간 산출물 텐서값을 적절히 배치하기 위헤 순전파 메소드를 수정합니다. @@ -119,7 +120,7 @@ def forward(self, x): # 또한, 두 번째 층 (layer2)이 할당된 첫 번째 GPU에서 계산된 결과를 세 번째 층 (layer3)이 할당된 두 번째 GPU로 텐서값을 복사하기 때문에 계산 과정이 더 길어지게 됩니다. # # 코드 실행 시간을 정량적으로 살펴보기 위해 실험을 하나 해봅시다. 입력 텐서값과 레이블값을 랜덤으로 설정한 후, -# 이미 존재하는 torchvision.models.reset50() 과, 모델 병렬 처리를 진행한 ``ModelParallelResNet50`` 을 통해 학습을 진행합니다. +# 이미 존재하는 ``torchvision.models.resnet50()`` 과, 모델 병렬 처리를 진행한 ``ModelParallelResNet50`` 을 통해 학습을 진행합니다. # 학습 진행을 완료한 후, 두 모델들은 랜덤으로 생성된 데이터로 학습을 진행했기 때문에 실용적인 예측을 하진 못하지만, 학습 진행 시간을 실용적으로 비교하여 할 수 있습니다. 
diff --git a/intermediate_source/pipeline_tutorial.py b/intermediate_source/pipeline_tutorial.py new file mode 100644 index 000000000..45d217893 --- /dev/null +++ b/intermediate_source/pipeline_tutorial.py @@ -0,0 +1,456 @@ +""" +Training Transformer models using Pipeline Parallelism +====================================================== + +**Author**: `Pritam Damania `_ + +This tutorial demonstrates how to train a large Transformer model across +multiple GPUs using pipeline parallelism. This tutorial is an extension of the +`Sequence-to-Sequence Modeling with nn.Transformer and TorchText `__ tutorial +and scales up the same model to demonstrate how pipeline parallelism can be +used to train Transformer models. + +Prerequisites: + + * `Pipeline Parallelism `__ + * `Sequence-to-Sequence Modeling with nn.Transformer and TorchText `__ +""" + + +###################################################################### +# Define the model +# ---------------- +# + + +###################################################################### +# In this tutorial, we will split a Transformer model across two GPUs and use +# pipeline parallelism to train the model. The model is exactly the same model +# used in the `Sequence-to-Sequence Modeling with nn.Transformer and TorchText +# `__ tutorial, +# but is split into two stages. The largest number of parameters belong to the +# `nn.TransformerEncoder `__ layer. +# The `nn.TransformerEncoder `__ +# itself consists of ``nlayers`` of `nn.TransformerEncoderLayer `__. +# As a result, our focus is on ``nn.TransformerEncoder`` and we split the model +# such that half of the ``nn.TransformerEncoderLayer`` are on one GPU and the +# other half are on another. To do this, we pull out the ``Encoder`` and +# ``Decoder`` sections into seperate modules and then build an nn.Sequential +# representing the original Transformer module. + +import sys +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +import tempfile +from torch.nn import TransformerEncoder, TransformerEncoderLayer + +if sys.platform == 'win32': + print('Windows platform is not supported for pipeline parallelism') + sys.exit(0) +if torch.cuda.device_count() < 2: + print('Need at least two GPU devices for this tutorial') + sys.exit(0) + +class Encoder(nn.Module): + def __init__(self, ntoken, ninp, dropout=0.5): + super(Encoder, self).__init__() + self.pos_encoder = PositionalEncoding(ninp, dropout) + self.encoder = nn.Embedding(ntoken, ninp) + self.ninp = ninp + self.init_weights() + + def init_weights(self): + initrange = 0.1 + self.encoder.weight.data.uniform_(-initrange, initrange) + + def forward(self, src): + # Need (S, N) format for encoder. + src = src.t() + src = self.encoder(src) * math.sqrt(self.ninp) + return self.pos_encoder(src) + +class Decoder(nn.Module): + def __init__(self, ntoken, ninp): + super(Decoder, self).__init__() + self.decoder = nn.Linear(ninp, ntoken) + self.init_weights() + + def init_weights(self): + initrange = 0.1 + self.decoder.bias.data.zero_() + self.decoder.weight.data.uniform_(-initrange, initrange) + + def forward(self, inp): + # Need batch dimension first for output of pipeline. + return self.decoder(inp).permute(1, 0, 2) + + +###################################################################### +# ``PositionalEncoding`` module injects some information about the +# relative or absolute position of the tokens in the sequence. The +# positional encodings have the same dimension as the embeddings so that +# the two can be summed. 
Here, we use ``sine`` and ``cosine`` functions of +# different frequencies. + + +class PositionalEncoding(nn.Module): + + def __init__(self, d_model, dropout=0.1, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0).transpose(0, 1) + self.register_buffer('pe', pe) + + def forward(self, x): + x = x + self.pe[:x.size(0), :] + return self.dropout(x) + + + +###################################################################### +# Load and batch data +# ------------------- +# + + +###################################################################### +# The training process uses Wikitext-2 dataset from ``torchtext``. The +# vocab object is built based on the train dataset and is used to numericalize +# tokens into tensors. Starting from sequential data, the ``batchify()`` +# function arranges the dataset into columns, trimming off any tokens remaining +# after the data has been divided into batches of size ``batch_size``. +# For instance, with the alphabet as the sequence (total length of 26) +# and a batch size of 4, we would divide the alphabet into 4 sequences of +# length 6: +# +# .. math:: +# \begin{bmatrix} +# \text{A} & \text{B} & \text{C} & \ldots & \text{X} & \text{Y} & \text{Z} +# \end{bmatrix} +# \Rightarrow +# \begin{bmatrix} +# \begin{bmatrix}\text{A} \\ \text{B} \\ \text{C} \\ \text{D} \\ \text{E} \\ \text{F}\end{bmatrix} & +# \begin{bmatrix}\text{G} \\ \text{H} \\ \text{I} \\ \text{J} \\ \text{K} \\ \text{L}\end{bmatrix} & +# \begin{bmatrix}\text{M} \\ \text{N} \\ \text{O} \\ \text{P} \\ \text{Q} \\ \text{R}\end{bmatrix} & +# \begin{bmatrix}\text{S} \\ \text{T} \\ \text{U} \\ \text{V} \\ \text{W} \\ \text{X}\end{bmatrix} +# \end{bmatrix} +# +# These columns are treated as independent by the model, which means that +# the dependence of ``G`` and ``F`` can not be learned, but allows more +# efficient batch processing. +# + +import io +import torch +from torchtext.utils import download_from_url, extract_archive +from torchtext.data.utils import get_tokenizer +from torchtext.vocab import build_vocab_from_iterator + +url = 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip' +test_filepath, valid_filepath, train_filepath = extract_archive(download_from_url(url)) +tokenizer = get_tokenizer('basic_english') +vocab = build_vocab_from_iterator(map(tokenizer, + iter(io.open(train_filepath, + encoding="utf8")))) + +def data_process(raw_text_iter): + data = [torch.tensor([vocab[token] for token in tokenizer(item)], + dtype=torch.long) for item in raw_text_iter] + return torch.cat(tuple(filter(lambda t: t.numel() > 0, data))) + +train_data = data_process(iter(io.open(train_filepath, encoding="utf8"))) +val_data = data_process(iter(io.open(valid_filepath, encoding="utf8"))) +test_data = data_process(iter(io.open(test_filepath, encoding="utf8"))) + +device = torch.device("cuda") + +def batchify(data, bsz): + # Divide the dataset into bsz parts. + nbatch = data.size(0) // bsz + # Trim off any extra elements that wouldn't cleanly fit (remainders). + data = data.narrow(0, 0, nbatch * bsz) + # Evenly divide the data across the bsz batches. 
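+    # (``view(bsz, -1)`` lays the sequence out as ``bsz`` rows; transposing
+    # gives shape ``[nbatch, bsz]``, so each column is one of the ``bsz``
+    # independent sequences described above.)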
+ data = data.view(bsz, -1).t().contiguous() + return data.to(device) + +batch_size = 20 +eval_batch_size = 10 +train_data = batchify(train_data, batch_size) +val_data = batchify(val_data, eval_batch_size) +test_data = batchify(test_data, eval_batch_size) + + +###################################################################### +# Functions to generate input and target sequence +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# + + +###################################################################### +# ``get_batch()`` function generates the input and target sequence for +# the transformer model. It subdivides the source data into chunks of +# length ``bptt``. For the language modeling task, the model needs the +# following words as ``Target``. For example, with a ``bptt`` value of 2, +# we’d get the following two Variables for ``i`` = 0: +# +# .. image:: ../_static/img/transformer_input_target.png +# +# It should be noted that the chunks are along dimension 0, consistent +# with the ``S`` dimension in the Transformer model. The batch dimension +# ``N`` is along dimension 1. +# + +bptt = 35 +def get_batch(source, i): + seq_len = min(bptt, len(source) - 1 - i) + data = source[i:i+seq_len] + target = source[i+1:i+1+seq_len].view(-1) + # Need batch dimension first for pipeline parallelism. + return data.t(), target + +###################################################################### +# Model scale and Pipe initialization +# ----------------------------------- +# + + +###################################################################### +# To demonstrate training large Transformer models using pipeline parallelism, +# we scale up the Transformer layers appropriately. We use an embedding +# dimension of 4096, hidden size of 4096, 16 attention heads and 12 total +# transformer layers (``nn.TransformerEncoderLayer``). This creates a model with +# **~1.4 billion** parameters. +# +# We need to initialize the `RPC Framework `__ +# since Pipe depends on the RPC framework via `RRef `__ +# which allows for future expansion to cross host pipelining. We need to +# initialize the RPC framework with only a single worker since we're using a +# single process to drive multiple GPUs. +# +# The pipeline is then initialized with 8 transformer layers on one GPU and 8 +# transformer layers on the other GPU. +# +# .. note:: +# For efficiency purposes we ensure that the ``nn.Sequential`` passed to +# ``Pipe`` only consists of two elements (corresponding to two GPUs), this +# allows the Pipe to work with only two partitions and avoid any +# cross-partition overheads. + +ntokens = len(vocab.stoi) # the size of vocabulary +emsize = 4096 # embedding dimension +nhid = 4096 # the dimension of the feedforward network model in nn.TransformerEncoder +nlayers = 12 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder +nhead = 16 # the number of heads in the multiheadattention models +dropout = 0.2 # the dropout value + +from torch.distributed import rpc +tmpfile = tempfile.NamedTemporaryFile() +rpc.init_rpc( + name="worker", + rank=0, + world_size=1, + rpc_backend_options=rpc.TensorPipeRpcBackendOptions( + init_method="file://{}".format(tmpfile.name), + # Specifying _transports and _channels is a workaround and we no longer + # will have to specify _transports and _channels for PyTorch + # versions >= 1.8.1 + _transports=["ibv", "uv"], + _channels=["cuda_ipc", "cuda_basic"], + ) +) + +num_gpus = 2 +partition_len = ((nlayers - 1) // num_gpus) + 1 + +# Add encoder in the beginning. 
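+# (The ``Encoder`` is placed on GPU 0 together with the first
+# ``partition_len`` transformer layers; the ``Decoder`` added below goes on
+# the last GPU along with the remaining layers.)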
+tmp_list = [Encoder(ntokens, emsize, dropout).cuda(0)] +module_list = [] + +# Add all the necessary transformer blocks. +for i in range(nlayers): + transformer_block = TransformerEncoderLayer(emsize, nhead, nhid, dropout) + if i != 0 and i % (partition_len) == 0: + module_list.append(nn.Sequential(*tmp_list)) + tmp_list = [] + device = i // (partition_len) + tmp_list.append(transformer_block.to(device)) + +# Add decoder in the end. +tmp_list.append(Decoder(ntokens, emsize).cuda(num_gpus - 1)) +module_list.append(nn.Sequential(*tmp_list)) + +from torch.distributed.pipeline.sync import Pipe + +# Build the pipeline. +chunks = 8 +model = Pipe(torch.nn.Sequential(*module_list), chunks = chunks) + + +def get_total_params(module: torch.nn.Module): + total_params = 0 + for param in module.parameters(): + total_params += param.numel() + return total_params + +print ('Total parameters in model: {:,}'.format(get_total_params(model))) + +###################################################################### +# Run the model +# ------------- +# + + +###################################################################### +# `CrossEntropyLoss `__ +# is applied to track the loss and +# `SGD `__ +# implements stochastic gradient descent method as the optimizer. The initial +# learning rate is set to 5.0. `StepLR `__ is +# applied to adjust the learn rate through epochs. During the +# training, we use +# `nn.utils.clip_grad_norm\_ `__ +# function to scale all the gradient together to prevent exploding. +# + +criterion = nn.CrossEntropyLoss() +lr = 5.0 # learning rate +optimizer = torch.optim.SGD(model.parameters(), lr=lr) +scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95) + +import time +def train(): + model.train() # Turn on the train mode + total_loss = 0. + start_time = time.time() + ntokens = len(vocab.stoi) + + # Train only for 50 batches to keep script execution time low. + nbatches = min(50 * bptt, train_data.size(0) - 1) + + for batch, i in enumerate(range(0, nbatches, bptt)): + data, targets = get_batch(train_data, i) + optimizer.zero_grad() + # Since the Pipe is only within a single host and process the ``RRef`` + # returned by forward method is local to this node and can simply + # retrieved via ``RRef.local_value()``. + output = model(data).local_value() + # Need to move targets to the device where the output of the + # pipeline resides. + loss = criterion(output.view(-1, ntokens), targets.cuda(1)) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5) + optimizer.step() + + total_loss += loss.item() + log_interval = 10 + if batch % log_interval == 0 and batch > 0: + cur_loss = total_loss / log_interval + elapsed = time.time() - start_time + print('| epoch {:3d} | {:5d}/{:5d} batches | ' + 'lr {:02.2f} | ms/batch {:5.2f} | ' + 'loss {:5.2f} | ppl {:8.2f}'.format( + epoch, batch, nbatches // bptt, scheduler.get_lr()[0], + elapsed * 1000 / log_interval, + cur_loss, math.exp(cur_loss))) + total_loss = 0 + start_time = time.time() + +def evaluate(eval_model, data_source): + eval_model.eval() # Turn on the evaluation mode + total_loss = 0. + ntokens = len(vocab.stoi) + # Evaluate only for 50 batches to keep script execution time low. + nbatches = min(50 * bptt, data_source.size(0) - 1) + with torch.no_grad(): + for i in range(0, nbatches, bptt): + data, targets = get_batch(data_source, i) + output = eval_model(data).local_value() + output_flat = output.view(-1, ntokens) + # Need to move targets to the device where the output of the + # pipeline resides. 
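+            # Weight each chunk's mean loss by its length so that dividing by
+            # (len(data_source) - 1) below gives an average loss per token.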
+ total_loss += len(data) * criterion(output_flat, targets.cuda(1)).item() + return total_loss / (len(data_source) - 1) + +###################################################################### +# Loop over epochs. Save the model if the validation loss is the best +# we've seen so far. Adjust the learning rate after each epoch. + +best_val_loss = float("inf") +epochs = 3 # The number of epochs +best_model = None + +for epoch in range(1, epochs + 1): + epoch_start_time = time.time() + train() + val_loss = evaluate(model, val_data) + print('-' * 89) + print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' + 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time), + val_loss, math.exp(val_loss))) + print('-' * 89) + + if val_loss < best_val_loss: + best_val_loss = val_loss + best_model = model + + scheduler.step() + + +###################################################################### +# Evaluate the model with the test dataset +# ------------------------------------- +# + + +###################################################################### +# Apply the best model to check the result with the test dataset. + +test_loss = evaluate(best_model, test_data) +print('=' * 89) +print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format( + test_loss, math.exp(test_loss))) +print('=' * 89) + + +###################################################################### +# Output +# ------ +# + + +###################################################################### +#.. code-block:: py +# +# Total parameters in model: 1,847,087,215 +# | epoch 1 | 10/ 50 batches | lr 5.00 | ms/batch 2387.45 | loss 42.16 | ppl 2036775646369743616.00 +# | epoch 1 | 20/ 50 batches | lr 5.00 | ms/batch 2150.93 | loss 48.24 | ppl 891334049215401558016.00 +# | epoch 1 | 30/ 50 batches | lr 5.00 | ms/batch 2155.23 | loss 34.66 | ppl 1125676483188404.62 +# | epoch 1 | 40/ 50 batches | lr 5.00 | ms/batch 2158.42 | loss 38.87 | ppl 76287208340888368.00 +# ----------------------------------------------------------------------------------------- +# | end of epoch 1 | time: 119.65s | valid loss 2.95 | valid ppl 19.15 +# ----------------------------------------------------------------------------------------- +# | epoch 2 | 10/ 50 batches | lr 4.51 | ms/batch 2376.16 | loss 34.92 | ppl 1458001430957104.00 +# | epoch 2 | 20/ 50 batches | lr 4.51 | ms/batch 2160.96 | loss 34.75 | ppl 1232463826541886.50 +# | epoch 2 | 30/ 50 batches | lr 4.51 | ms/batch 2160.66 | loss 28.10 | ppl 1599598251136.51 +# | epoch 2 | 40/ 50 batches | lr 4.51 | ms/batch 2160.07 | loss 20.25 | ppl 621174306.77 +# ----------------------------------------------------------------------------------------- +# | end of epoch 2 | time: 119.76s | valid loss 0.87 | valid ppl 2.38 +# ----------------------------------------------------------------------------------------- +# | epoch 3 | 10/ 50 batches | lr 4.29 | ms/batch 2376.49 | loss 13.20 | ppl 537727.23 +# | epoch 3 | 20/ 50 batches | lr 4.29 | ms/batch 2160.12 | loss 10.98 | ppl 58548.58 +# | epoch 3 | 30/ 50 batches | lr 4.29 | ms/batch 2160.05 | loss 12.01 | ppl 164152.79 +# | epoch 3 | 40/ 50 batches | lr 4.29 | ms/batch 2160.03 | loss 10.63 | ppl 41348.00 +# ----------------------------------------------------------------------------------------- +# | end of epoch 3 | time: 119.76s | valid loss 0.78 | valid ppl 2.17 +# ----------------------------------------------------------------------------------------- +# 
========================================================================================= +# | End of training | test loss 0.69 | test ppl 1.99 +# ========================================================================================= diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index c95e1daf9..dd513049b 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -458,13 +458,13 @@ def optimize_model(): # 다음 상태로 이동 state = next_state - # 최적화 한단계 수행(목표 네트워크에서) + # (정책 네트워크에서) 최적화 한단계 수행 optimize_model() if done: episode_durations.append(t + 1) plot_durations() break - #목표 네트워크 업데이트, 모든 웨이트와 바이어스 복사 + # 목표 네트워크 업데이트, 모든 웨이트와 바이어스 복사 if i_episode % TARGET_UPDATE == 0: target_net.load_state_dict(policy_net.state_dict()) diff --git a/intermediate_source/seq2seq_translation_tutorial.py b/intermediate_source/seq2seq_translation_tutorial.py index bb785fc19..f629f52e6 100644 --- a/intermediate_source/seq2seq_translation_tutorial.py +++ b/intermediate_source/seq2seq_translation_tutorial.py @@ -70,17 +70,6 @@ 와 :doc:`/intermediate/char_rnn_generation_tutorial` 는 각각 인코더, 디코더 모델과 비슷한 컨센을 가지기 때문에 도움이 됩니다. -추가로 이 토픽들을 다루는 논문을 읽어 보십시오: - -- `Learning Phrase Representations using RNN Encoder-Decoder for - Statistical Machine Translation `__ -- `Sequence to Sequence Learning with Neural - Networks `__ -- `Neural Machine Translation by Jointly Learning to Align and - Translate `__ -- `A Neural Conversational Model `__ - - **요구 사항** """ from __future__ import unicode_literals, print_function, division @@ -103,7 +92,7 @@ # # 이 프로젝트의 데이터는 수천 개의 영어-프랑스어 번역 쌍입니다. # -# `Open Data Stack Exchange `__ +# `Open Data Stack Exchange `__ # 에 관한 이 질문은 https://tatoeba.org/eng/downloads 에서 다운 로드가 가능한 # 공개 번역 사이트 https://tatoeba.org/ 를 알려 주었습니다. 더 나은 방법으로 # 언어 쌍을 개별 텍스트 파일로 분할하는 추가 작업을 수행한 @@ -302,7 +291,7 @@ def prepareData(lang1, lang2, reverse=False): # # 다음 문장 "Je ne suis pas le chat noir" → "I am not the black cat" # 를 살펴 봅시다. 입력 문장의 단어 대부분은 출력 문장에서 -# 직역("chat noir" 와 "black cat")되지만 약간 다른 순서도 있습니다. +# 직역("chat noir" 와 "black cat")되지만 약간 다른 순서도 있습니다. # "ne/pas" 구조로 인해 입력 문장에 단어가 하나 더 있습니다. # 입력 단어의 시퀀스를 직역해서 정확한 번역을 만드는 # 것은 어려울 것입니다. @@ -561,7 +550,7 @@ def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, deco decoder_output, decoder_hidden, decoder_attention = decoder( decoder_input, decoder_hidden, encoder_outputs) topv, topi = decoder_output.topk(1) - decoder_input = topi.squeeze().detach() # 입력으로 사용할 부분을 히스토리에서 분리 + decoder_input = topi.squeeze().detach() # 입력으로 사용할 부분을 히스토리에서 분리 loss += criterion(decoder_output, target_tensor[di]) if decoder_input.item() == EOS_token: diff --git a/intermediate_source/spatial_transformer_tutorial.py b/intermediate_source/spatial_transformer_tutorial.py index f17bd6b61..ce646a265 100644 --- a/intermediate_source/spatial_transformer_tutorial.py +++ b/intermediate_source/spatial_transformer_tutorial.py @@ -2,22 +2,22 @@ """ 공간 변형 네트워크(Spatial Transformer Networks) 튜토리얼 ===================================== -**저자**: `Ghassen HAMROUNI `_ +**저자**: `Ghassen HAMROUNI `_ **번역**: `황성수 `_ , `정신유 `_ .. figure:: /_static/img/stn/FSeq.png -이 튜토리얼에서는 공간 변형 네트워크(spatial transformer networks, 이하 STN)이라 -불리는 비주얼 어텐션 메커니즘을 이용해 신경망을 증강(augment)시키는 방법에 대해 +이 튜토리얼에서는 공간 변형 네트워크(spatial transformer networks, 이하 STN)이라 +불리는 비주얼 어텐션 메커니즘을 이용해 신경망을 증강(augment)시키는 방법에 대해 학습합니다. 이 방법에 대한 자세한 내용은 `DeepMind paper `__ 에서 확인할 수 있습니다. 
-STN은 어떠한 공간적 변형(spatial transformation)에도 적용할 수 있는 미분 가능한 +STN은 어떠한 공간적 변형(spatial transformation)에도 적용할 수 있는 미분 가능한 어텐션의 일반화입니다. 따라서 STN은 신경망의 기하학적 불변성(geometric invariance)을 강화하기 위해 입력 이미지를 대상으로 어떠한 공간적 변형을 수행해야 하는지 학습하도록 합니다. -예를 들어 이미지의 관심 영역을 잘라내거나, 크기를 조정하거나, 방향(orientation)을 -수정할 수 있습니다. CNN은 이러한 회전, 크기 조정 등의 일반적인 아핀(affine) 변환된 +예를 들어 이미지의 관심 영역을 잘라내거나, 크기를 조정하거나, 방향(orientation)을 +수정할 수 있습니다. CNN은 이러한 회전, 크기 조정 등의 일반적인 아핀(affine) 변환된 입력에 대해 결과의 변동이 크기 때문에 (민감하기 때문에), STN은 이를 극복하는데 매우 유용한 메커니즘이 될 수 있습니다. -STN이 가진 장점 중 하나는 아주 작은 수정만으로 기존에 사용하던 CNN에 간단하게 연결 시킬 +STN이 가진 장점 중 하나는 아주 작은 수정만으로 기존에 사용하던 CNN에 간단하게 연결 시킬 수 있다는 것입니다. """ # 라이센스: BSD @@ -39,9 +39,14 @@ # 데이터 불러오기 # ---------------- # -# 이 튜토리얼에서는 MNIST 데이터셋을 이용해 실험합니다. 실험에는 STN으로 +# 이 튜토리얼에서는 MNIST 데이터셋을 이용해 실험합니다. 실험에는 STN으로 # 증강된 일반적인 CNN을 사용합니다. +from six.moves import urllib +opener = urllib.request.build_opener() +opener.addheaders = [('User-agent', 'Mozilla/5.0')] +urllib.request.install_opener(opener) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 학습용 데이터셋 @@ -142,7 +147,7 @@ def forward(self, x): # # 이제 SGD 알고리즘을 이용해 모델을 학습시켜 봅시다. 앞서 구성한 신경망은 # 감독 학습 방식(supervised way)으로 분류 문제를 학습합니다. 또한 이 모델은 -# end-to-end 방식으로 STN을 자동으로 학습합니다. +# end-to-end 방식으로 STN을 자동으로 학습합니다. optimizer = optim.SGD(model.parameters(), lr=0.01) @@ -205,7 +210,7 @@ def convert_image_np(inp): inp = np.clip(inp, 0, 1) return inp -# 학습 후 공간 변환 계층의 출력을 시각화하고, 입력 이미지 배치 데이터 및 +# 학습 후 공간 변환 계층의 출력을 시각화하고, 입력 이미지 배치 데이터 및 # STN을 사용해 변환된 배치 데이터를 시각화 합니다. diff --git a/intermediate_source/speech_command_recognition_with_torchaudio.py b/intermediate_source/speech_command_recognition_with_torchaudio.py new file mode 100644 index 000000000..519e714ab --- /dev/null +++ b/intermediate_source/speech_command_recognition_with_torchaudio.py @@ -0,0 +1,543 @@ +""" +Speech Command Recognition with torchaudio +****************************************** + +This tutorial will show you how to correctly format an audio dataset and +then train/test an audio classifier network on the dataset. + +Colab has GPU option available. In the menu tabs, select “Runtime” then +“Change runtime type”. In the pop-up that follows, you can choose GPU. +After the change, your runtime should automatically restart (which means +information from executed cells disappear). + +First, let’s import the common torch packages such as +`torchaudio `__ that can be installed +by following the instructions on the website. + +""" + +# Uncomment the following line to run in Google Colab + +# CPU: +# !pip install torch==1.7.0+cpu torchvision==0.8.1+cpu torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html + +# GPU: +# !pip install torch==1.7.0+cu101 torchvision==0.8.1+cu101 torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html + +# For interactive demo at the end: +# !pip install pydub + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torchaudio + +import matplotlib.pyplot as plt +import IPython.display as ipd +from tqdm.notebook import tqdm + + +###################################################################### +# Let’s check if a CUDA GPU is available and select our device. Running +# the network on a GPU will greatly decrease the training/testing runtime. 
+# + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +print(device) + + +###################################################################### +# Importing the Dataset +# --------------------- +# +# We use torchaudio to download and represent the dataset. Here we use +# `SpeechCommands `__, which is a +# datasets of 35 commands spoken by different people. The dataset +# ``SPEECHCOMMANDS`` is a ``torch.utils.data.Dataset`` version of the +# dataset. In this dataset, all audio files are about 1 second long (and +# so about 16000 time frames long). +# +# The actual loading and formatting steps happen when a data point is +# being accessed, and torchaudio takes care of converting the audio files +# to tensors. If one wants to load an audio file directly instead, +# ``torchaudio.load()`` can be used. It returns a tuple containing the +# newly created tensor along with the sampling frequency of the audio file +# (16kHz for SpeechCommands). +# +# Going back to the dataset, here we create a subclass that splits it into +# standard training, validation, testing subsets. +# + +from torchaudio.datasets import SPEECHCOMMANDS +import os + + +class SubsetSC(SPEECHCOMMANDS): + def __init__(self, subset: str = None): + super().__init__("./", download=True) + + def load_list(filename): + filepath = os.path.join(self._path, filename) + with open(filepath) as fileobj: + return [os.path.join(self._path, line.strip()) for line in fileobj] + + if subset == "validation": + self._walker = load_list("validation_list.txt") + elif subset == "testing": + self._walker = load_list("testing_list.txt") + elif subset == "training": + excludes = load_list("validation_list.txt") + load_list("testing_list.txt") + excludes = set(excludes) + self._walker = [w for w in self._walker if w not in excludes] + + +# Create training and testing split of the data. We do not use validation in this tutorial. +train_set = SubsetSC("training") +test_set = SubsetSC("testing") + +waveform, sample_rate, label, speaker_id, utterance_number = train_set[0] + + +###################################################################### +# A data point in the SPEECHCOMMANDS dataset is a tuple made of a waveform +# (the audio signal), the sample rate, the utterance (label), the ID of +# the speaker, the number of the utterance. +# + +print("Shape of waveform: {}".format(waveform.size())) +print("Sample rate of waveform: {}".format(sample_rate)) + +plt.plot(waveform.t().numpy()); + + +###################################################################### +# Let’s find the list of labels available in the dataset. +# + +labels = sorted(list(set(datapoint[2] for datapoint in train_set))) +labels + + +###################################################################### +# The 35 audio labels are commands that are said by users. The first few +# files are people saying “marvin”. +# + +waveform_first, *_ = train_set[0] +ipd.Audio(waveform_first.numpy(), rate=sample_rate) + +waveform_second, *_ = train_set[1] +ipd.Audio(waveform_second.numpy(), rate=sample_rate) + + +###################################################################### +# The last file is someone saying “visual”. +# + +waveform_last, *_ = train_set[-1] +ipd.Audio(waveform_last.numpy(), rate=sample_rate) + + +###################################################################### +# Formatting the Data +# ------------------- +# +# This is a good place to apply transformations to the data. 
For the +# waveform, we downsample the audio for faster processing without losing +# too much of the classification power. +# +# We don’t need to apply other transformations here. It is common for some +# datasets though to have to reduce the number of channels (say from +# stereo to mono) by either taking the mean along the channel dimension, +# or simply keeping only one of the channels. Since SpeechCommands uses a +# single channel for audio, this is not needed here. +# + +new_sample_rate = 8000 +transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=new_sample_rate) +transformed = transform(waveform) + +ipd.Audio(transformed.numpy(), rate=new_sample_rate) + + +###################################################################### +# We are encoding each word using its index in the list of labels. +# + + +def label_to_index(word): + # Return the position of the word in labels + return torch.tensor(labels.index(word)) + + +def index_to_label(index): + # Return the word corresponding to the index in labels + # This is the inverse of label_to_index + return labels[index] + + +word_start = "yes" +index = label_to_index(word_start) +word_recovered = index_to_label(index) + +print(word_start, "-->", index, "-->", word_recovered) + + +###################################################################### +# To turn a list of data point made of audio recordings and utterances +# into two batched tensors for the model, we implement a collate function +# which is used by the PyTorch DataLoader that allows us to iterate over a +# dataset by batches. Please see `the +# documentation `__ +# for more information about working with a collate function. +# +# In the collate function, we also apply the resampling, and the text +# encoding. +# + + +def pad_sequence(batch): + # Make all tensor in a batch the same length by padding with zeros + batch = [item.t() for item in batch] + batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=0.) + return batch.permute(0, 2, 1) + + +def collate_fn(batch): + + # A data tuple has the form: + # waveform, sample_rate, label, speaker_id, utterance_number + + tensors, targets = [], [] + + # Gather in lists, and encode labels as indices + for waveform, _, label, *_ in batch: + tensors += [waveform] + targets += [label_to_index(label)] + + # Group the list of tensors into a batched tensor + tensors = pad_sequence(tensors) + targets = torch.stack(targets) + + return tensors, targets + + +batch_size = 256 + +if device == "cuda": + num_workers = 1 + pin_memory = True +else: + num_workers = 0 + pin_memory = False + +train_loader = torch.utils.data.DataLoader( + train_set, + batch_size=batch_size, + shuffle=True, + collate_fn=collate_fn, + num_workers=num_workers, + pin_memory=pin_memory, +) +test_loader = torch.utils.data.DataLoader( + test_set, + batch_size=batch_size, + shuffle=False, + drop_last=False, + collate_fn=collate_fn, + num_workers=num_workers, + pin_memory=pin_memory, +) + + +###################################################################### +# Define the Network +# ------------------ +# +# For this tutorial we will use a convolutional neural network to process +# the raw audio data. Usually more advanced transforms are applied to the +# audio data, however CNNs can be used to accurately process the raw data. +# The specific architecture is modeled after the M5 network architecture +# described in `this paper `__. 
An +# important aspect of models processing raw audio data is the receptive +# field of their first layer’s filters. Our model’s first filter is length +# 80 so when processing audio sampled at 8kHz the receptive field is +# around 10ms (and at 4kHz, around 20 ms). This size is similar to speech +# processing applications that often use receptive fields ranging from +# 20ms to 40ms. +# + + +class M5(nn.Module): + def __init__(self, n_input=1, n_output=35, stride=16, n_channel=32): + super().__init__() + self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride) + self.bn1 = nn.BatchNorm1d(n_channel) + self.pool1 = nn.MaxPool1d(4) + self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3) + self.bn2 = nn.BatchNorm1d(n_channel) + self.pool2 = nn.MaxPool1d(4) + self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3) + self.bn3 = nn.BatchNorm1d(2 * n_channel) + self.pool3 = nn.MaxPool1d(4) + self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3) + self.bn4 = nn.BatchNorm1d(2 * n_channel) + self.pool4 = nn.MaxPool1d(4) + self.fc1 = nn.Linear(2 * n_channel, n_output) + + def forward(self, x): + x = self.conv1(x) + x = F.relu(self.bn1(x)) + x = self.pool1(x) + x = self.conv2(x) + x = F.relu(self.bn2(x)) + x = self.pool2(x) + x = self.conv3(x) + x = F.relu(self.bn3(x)) + x = self.pool3(x) + x = self.conv4(x) + x = F.relu(self.bn4(x)) + x = self.pool4(x) + x = F.avg_pool1d(x, x.shape[-1]) + x = x.permute(0, 2, 1) + x = self.fc1(x) + return F.log_softmax(x, dim=2) + + +model = M5(n_input=transformed.shape[0], n_output=len(labels)) +model.to(device) +print(model) + + +def count_parameters(model): + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + +n = count_parameters(model) +print("Number of parameters: %s" % n) + + +###################################################################### +# We will use the same optimization technique used in the paper, an Adam +# optimizer with weight decay set to 0.0001. At first, we will train with +# a learning rate of 0.01, but we will use a ``scheduler`` to decrease it +# to 0.001 during training after 20 epochs. +# + +optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001) +scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1) # reduce the learning after 20 epochs by a factor of 10 + + +###################################################################### +# Training and Testing the Network +# -------------------------------- +# +# Now let’s define a training function that will feed our training data +# into the model and perform the backward pass and optimization steps. For +# training, the loss we will use is the negative log-likelihood. The +# network will then be tested after each epoch to see how the accuracy +# varies during the training. +# + + +def train(model, epoch, log_interval): + model.train() + for batch_idx, (data, target) in enumerate(train_loader): + + data = data.to(device) + target = target.to(device) + + # apply transform and model on whole batch directly on device + data = transform(data) + output = model(data) + + # negative log-likelihood for a tensor of size (batch x 1 x n_output) + loss = F.nll_loss(output.squeeze(), target) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # print training stats + if batch_idx % log_interval == 0: + print(f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. 
* batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}") + + # update progress bar + pbar.update(pbar_update) + # record loss + losses.append(loss.item()) + + +###################################################################### +# Now that we have a training function, we need to make one for testing +# the networks accuracy. We will set the model to ``eval()`` mode and then +# run inference on the test dataset. Calling ``eval()`` sets the training +# variable in all modules in the network to false. Certain layers like +# batch normalization and dropout layers behave differently during +# training so this step is crucial for getting correct results. +# + + +def number_of_correct(pred, target): + # count number of correct predictions + return pred.squeeze().eq(target).sum().item() + + +def get_likely_index(tensor): + # find most likely label index for each element in the batch + return tensor.argmax(dim=-1) + + +def test(model, epoch): + model.eval() + correct = 0 + for data, target in test_loader: + + data = data.to(device) + target = target.to(device) + + # apply transform and model on whole batch directly on device + data = transform(data) + output = model(data) + + pred = get_likely_index(output) + correct += number_of_correct(pred, target) + + # update progress bar + pbar.update(pbar_update) + + print(f"\nTest Epoch: {epoch}\tAccuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n") + + +###################################################################### +# Finally, we can train and test the network. We will train the network +# for ten epochs then reduce the learn rate and train for ten more epochs. +# The network will be tested after each epoch to see how the accuracy +# varies during the training. +# + +log_interval = 20 +n_epoch = 2 + +pbar_update = 1 / (len(train_loader) + len(test_loader)) +losses = [] + +# The transform needs to live on the same device as the model and the data. +transform = transform.to(device) +with tqdm(total=n_epoch) as pbar: + for epoch in range(1, n_epoch + 1): + train(model, epoch, log_interval) + test(model, epoch) + scheduler.step() + +# Let's plot the training loss versus the number of iteration. +# plt.plot(losses); +# plt.title("training loss"); + + +###################################################################### +# The network should be more than 65% accurate on the test set after 2 +# epochs, and 85% after 21 epochs. Let’s look at the last words in the +# train set, and see how the model did on it. +# + + +def predict(tensor): + # Use the model to predict the label of the waveform + tensor = tensor.to(device) + tensor = transform(tensor) + tensor = model(tensor.unsqueeze(0)) + tensor = get_likely_index(tensor) + tensor = index_to_label(tensor.squeeze()) + return tensor + + +waveform, sample_rate, utterance, *_ = train_set[-1] +ipd.Audio(waveform.numpy(), rate=sample_rate) + +print(f"Expected: {utterance}. Predicted: {predict(waveform)}.") + + +###################################################################### +# Let’s find an example that isn’t classified correctly, if there is one. +# + +for i, (waveform, sample_rate, utterance, *_) in enumerate(test_set): + output = predict(waveform) + if output != utterance: + ipd.Audio(waveform.numpy(), rate=sample_rate) + print(f"Data point #{i}. Expected: {utterance}. 
Predicted: {output}.") + break +else: + print("All examples in this dataset were correctly classified!") + print("In this case, let's just look at the last data point") + ipd.Audio(waveform.numpy(), rate=sample_rate) + print(f"Data point #{i}. Expected: {utterance}. Predicted: {output}.") + + +###################################################################### +# Feel free to try with one of your own recordings of one of the labels! +# For example, using Colab, say “Go” while executing the cell below. This +# will record one second of audio and try to classify it. +# + +from google.colab import output as colab_output +from base64 import b64decode +from io import BytesIO +from pydub import AudioSegment + + +RECORD = """ +const sleep = time => new Promise(resolve => setTimeout(resolve, time)) +const b2text = blob => new Promise(resolve => { + const reader = new FileReader() + reader.onloadend = e => resolve(e.srcElement.result) + reader.readAsDataURL(blob) +}) +var record = time => new Promise(async resolve => { + stream = await navigator.mediaDevices.getUserMedia({ audio: true }) + recorder = new MediaRecorder(stream) + chunks = [] + recorder.ondataavailable = e => chunks.push(e.data) + recorder.start() + await sleep(time) + recorder.onstop = async ()=>{ + blob = new Blob(chunks) + text = await b2text(blob) + resolve(text) + } + recorder.stop() +}) +""" + + +def record(seconds=1): + display(ipd.Javascript(RECORD)) + print(f"Recording started for {seconds} seconds.") + s = colab_output.eval_js("record(%d)" % (seconds * 1000)) + print("Recording ended.") + b = b64decode(s.split(",")[1]) + + fileformat = "wav" + filename = f"_audio.{fileformat}" + AudioSegment.from_file(BytesIO(b)).export(filename, format=fileformat) + return torchaudio.load(filename) + + +waveform, sample_rate = record() +print(f"Predicted: {predict(waveform)}.") +ipd.Audio(waveform.numpy(), rate=sample_rate) + + +###################################################################### +# Conclusion +# ---------- +# +# In this tutorial, we used torchaudio to load a dataset and resample the +# signal. We have then defined a neural network that we trained to +# recognize a given command. There are also other data preprocessing +# methods, such as finding the mel frequency cepstral coefficients (MFCC), +# that can reduce the size of the dataset. This transform is also +# available in torchaudio as ``torchaudio.transforms.MFCC``. +# diff --git a/intermediate_source/torchvision_tutorial.rst b/intermediate_source/torchvision_tutorial.rst index b2207d09d..51919a80e 100644 --- a/intermediate_source/torchvision_tutorial.rst +++ b/intermediate_source/torchvision_tutorial.rst @@ -42,7 +42,7 @@ TorchVision 객체 검출 미세조정(Finetuning) 튜토리얼 새로운 키포인트 표현에 대해 "references/detection/transforms.py" 코드 부분을 수정 해야 할 수도 있습니다. 모델이 위의 방법대로 리턴을 하면, 학습과 평가 둘 다에 대해서 동작을 할 것이며 -평가 스크립트는 ``pycocotools`` 를 사용하게 될 것입니다. +평가 스크립트는 `pip install pycocotools`` 로 설치 가능한 ``pycocotools`` 를 사용하게 될 것입니다. .. note :: 윈도우즈에서는 ``pip install git+https://github.com/gautamchitnis/cocoapi.git@cocodataset-master#subdirectory=PythonAPI`` @@ -239,7 +239,7 @@ COCO에 대해 미리 학습된 모델에서 시작하여 특정 클래스를 # 만약 백본이 텐서를 리턴할때, featmap_names 는 [0] 이 될 것이라고 예상합니다. # 일반적으로 백본은 OrderedDict[Tensor] 타입을 리턴해야 합니다. # 그리고 특징맵에서 사용할 featmap_names 값을 정할 수 있습니다. 
- roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=[0], + roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'], output_size=7, sampling_ratio=2) @@ -291,10 +291,10 @@ PennFudan 데이터셋을 위한 인스턴스 분할 모델 모든 것을 하나로 합치기 --------------------------- -``references/detection/`` 폴더내에 검출 모델들의 학습과 평과를 쉽게 하기 위한 도움 함수들이 있습니다. +``references/detection/`` 폴더 내에 검출 모델들의 학습과 평과를 쉽게 하기 위한 도움 함수들이 있습니다. 여기서 ``references/detection/engine.py``, ``references/detection/utils.py``, ``references/detection/transforms.py`` 를 사용 할 것입니다. -위 파일들을 폴더로 복사하고 사용합시다. +``references/detection`` 아래의 모든 파일과 폴더들을 사용자의 폴더로 복사한 뒤 사용합니다. 데이터 증강 / 변환을 위한 도움 함수를 작성해 봅시다 diff --git a/prototype_source/README.txt b/prototype_source/README.txt index bc32e121f..9b1f04c44 100644 --- a/prototype_source/README.txt +++ b/prototype_source/README.txt @@ -7,7 +7,7 @@ Prototype Tutorials 2. graph_mode_static_quantization_tutorial.py Graph Mode Post Training Static Quantization in PyTorch https://pytorch.org/tutorials/prototype/graph_mode_static_quantization_tutorial.html - + 3. graph_mode_dynamic_bert_tutorial.rst Graph Mode Dynamic Quantization on BERT https://github.com/pytorch/tutorials/blob/master/prototype_source/graph_mode_dynamic_bert_tutorial.rst @@ -19,3 +19,20 @@ Prototype Tutorials 5. torchscript_freezing.py Model Freezing in TorchScript https://github.com/pytorch/tutorials/blob/master/prototype_source/torchscript_freezing.py + +6. vulkan_workflow.rst + Vulkan Backend User Workflow + https://pytorch.org/tutorials/intermediate/vulkan_workflow.html + +7. fx_graph_mode_ptq_static.rst + FX Graph Mode Post Training Static Quantization + https://pytorch.org/tutorials/prototype/fx_graph_mode_ptq_static.html + +8. fx_graph_mode_ptq_dynamic.py + FX Graph Mode Post Training Dynamic Quantization + https://pytorch.org/tutorials/prototype/fx_graph_mode_ptq_dynamic.html + +9. fx_graph_mode_quant_guide.py + FX Graph Mode Quantization User Guide + https://pytorch.org/tutorials/prototype/fx_graph_mode_quant_guide.html + diff --git a/prototype_source/fx_graph_mode_ptq_dynamic.py b/prototype_source/fx_graph_mode_ptq_dynamic.py new file mode 100644 index 000000000..e2d3814f5 --- /dev/null +++ b/prototype_source/fx_graph_mode_ptq_dynamic.py @@ -0,0 +1,292 @@ +""" +(prototype) FX Graph Mode Post Training Dynamic Quantization +=========================================================== + +**Author**: `Jerry Zhang `_ + +This tutorial introduces the steps to do post training dynamic quantization in graph mode based on ``torch.fx``. +We have a separate tutorial for `FX Graph Mode Post Training Static Quantization `_, +comparison between FX Graph Mode Quantization and Eager Mode Quantization can be found in the `quantization docs `_ + +tldr; The FX Graph Mode API for dynamic quantization looks like the following: + +.. 
code:: python + + import torch + from torch.quantization import default_dynamic_qconfig + # Note that this is temporary, we'll expose these functions to torch.quantization after official releasee + from torch.quantization.quantize_fx import prepare_fx, convert_fx + + float_model.eval() + qconfig = get_default_qconfig("fbgemm") + qconfig_dict = {"": qconfig} + prepared_model = prepare_fx(float_model, qconfig_dict) # fuse modules and insert observers + # no calibration is required for dynamic quantization + quantized_model = convert_fx(prepared_model) # convert the model to a dynamically quantized model + +In this tutorial, we’ll apply dynamic quantization to an LSTM-based next word-prediction model, +closely following the word language model from the PyTorch examples. +We will copy the code from `Dynamic Quantization on an LSTM Word Language Model `_ +and omit the descriptions. + +""" + + +################################################### +# 1. Define the Model, Download Data and Model +# -------------------------------------------- +# +# Download the `data `_ +# and unzip to data folder +# +# .. code:: +# +# mkdir data +# cd data +# wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip +# unzip wikitext-2-v1.zip +# +# Download model to the data folder: +# +# .. code:: +# +# wget https://s3.amazonaws.com/pytorch-tutorial-assets/word_language_model_quantize.pth +# +# Define the model: + +# imports +import os +from io import open +import time +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# Model Definition +class LSTMModel(nn.Module): + """Container module with an encoder, a recurrent module, and a decoder.""" + + def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5): + super(LSTMModel, self).__init__() + self.drop = nn.Dropout(dropout) + self.encoder = nn.Embedding(ntoken, ninp) + self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout) + self.decoder = nn.Linear(nhid, ntoken) + + self.init_weights() + + self.nhid = nhid + self.nlayers = nlayers + + def init_weights(self): + initrange = 0.1 + self.encoder.weight.data.uniform_(-initrange, initrange) + self.decoder.bias.data.zero_() + self.decoder.weight.data.uniform_(-initrange, initrange) + + def forward(self, input, hidden): + emb = self.drop(self.encoder(input)) + output, hidden = self.rnn(emb, hidden) + output = self.drop(output) + decoded = self.decoder(output) + return decoded, hidden + + +def init_hidden(lstm_model, bsz): + # get the weight tensor and create hidden layer in the same device + weight = lstm_model.encoder.weight + # get weight from quantized model + if not isinstance(weight, torch.Tensor): + weight = weight() + device = weight.device + nlayers = lstm_model.rnn.num_layers + nhid = lstm_model.rnn.hidden_size + return (torch.zeros(nlayers, bsz, nhid, device=device), + torch.zeros(nlayers, bsz, nhid, device=device)) + + +# Load Text Data +class Dictionary(object): + def __init__(self): + self.word2idx = {} + self.idx2word = [] + + def add_word(self, word): + if word not in self.word2idx: + self.idx2word.append(word) + self.word2idx[word] = len(self.idx2word) - 1 + return self.word2idx[word] + + def __len__(self): + return len(self.idx2word) + + +class Corpus(object): + def __init__(self, path): + self.dictionary = Dictionary() + self.train = self.tokenize(os.path.join(path, 'wiki.train.tokens')) + self.valid = self.tokenize(os.path.join(path, 'wiki.valid.tokens')) + self.test = self.tokenize(os.path.join(path, 'wiki.test.tokens')) + + def 
tokenize(self, path): + """Tokenizes a text file.""" + assert os.path.exists(path) + # Add words to the dictionary + with open(path, 'r', encoding="utf8") as f: + for line in f: + words = line.split() + [''] + for word in words: + self.dictionary.add_word(word) + + # Tokenize file content + with open(path, 'r', encoding="utf8") as f: + idss = [] + for line in f: + words = line.split() + [''] + ids = [] + for word in words: + ids.append(self.dictionary.word2idx[word]) + idss.append(torch.tensor(ids).type(torch.int64)) + ids = torch.cat(idss) + + return ids + +model_data_filepath = 'data/' + +corpus = Corpus(model_data_filepath + 'wikitext-2') + +ntokens = len(corpus.dictionary) + +# Load Pretrained Model +model = LSTMModel( + ntoken = ntokens, + ninp = 512, + nhid = 256, + nlayers = 5, +) + +model.load_state_dict( + torch.load( + model_data_filepath + 'word_language_model_quantize.pth', + map_location=torch.device('cpu') + ) + ) + +model.eval() +print(model) + +bptt = 25 +criterion = nn.CrossEntropyLoss() +eval_batch_size = 1 + +# create test data set +def batchify(data, bsz): + # Work out how cleanly we can divide the dataset into bsz parts. + nbatch = data.size(0) // bsz + # Trim off any extra elements that wouldn't cleanly fit (remainders). + data = data.narrow(0, 0, nbatch * bsz) + # Evenly divide the data across the bsz batches. + return data.view(bsz, -1).t().contiguous() + +test_data = batchify(corpus.test, eval_batch_size) + +# Evaluation functions +def get_batch(source, i): + seq_len = min(bptt, len(source) - 1 - i) + data = source[i:i+seq_len] + target = source[i+1:i+1+seq_len].reshape(-1) + return data, target + +def repackage_hidden(h): + """Wraps hidden states in new Tensors, to detach them from their history.""" + + if isinstance(h, torch.Tensor): + return h.detach() + else: + return tuple(repackage_hidden(v) for v in h) + +def evaluate(model_, data_source): + # Turn on evaluation mode which disables dropout. + model_.eval() + total_loss = 0. + hidden = init_hidden(model_, eval_batch_size) + with torch.no_grad(): + for i in range(0, data_source.size(0) - 1, bptt): + data, targets = get_batch(data_source, i) + output, hidden = model_(data, hidden) + hidden = repackage_hidden(hidden) + output_flat = output.view(-1, ntokens) + total_loss += len(data) * criterion(output_flat, targets).item() + return total_loss / (len(data_source) - 1) + +###################################################################### +# 2. Post Training Dynamic Quantization +# ------------------------------------- +# Now we can dynamically quantize the model. +# We can use the same function as post training static quantization but with a dynamic qconfig. 
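+######################################################################
+# As a side note before the full flow below (a minimal sketch, assuming the
+# ``qconfig_dict`` conventions documented for FX graph mode quantization),
+# ``"module_name"`` entries can restrict quantization to a single submodule,
+# for example only the ``decoder`` Linear of the model defined above:
+#
+# .. code:: python
+#
+#     qconfig_dict = {"module_name": [("decoder", default_dynamic_qconfig)]}
+#     prepared_model = prepare_fx(copy.deepcopy(model), qconfig_dict)
+#     quantized_model = convert_fx(prepared_model)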
+ +from torch.quantization.quantize_fx import prepare_fx, convert_fx +from torch.quantization import default_dynamic_qconfig, float_qparams_weight_only_qconfig + +# Full docs for supported qconfig for floating point modules/ops can be found in docs for quantization (TODO: link) +# Full docs for qconfig_dict can be found in the documents of prepare_fx (TODO: link) +qconfig_dict = { + "object_type": [ + (nn.Embedding, float_qparams_weight_only_qconfig), + (nn.LSTM, default_dynamic_qconfig), + (nn.Linear, default_dynamic_qconfig) + ] +} +# Deepcopying the original model because quantization api changes the model inplace and we want +# to keep the original model for future comparison +model_to_quantize = copy.deepcopy(model) +prepared_model = prepare_fx(model_to_quantize, qconfig_dict) +print("prepared model:", prepared_model) +quantized_model = convert_fx(prepared_model) +print("quantized model", quantized_model) + + +###################################################################### +# For dynamically quantized objects, we didn't do anything in ``prepare_fx`` for modules, +# but will insert observers for weight for dynamically quantizable forunctionals and torch ops. +# We also fuse the modules like Conv + Bn, Linear + ReLU. +# +# In convert we'll convert the float modules to dynamically quantized modules and +# convert float ops to dynamically quantized ops. We can see in the example model, +# ``nn.Embedding``, ``nn.Linear`` and ``nn.LSTM`` are dynamically quantized. +# +# Now we can compare the size and runtime of the quantized model. + +def print_size_of_model(model): + torch.save(model.state_dict(), "temp.p") + print('Size (MB):', os.path.getsize("temp.p")/1e6) + os.remove('temp.p') + +print_size_of_model(model) +print_size_of_model(quantized_model) + +###################################################################### +# There is a 4x size reduction because we quantized all the weights +# in the model (nn.Embedding, nn.Linear and nn.LSTM) from float (4 bytes) to quantized int (1 byte). + +torch.set_num_threads(1) + +def time_model_evaluation(model, test_data): + s = time.time() + loss = evaluate(model, test_data) + elapsed = time.time() - s + print('''loss: {0:.3f}\nelapsed time (seconds): {1:.1f}'''.format(loss, elapsed)) + +time_model_evaluation(model, test_data) +time_model_evaluation(quantized_model, test_data) + +##################################################################### +# There is a roughly 2x speedup for this model. Also note that the speedup +# may vary depending on model, device, build, input batch sizes, threading etc. +# +# 3. Conclusion +# ------------- +# This tutorial introduces the api for post training dynamic quantization in FX Graph Mode, +# which dynamically quantizes the same modules as Eager Mode Quantization. diff --git a/prototype_source/fx_graph_mode_ptq_static.rst b/prototype_source/fx_graph_mode_ptq_static.rst new file mode 100644 index 000000000..410f5a116 --- /dev/null +++ b/prototype_source/fx_graph_mode_ptq_static.rst @@ -0,0 +1,438 @@ +(prototype) FX Graph Mode Post Training Static Quantization +=========================================================== +**Author**: `Jerry Zhang `_ + +This tutorial introduces the steps to do post training static quantization in graph mode based on +`torch.fx `_. 
+
+The advantage of FX graph mode quantization is that we can perform quantization fully automatically on the model,
+although there might be some effort required to make the model compatible with FX Graph Mode Quantization (symbolically traceable with ``torch.fx``).
+We'll have a separate tutorial to show how to make the part of the model we want to quantize compatible with FX Graph Mode Quantization.
+We also have a tutorial for `FX Graph Mode Post Training Dynamic Quantization `_.
+tldr; The FX Graph Mode API looks like the following:
+
+.. code:: python
+
+    import torch
+    from torch.quantization import get_default_qconfig
+    # Note that this is temporary, we'll expose these functions to torch.quantization after official release
+    from torch.quantization.quantize_fx import prepare_fx, convert_fx
+    float_model.eval()
+    qconfig = get_default_qconfig("fbgemm")
+    qconfig_dict = {"": qconfig}
+    def calibrate(model, data_loader):
+        model.eval()
+        with torch.no_grad():
+            for image, target in data_loader:
+                model(image)
+    prepared_model = prepare_fx(float_model, qconfig_dict) # fuse modules and insert observers
+    calibrate(prepared_model, data_loader_test) # run calibration on sample data
+    quantized_model = convert_fx(prepared_model) # convert the calibrated model to a quantized model
+
+
+
+1. Motivation of FX Graph Mode Quantization
+-------------------------------------------
+
+Currently PyTorch only has eager mode quantization: `Static Quantization with Eager Mode in PyTorch `_.
+
+We can see there are multiple manual steps involved in the process, including:
+
+- Explicitly quantize and dequantize activations; this is time consuming when floating point and quantized operations are mixed in a model.
+- Explicitly fuse modules; this requires manually identifying the sequence of convolutions, batch norms, relus and other fusion patterns.
+- Special handling is needed for PyTorch tensor operations (like ``add``, ``concat``, etc.)
+- Functionals did not have first class support (``functional.conv2d`` and ``functional.linear`` would not get quantized)
+
+Most of these required modifications come from the underlying limitations of eager mode quantization. Eager mode works at the module level since it cannot inspect the code that is actually run (in the forward function); quantization is achieved by module swapping, and since we don't know how the modules are used in the forward function in eager mode, users have to insert ``QuantStub`` and ``DeQuantStub`` manually to mark the points where they want to quantize or dequantize.
+In graph mode, we can inspect the actual code that is executed in the forward function (e.g. aten function calls), and quantization is achieved by module and graph manipulations. Since graph mode has full visibility of the code that is run, our tool is able to automatically figure out things like which modules to fuse and where to insert observer calls, quantize/dequantize functions, etc., so we are able to automate the whole quantization process.
+
+Advantages of FX Graph Mode Quantization are:
+
+- Simple quantization flow, minimal manual steps
+- Unlocks the possibility of doing higher level optimizations like automatic precision selection
+
+2. Define Helper Functions and Prepare Dataset
+----------------------------------------------
+
+We'll start by doing the necessary imports, defining some helper functions and preparing the data.
+These steps are identical to `Static Quantization with Eager Mode in PyTorch `_.
+ +To run the code in this tutorial using the entire ImageNet dataset, first download imagenet by following the instructions at here `ImageNet Data `_. Unzip the downloaded file into the 'data_path' folder. + +Download the `torchvision resnet18 model `_ and rename it to +``data/resnet18_pretrained_float.pth``. + +.. code:: python + + import numpy as np + import torch + import torch.nn as nn + import torchvision + from torch.utils.data import DataLoader + from torchvision import datasets + import torchvision.transforms as transforms + import os + import time + import sys + import torch.quantization + + # Setup warnings + import warnings + warnings.filterwarnings( + action='ignore', + category=DeprecationWarning, + module=r'.*' + ) + warnings.filterwarnings( + action='default', + module=r'torch.quantization' + ) + + # Specify random seed for repeatable results + _ = torch.manual_seed(191009) + + + from torchvision.models.resnet import resnet18 + from torch.quantization import get_default_qconfig, quantize_jit + + class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self, name, fmt=':f'): + self.name = name + self.fmt = fmt + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + return fmtstr.format(**self.__dict__) + + + def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + + def evaluate(model, criterion, data_loader): + model.eval() + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + cnt = 0 + with torch.no_grad(): + for image, target in data_loader: + output = model(image) + loss = criterion(output, target) + cnt += 1 + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + top1.update(acc1[0], image.size(0)) + top5.update(acc5[0], image.size(0)) + print('') + + return top1, top5 + + def load_model(model_file): + model = resnet18(pretrained=False) + state_dict = torch.load(model_file) + model.load_state_dict(state_dict) + model.to("cpu") + return model + + def print_size_of_model(model): + if isinstance(model, torch.jit.RecursiveScriptModule): + torch.jit.save(model, "temp.p") + else: + torch.jit.save(torch.jit.script(model), "temp.p") + print("Size (MB):", os.path.getsize("temp.p")/1e6) + os.remove("temp.p") + + def prepare_data_loaders(data_path): + + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + dataset = torchvision.datasets.ImageNet( + data_path, split="train", + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + dataset_test = torchvision.datasets.ImageNet( + data_path, split="val", + transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])) + + train_sampler = torch.utils.data.RandomSampler(dataset) + test_sampler = 
torch.utils.data.SequentialSampler(dataset_test) + + data_loader = torch.utils.data.DataLoader( + dataset, batch_size=train_batch_size, + sampler=train_sampler) + + data_loader_test = torch.utils.data.DataLoader( + dataset_test, batch_size=eval_batch_size, + sampler=test_sampler) + + return data_loader, data_loader_test + + data_path = '~/.data/imagenet' + saved_model_dir = 'data/' + float_model_file = 'resnet18_pretrained_float.pth' + + train_batch_size = 30 + eval_batch_size = 50 + + data_loader, data_loader_test = prepare_data_loaders(data_path) + criterion = nn.CrossEntropyLoss() + float_model = load_model(saved_model_dir + float_model_file).to("cpu") + float_model.eval() + + # deepcopy the model since we need to keep the original model around + import copy + model_to_quantize = copy.deepcopy(float_model) + +3. Set model to eval mode +------------------------- +For post training quantization, we'll need to set model to eval mode. + +.. code:: python + + model_to_quantize.eval() + + +4. Specify how to quantize the model with ``qconfig_dict`` +---------------------------------------------------------- + +.. code:: python + + qconfig_dict = {"" : default_qconfig} + +We use the same qconfig used in eager mode quantization, ``qconfig`` is just a named tuple +of the observers for activation and weight. ``qconfig_dict`` is a dictionary with the following configurations: + +.. code:: python + + qconfig = { + " : qconfig_global, + "sub" : qconfig_sub, + "sub.fc" : qconfig_fc, + "sub.conv": None + } + qconfig_dict = { + # qconfig? means either a valid qconfig or None + # optional, global config + "": qconfig?, + # optional, used for module and function types + # could also be split into module_types and function_types if we prefer + "object_type": [ + (torch.nn.Conv2d, qconfig?), + (torch.nn.functional.add, qconfig?), + ..., + ], + # optional, used for module names + "module_name": [ + ("foo.bar", qconfig?) + ..., + ], + # optional, matched in order, first match takes precedence + "module_name_regex": [ + ("foo.*bar.*conv[0-9]+", qconfig?) + ..., + ], + # priority (in increasing order): global, object_type, module_name_regex, module_name + # qconfig == None means fusion and quantization should be skipped for anything + # matching the rule + + # **api subject to change** + # optional: specify the path for standalone modules + # These modules are symbolically traced and quantized as one unit + # so that the call to the submodule appears as one call_module + # node in the forward graph of the GraphModule + "standalone_module_name": [ + "submodule.standalone" + ], + "standalone_module_class": [ + StandaloneModuleClass + ] + } + +Utility functions related to ``qconfig`` can be found in the `qconfig `_ file. + +.. code:: python + + qconfig = get_default_qconfig("fbgemm") + qconfig_dict = {"": qconfig} + +5. Prepare the Model for Post Training Static Quantization +---------------------------------------------------------- + +.. code:: python + + prepared_model = prepare_fx(model_to_quantize, qconfig_dict) + +prepare_fx folds BatchNorm modules into previous Conv2d modules, and insert observers +in appropriate places in the model. + +.. code:: python + + prepared_model = prepare_fx(model_to_quantize, qconfig_dict) + print(prepared_model.graph) + +6. Calibration +-------------- +Calibration function is run after the observers are inserted in the model. 
+The purpose for calibration is to run through some sample examples that is representative of the workload +(for example a sample of the training data set) so that the observers in the model are able to observe +the statistics of the Tensors and we can later use this information to calculate quantization parameters. + +.. code:: python + + def calibrate(model, data_loader): + model.eval() + with torch.no_grad(): + for image, target in data_loader: + model(image) + calibrate(prepared_model, data_loader_test) # run calibration on sample data + +7. Convert the Model to a Quantized Model +----------------------------------------- +``convert_fx`` takes a calibrated model and produces a quantized model. + +.. code:: python + + quantized_model = convert_fx(prepared_model) + print(quantized_model) + +8. Evaluation +------------- +We can now print the size and accuracy of the quantized model. + +.. code:: python + + print("Size of model before quantization") + print_size_of_model(float_model) + print("Size of model after quantization") + print_size_of_model(quantized_model) + top1, top5 = evaluate(quantized_model, criterion, data_loader_test) + print("[before serilaization] Evaluation accuracy on test dataset: %2.2f, %2.2f"%(top1.avg, top5.avg)) + + fx_graph_mode_model_file_path = saved_model_dir + "resnet18_fx_graph_mode_quantized.pth" + + # this does not run due to some erros loading convrelu module: + # ModuleAttributeError: 'ConvReLU2d' object has no attribute '_modules' + # save the whole model directly + # torch.save(quantized_model, fx_graph_mode_model_file_path) + # loaded_quantized_model = torch.load(fx_graph_mode_model_file_path) + + # save with state_dict + # torch.save(quantized_model.state_dict(), fx_graph_mode_model_file_path) + # import copy + # model_to_quantize = copy.deepcopy(float_model) + # prepared_model = prepare_fx(model_to_quantize, {"": qconfig}) + # loaded_quantized_model = convert_fx(prepared_model) + # loaded_quantized_model.load_state_dict(torch.load(fx_graph_mode_model_file_path)) + + # save with script + torch.jit.save(torch.jit.script(quantized_model), fx_graph_mode_model_file_path) + loaded_quantized_model = torch.jit.load(fx_graph_mode_model_file_path) + + top1, top5 = evaluate(loaded_quantized_model, criterion, data_loader_test) + print("[after serialization/deserialization] Evaluation accuracy on test dataset: %2.2f, %2.2f"%(top1.avg, top5.avg)) + +If you want to get better accuracy or performance, try changing the `qconfig_dict`. +We plan to add support for graph mode in the Numerical Suite so that you can +easily determine the sensitivity towards quantization of different modules in a model: `PyTorch Numeric Suite Tutorial `_ + +9. Debugging Quantized Model +---------------------------- +We can also print the weight for quantized an un-quantized conv to see the difference, +we'll first call fuse explicitly to fuse the conv and bn in the model: +Note that ``fuse_fx`` only works in eval mode. + +.. code:: python + + fused = fuse_fx(float_model) + + conv1_weight_after_fuse = fused.conv1[0].weight[0] + conv1_weight_after_quant = quantized_model.conv1.weight().dequantize()[0] + + print(torch.max(abs(conv1_weight_after_fuse - conv1_weight_after_quant))) + +10. Comparison with Baseline Float Model and Eager Mode Quantization +-------------------------------------------------------------------- + +.. 
code:: python + + scripted_float_model_file = "resnet18_scripted.pth" + + print("Size of baseline model") + print_size_of_model(float_model) + + top1, top5 = evaluate(float_model, criterion, data_loader_test) + print("Baseline Float Model Evaluation accuracy: %2.2f, %2.2f"%(top1.avg, top5.avg)) + torch.jit.save(torch.jit.script(float_model), saved_model_dir + scripted_float_model_file) + +In this section we compare the model quantized with FX graph mode quantization with the model +quantized in eager mode. FX graph mode and eager mode produce very similar quantized models, +so the expectation is that the accuracy and speedup are similar as well. + +.. code:: python + + print("Size of Fx graph mode quantized model") + print_size_of_model(quantized_model) + top1, top5 = evaluate(quantized_model, criterion, data_loader_test) + print("FX graph mode quantized model Evaluation accuracy on test dataset: %2.2f, %2.2f"%(top1.avg, top5.avg)) + + from torchvision.models.quantization.resnet import resnet18 + eager_quantized_model = resnet18(pretrained=True, quantize=True).eval() + print("Size of eager mode quantized model") + eager_quantized_model = torch.jit.script(eager_quantized_model) + print_size_of_model(eager_quantized_model) + top1, top5 = evaluate(eager_quantized_model, criterion, data_loader_test) + print("eager mode quantized model Evaluation accuracy on test dataset: %2.2f, %2.2f"%(top1.avg, top5.avg)) + eager_mode_model_file = "resnet18_eager_mode_quantized.pth" + torch.jit.save(eager_quantized_model, saved_model_dir + eager_mode_model_file) + +We can see that the model size and accuracy of FX graph mode and eager mode quantized model are pretty similar. + +Running the model in AIBench (with single threading) gives the following result: + +.. code:: + + Scripted Float Model: + Self CPU time total: 192.48ms + + Scripted Eager Mode Quantized Model: + Self CPU time total: 50.76ms + + Scripted FX Graph Mode Quantized Model: + Self CPU time total: 50.63ms + +As we can see for resnet18 both FX graph mode and eager mode quantized model get similar speed up over the floating point model, +which is around 2-4x faster than the floating point model. But the actual speedup over floating point model may vary +depending on model, device, build, input batch sizes, threading etc. diff --git a/prototype_source/fx_graph_mode_quant_guide.py b/prototype_source/fx_graph_mode_quant_guide.py new file mode 100644 index 000000000..9d693f91a --- /dev/null +++ b/prototype_source/fx_graph_mode_quant_guide.py @@ -0,0 +1,339 @@ +# -*- coding: utf-8 -*- +""" +(prototype) FX Graph Mode Quantization User Guide +=========================================================== + +**Author**: `Jerry Zhang `_ + +FX Graph Mode Quantization requires a symbolically traceable model. +We use the FX framework (TODO: link) to convert a symbolically traceable nn.Module instance to IR, +and we operate on the IR to execute the quantization passes. +Please post your question about symbolically tracing your model in `PyTorch Discussion Forum `_ + +Quantization will only work on the symbolically traceable parts of your model. +Data dependent control flow (if statements / for loops etc using symbolically traced values) are one common pattern which is not supported. +If your model is not symbolically traceable end to end, you have a couple of options to enable FX Graph Mode Quantization only on a part of the model. +You can use any combination of these options: + +1. Non traceable code doesn’t need to be quantized + a. 
Symbolically trace only the code that needs to be quantized + b. Skip symbolic tracing the non-traceable code + +2. Non traceable code needs to be quantized + a. Refactor your code to make it symbolically traceable + b. Write your own observed and quantized submodule + +""" + +#################################################################### +# If the code that is not symbolically traceable does not need to be quantized, we have the following two options +# to run FX Graph Mode Quantization: +# +# 1.a. Symbolically trace only the code that needs to be quantized +# ----------------------------------------------------------------- +# +# When the whole model is not symbolically traceable but the submodule we want to quantize is +# symbolically traceable, we can run quantization only on that submodule. +# +# +# before: +# +# .. code:: python +# +# class M(nn.Module): +# +# def forward(self, x): +# x = non_traceable_code_1(x) +# x = traceable_code(x) +# x = non_traceable_code_2(x) +# return x +# +# +# after: +# +# .. code:: python +# +# class FP32Traceable(nn.Module): +# +# def forward(self, x): +# x = traceable_code(x) +# return x +# +# class M(nn.Module): +# +# def __init__(self): +# self.traceable_submodule = FP32Traceable(...) +# +# def forward(self, x): +# x = self.traceable_code_1(x) +# # We'll only symbolic trace/quantize this submodule +# x = self.traceable_submodule(x) +# x = self.traceable_code_2(x) +# return x +# +# +# quantization code: +# +# .. code:: python +# +# qconfig_dict = {"": qconfig} +# model_fp32.traceable_submodule = \ +# prepare_fx(model_fp32.traceable_submodule, qconfig_dict) +# +# Note if original model needs to be preserved, you will have to +# copy it yourself before calling the quantization APIs. +# + +##################################################### +# 1.b. Skip symbolically trace the non-traceable code +# --------------------------------------------------- +# When we have some non-traceable code in the module, and this part of code doesn’t need to be quantized, +# we can factor out this part of the code into a submodule and skip symbolically trace that submodule. +# +# +# before +# +# .. code:: python +# +# class M(nn.Module): +# +# def forward(self, x): +# x = self.traceable_code_1(x) +# x = non_traceable_code(x) +# x = self.traceable_code_2(x) +# return x +# +# +# after, non-traceable parts moved to a module and marked as a leaf +# +# .. code:: python +# +# class FP32NonTraceable(nn.Module): +# +# def forward(self, x): +# x = non_traceable_code(x) +# return x +# +# class M(nn.Module): +# +# def __init__(self): +# ... +# self.non_traceable_submodule = FP32NonTraceable(...) +# +# def forward(self, x): +# x = self.traceable_code_1(x) +# # we will configure the quantization call to not trace through +# # this submodule +# x = self.non_traceable_submodule(x) +# x = self.traceable_code_2(x) +# return x +# +# quantization code: +# +# .. 
code:: python +# +# qconfig_dict = {"": qconfig} +# +# prepare_custom_config_dict = { +# # option 1 +# "non_traceable_module_name": "non_traceable_submodule", +# # option 2 +# "non_traceable_module_class": [MNonTraceable], +# } +# model_prepared = prepare_fx( +# model_fp32, +# qconfig_dict, +# prepare_custom_config_dict=prepare_custom_config_dict, +# ) +# +# If the code that is not symbolically traceable needs to be quantized, we have the following two options: + +########################################################## +# 2.a Refactor your code to make it symbolically traceable +# -------------------------------------------------------- +# If it is easy to refactor the code and make the code symbolically traceable, +# we can refactor the code and remove the use of non-traceable constructs in python. +# +# More information about symbolic tracing support can be found in: (TODO: link) +# +# before: +# +# .. code:: python +# +# def transpose_for_scores(self, x): +# new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) +# x = x.view(*new_x_shape) +# return x.permute(0, 2, 1, 3) +# +# +# This is not symbolically traceable because in x.view(*new_x_shape) +# unpacking is not supported, however, it is easy to remove the unpacking +# since x.view also supports list input. +# +# +# after: +# +# .. code:: python +# +# def transpose_for_scores(self, x): +# new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) +# x = x.view(new_x_shape) +# return x.permute(0, 2, 1, 3) +# +# +# quantization code: +# +# This can be combined with other approaches and the quantization code +# depends on the model. +# +# + +####################################################### +# 2.b. Write your own observed and quantized submodule +# ----------------------------------------------------- +# +# If the non-traceable code can’t be refactored to be symbolically traceable, +# for example it has some loops that can’t be eliminated, like nn.LSTM, +# we’ll need to factor out the non-traceable code to a submodule (we call it CustomModule in fx graph mode quantization) and +# define the observed and quantized version of the submodule (in post training static quantization or quantization aware training for static quantization) +# or define the quantized version (in post training dynamic and weight only quantization) +# +# +# before: +# +# .. code:: python +# +# class M(nn.Module): +# +# def forward(self, x): +# x = traceable_code_1(x) +# x = non_traceable_code(x) +# x = traceable_code_1(x) +# return x +# +# after: +# +# 1. Factor out non_traceable_code to FP32NonTraceable +# non-traceable logic, wrapped in a module +# +# .. code:: python +# +# class FP32NonTraceable: +# ... +# +# +# 2. Define observed version of FP32NonTraceable +# +# .. code:: python +# +# class ObservedNonTraceable: +# +# @classmethod +# def from_float(cls, ...): +# ... +# +# 3. Define statically quantized version of FP32NonTraceable +# and a class method "from_observed" to convert from ObservedNonTraceable +# to StaticQuantNonTraceable +# +# .. code:: python +# +# class StaticQuantNonTraceable: +# +# @classmethod +# def from_observed(cls, ...): +# ... +# +# +# .. code:: python +# +# # refactor parent class to call FP32NonTraceable +# class M(nn.Module): +# +# def __init__(self): +# ... +# self.non_traceable_submodule = FP32NonTraceable(...) 
+# +# def forward(self, x): +# x = self.traceable_code_1(x) +# # this part will be quantized manually +# x = self.non_traceable_submodule(x) +# x = self.traceable_code_1(x) +# return x +# +# +# quantization code: +# +# +# .. code:: python +# +# # post training static quantization or +# # quantization aware training (that produces a statically quantized module)v +# prepare_custom_config_dict = { +# "float_to_observed_custom_module_class": { +# "static": { +# FP32NonTraceable: ObservedNonTraceable, +# } +# }, +# } +# +# model_prepared = prepare_fx( +# model_fp32, +# qconfig_dict, +# prepare_custom_config_dict=prepare_custom_config_dict) +# +# calibrate / train (not shown) +# +# .. code:: python +# +# convert_custom_config_dict = { +# "observed_to_quantized_custom_module_class": { +# "static": { +# ObservedNonTraceable: StaticQuantNonTraceable, +# } +# }, +# } +# model_quantized = convert_fx( +# model_prepared, +# convert_custom_config_dict) +# +# post training dynamic/weight only quantization +# in these two modes we don't need to observe the original model, so we +# only need to define thee quantized model +# +# .. code:: python +# +# class DynamicQuantNonTraceable: # or WeightOnlyQuantMNonTraceable +# ... +# @classmethod +# def from_observed(cls, ...): +# ... +# +# prepare_custom_config_dict = { +# "non_traceable_module_class": [ +# FP32NonTraceable +# ] +# } +# +# +# .. code:: python +# +# # The example is for post training quantization +# model_fp32.eval() +# model_prepared = prepare_fx( +# model_fp32, +# qconfig_dict, +# prepare_custom_config_dict=prepare_custom_config_dict) +# +# convert_custom_config_dict = { +# "observed_to_quantized_custom_module_class": { +# "dynamic": { +# FP32NonTraceable: DynamicQuantNonTraceable, +# } +# }, +# } +# model_quantized = convert_fx( +# model_prepared, +# convert_custom_config_dict) +# +# You can also find examples for custom modules in test ``test_custom_module_class`` in ``torch/test/quantization/test_quantize_fx.py``. diff --git a/prototype_source/graph_mode_static_quantization_tutorial.py b/prototype_source/graph_mode_static_quantization_tutorial.py deleted file mode 100644 index af649a15f..000000000 --- a/prototype_source/graph_mode_static_quantization_tutorial.py +++ /dev/null @@ -1,459 +0,0 @@ -""" -(prototype) Graph Mode Post Training Static Quantization in PyTorch -========================================================= - -**Author**: `Jerry Zhang `_ - -This tutorial introduces the steps to do post training static quantization in graph mode. -The advantage of graph mode quantization is that as long as the model can be scripted or traced, -we can perform quantization fully automatically on the model. -Right now we can do post training static and post training dynamic quantization -and quantization aware training support will come later. -We have a separate tutorial for `Graph Mode Post Training Dynamic Quantization `_. - -tldr; The graph mode API looks like the following: - -.. 
code:: python - - import torch - from torch.quantization import get_default_qconfig, quantize_jit - - ts_model = torch.jit.script(float_model.eval()) # or torch.jit.trace(float_model, input) - qconfig = get_default_qconfig('fbgemm') - def calibrate(model, data_loader): - model.eval() - with torch.no_grad(): - for image, target in data_loader: - model(image) - quantized_model = quantize_jit( - ts_model, # TorchScript model - {'': qconfig}, # qconfig dict - calibrate, # calibration function - [data_loader_test]) # positional arguments to calibration function, typically some sample dataset - -""" -###################################################################### -# 1. Motivation of Graph Mode Quantization -# --------------------- -# Currently PyTorch only has eager mode quantization: `Static Quantization with Eager Mode in PyTorch `_. -# -# We can see there are multiple manual steps involved in the process, including: -# -# - Explicitly quantize and dequantize activations, this is time consuming when floating point and quantized operations are mixed in a model. -# - Explicitly fuse modules, this requires manually identifying the sequence of convolutions, batch norms and relus and other fusion patterns. -# - Special handling is needed for pytorch tensor operations (like add, concat etc.) -# - Functionals did not have first class support (functional.conv2d and functional.linear would not get quantized) -# -# Most of these required modifications comes from the underlying limitations of eager mode quantization. Eager mode works in module level since it can not inspect the code that is actually run (in the forward function), quantization is achieved by module swapping, and we don’t know how the modules are used in forward function in eager mode, so it requires users to insert QuantStub and DeQuantStub manually to mark the points they want to quantize or dequantize. -# In graph mode, we can inspect the actual code that’s been executed in forward function (e.g. aten function calls) and quantization is achieved by module and graph manipulations. Since graph mode has full visibility of the code that is run, our tool is able to automatically figure out things like which modules to fuse and where to insert observer calls, quantize/dequantize functions etc., we are able to automate the whole quantization process. -# -# Advantages of graph mode quantization are: -# -# - Simple quantization flow, minimal manual steps -# - Unlocks the possibility of doing higher level optimizations like automatic precision selection -# -# Limitations of graph mode quantization is that quantization is configurable only at the level of module and the set of operators that are quantized is not configurable by user currently. -# -# 2. Define Helper Functions and Prepare Dataset -# --------------------- -# We’ll start by doing the necessary imports, defining some helper functions and prepare the data. -# These steps are identitcal to `Static Quantization with Eager Mode in PyTorch `_. -# -# Download dataset: -# -# .. code:: -# -# wget https://s3.amazonaws.com/pytorch-tutorial-assets/imagenet_1k.zip -# -# and unzip to `data` folder. -# Download the `torchvision resnet18 model `_ and rename it to -# ``data/resnet18_pretrained_float.pth``. 
- - -import numpy as np -import torch -import torch.nn as nn -import torchvision -from torch.utils.data import DataLoader -from torchvision import datasets -import torchvision.transforms as transforms -import os -import time -import sys -import torch.quantization - -# # Setup warnings -import warnings -warnings.filterwarnings( - action='ignore', - category=DeprecationWarning, - module=r'.*' -) -warnings.filterwarnings( - action='default', - module=r'torch.quantization' -) - -# Specify random seed for repeatable results -_ = torch.manual_seed(191009) - - -from torchvision.models.resnet import resnet18 -from torch.quantization import get_default_qconfig, quantize_jit - -class AverageMeter(object): - """Computes and stores the average and current value""" - def __init__(self, name, fmt=':f'): - self.name = name - self.fmt = fmt - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - def __str__(self): - fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' - return fmtstr.format(**self.__dict__) - - -def accuracy(output, target, topk=(1,)): - """Computes the accuracy over the k top predictions for the specified values of k""" - with torch.no_grad(): - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) - res.append(correct_k.mul_(100.0 / batch_size)) - return res - - -def evaluate(model, criterion, data_loader): - model.eval() - top1 = AverageMeter('Acc@1', ':6.2f') - top5 = AverageMeter('Acc@5', ':6.2f') - cnt = 0 - with torch.no_grad(): - for image, target in data_loader: - output = model(image) - loss = criterion(output, target) - cnt += 1 - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - top1.update(acc1[0], image.size(0)) - top5.update(acc5[0], image.size(0)) - print('') - - return top1, top5 - -def load_model(model_file): - model = resnet18(pretrained=False) - state_dict = torch.load(model_file) - model.load_state_dict(state_dict) - model.to('cpu') - return model - -def print_size_of_model(model): - if isinstance(model, torch.jit.RecursiveScriptModule): - torch.jit.save(model, "temp.p") - else: - torch.jit.save(torch.jit.script(model), "temp.p") - print('Size (MB):', os.path.getsize("temp.p")/1e6) - os.remove('temp.p') - -def prepare_data_loaders(data_path): - - traindir = os.path.join(data_path, 'train') - valdir = os.path.join(data_path, 'val') - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - - dataset = torchvision.datasets.ImageFolder( - traindir, - transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - - dataset_test = torchvision.datasets.ImageFolder( - valdir, - transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - normalize, - ])) - - train_sampler = torch.utils.data.RandomSampler(dataset) - test_sampler = torch.utils.data.SequentialSampler(dataset_test) - - data_loader = torch.utils.data.DataLoader( - dataset, batch_size=train_batch_size, - sampler=train_sampler) - - data_loader_test = torch.utils.data.DataLoader( - dataset_test, batch_size=eval_batch_size, - sampler=test_sampler) - - return data_loader, 
data_loader_test - -data_path = 'data/imagenet_1k' -saved_model_dir = 'data/' -float_model_file = 'resnet18_pretrained_float.pth' - -train_batch_size = 30 -eval_batch_size = 30 - -data_loader, data_loader_test = prepare_data_loaders(data_path) -criterion = nn.CrossEntropyLoss() -float_model = load_model(saved_model_dir + float_model_file).to('cpu') -float_model.eval(); - - -###################################################################### -# 3. Script/Trace the model -# -------------------------- -# The input for graph mode quantization is a TorchScript model, so we'll need to either script or trace the model first. -# - -ts_model = torch.jit.script(float_model).eval() # ts_model = torch.jit.trace(float_model, input) - -###################################################################### -# 4. Specify how to quantize the model with ``qconfig_dict`` -# ------------------------- -# -# .. code:: python -# -# qconfig_dict = {'' : default_qconfig} -# -# We use the same ``qconfig`` used in eager mode quantization, ``qconfig`` is just a named tuple of the observers for ``activation`` and ``weight``. `qconfig_dict` is a dictionary with names of sub modules as key and qconfig for that module as value, empty key means the qconfig will be applied to whole model unless it’s overwritten by more specific configurations, the qconfig for each module is either found in the dictionary or fallback to the qconfig of parent module. -# -# Right now ``qconfig_dict`` is the only way to configure how the model is quantized, and it is done in the granularity of module, that is, we only support one type of ``qconfig`` for each ``torch.nn.Module``, for example, if we have: -# -# .. code:: python -# -# qconfig = { -# '' : qconfig_global, -# 'sub' : qconfig_sub, -# 'sub.fc' : qconfig_fc, -# 'sub.conv': None -# } -# -# Module ``sub.fc`` will be configured with ``qconfig_fc``, and all other child modules in ``sub`` will be configured with ``qconfig_sub`` and ``sub.conv`` will not be quantized. All other modules in the model will be quantized with ``qconfig_global`` -# Utility functions related to ``qconfig`` can be found in https://github.com/pytorch/pytorch/blob/master/torch/quantization/qconfig.py. - -qconfig = get_default_qconfig('fbgemm') -qconfig_dict = {'': qconfig} - - -###################################################################### -# 5. Define Calibration Function -# ------------------------- -# -# .. code:: python -# -# def calibrate(model, sample_data, ...): -# model(sample_data, ...) -# -# -# Calibration function is run after the observers are inserted in the model. -# The purpose for calibration is to run through some sample examples that is representative of the workload -# (for example a sample of the training data set) so that the observers in the model are able to observe -# the statistics of the Tensors and we can later use this information to calculate quantization parameters. -# - -def calibrate(model, data_loader): - model.eval() - with torch.no_grad(): - for image, target in data_loader: - model(image) - - -###################################################################### -# 6. Quantize -# --------------------- -# -# .. 
code:: python -# -# quantized_model = quantize_jit( -# ts_model, # TorchScript model -# {'': qconfig}, # qconfig dict -# calibrate, # calibration function -# [data_loader_test], # positional arguments to calibration function, typically some sample dataset -# inplace=False, # whether to modify the model inplace or not -# debug=True) # whether to prduce a debug friendly model or not -# -# There are three things we do in ``quantize_jit``: -# -# 1. ``prepare_jit`` folds BatchNorm modules into previous Conv2d modules, and insert observers in appropriate places in the Torchscript model. -# 2. Run calibrate function on the provided sample dataset. -# 3. ``convert_jit`` takes a calibrated model and produces a quantized model. -# -# If ``debug`` is False (default option), ``convert_jit`` will: -# -# - Calculate quantization parameters using the observers in the model -# - Ifnsert quantization ops like ``aten::quantize_per_tensor`` and ``aten::dequantize`` to the model, and remove the observer modules after that. -# - Replace floating point ops with quantized ops -# - Freeze the model (remove constant attributes and make them as Constant node in the graph). -# - Fold the quantize and prepack ops like ``quantized::conv2d_prepack`` into an attribute, so we don't need to quantize and prepack the weight everytime we run the model. -# -# If ``debug`` is set to ``True``: -# -# - We can still access the attributes of the quantized model the same way as the original floating point model, e.g. ``model.conv1.weight`` (might be harder if you use a module list or sequential) -# - The arithmetic operations all occur in floating point with the numerics being identical to the final quantized model, allowing for debugging. - -quantized_model = quantize_jit( - ts_model, - {'': qconfig}, - calibrate, - [data_loader_test]) - -print(quantized_model.graph) - -###################################################################### -# As we can see ``aten::conv2d`` is changed to ``quantized::conv2d`` and the floating point weight has been quantized -# and packed into an attribute (``quantized._jit_pass_packed_weight_30``), so we don't need to quantize/pack in runtime. -# Also we can't access the weight attributes anymore after the debug option since they are frozen. -# -# 7. Evaluation -# -------------- -# We can now print the size and accuracy of the quantized model. - -print('Size of model before quantization') -print_size_of_model(ts_model) -print('Size of model after quantization') -print_size_of_model(quantized_model) -top1, top5 = evaluate(quantized_model, criterion, data_loader_test) -print('[before serilaization] Evaluation accuracy on test dataset: %2.2f, %2.2f'%(top1.avg, top5.avg)) - -graph_mode_model_file = 'resnet18_graph_mode_quantized.pth' -torch.jit.save(quantized_model, saved_model_dir + graph_mode_model_file) -quantized_model = torch.jit.load(saved_model_dir + graph_mode_model_file) -top1, top5 = evaluate(quantized_model, criterion, data_loader_test) -print('[after serialization/deserialization] Evaluation accuracy on test dataset: %2.2f, %2.2f'%(top1.avg, top5.avg)) - -###################################################################### -# If you want to get better accuracy or performance, try changing the `qconfig_dict`. -# We plan to add support for graph mode in the Numerical Suite so that you can -# easily determine the sensitivity towards quantization of different modules in a model: `PyTorch Numeric Suite Tutorial `_ -# -# 8. 
Debugging Quantized Model -# --------------------------- -# We can also use debug option: - -quantized_debug_model = quantize_jit( - ts_model, - {'': qconfig}, - calibrate, - [data_loader_test], - debug=True) - -top1, top5 = evaluate(quantized_debug_model, criterion, data_loader_test) -print('[debug=True] quantized model Evaluation accuracy on test dataset: %2.2f, %2.2f'%(top1.avg, top5.avg)) - -###################################################################### -# Note that the accuracy of the debug version is close to, but not exactly the same as the non-debug -# version as the debug version uses floating point ops to emulate quantized ops and the numerics match -# is approximate. We are working on making this even more exact. -# - -print(quantized_debug_model.graph) - -###################################################################### -# We can see that there is no ``quantized::conv2d`` in the model, but the numerically equivalent pattern -# of ``aten::dequnatize - aten::conv2d - aten::quantize_per_tensor``. - -print_size_of_model(quantized_debug_model) - -###################################################################### -# Size of the debug model is the close to the floating point model because all the weights are -# in float and not yet quantized and frozen, this allows people to inspect the weight. -# You may access the weight attributes directly in the torchscript model, except for batch norm as -# it is fused into the preceding convolutions. We will also develop graph mode ``Numeric Suite`` -# to allow easier inspection of weights in the future. Accessing the weight in the debug model is -# the same as accessing the weight in a TorchScript model: - -def get_first_conv_weight(model): - return model.conv1.weight -w1 = get_first_conv_weight(ts_model) -w2 = get_first_conv_weight(quantized_debug_model) -print('first conv weight for input model:', str(w1)[:200]) -print('first conv weight for quantized model:', str(w2)[:200]) - -###################################################################### -# The weights are different because we fold the weights of BatchNorm to the previous conv before we quantize the model. -# More instructions on how to debug TorchScript model can be found `here `_. -# -# -# As we can see, this is not as straightforward as eager mode, that's why we also plan to support graph mode ``Numeric Suite``, -# and it will probably be the primary tool people use to debug numerical issues. -# -# 9. Comparison with Baseline Float Model and Eager Mode Quantization -# --------------------------- - -scripted_float_model_file = 'resnet18_scripted.pth' - -print('Size of baseline model') -print_size_of_model(float_model) - -top1, top5 = evaluate(float_model, criterion, data_loader_test) -print('Baseline Float Model Evaluation accuracy: %2.2f, %2.2f'%(top1.avg, top5.avg)) -torch.jit.save(torch.jit.script(float_model), saved_model_dir + scripted_float_model_file) - -###################################################################### -# In this section we compare the model quantized with graph mode quantization with the model -# quantized in eager mode. Graph mode and eager mode produce very similar quantized models, -# so the expectation is that the accuracy and speedup are similar as well. 
- -print('Size of graph mode quantized model') -print_size_of_model(quantized_model) -top1, top5 = evaluate(quantized_model, criterion, data_loader_test) -print('graph mode quantized model Evaluation accuracy on test dataset: %2.2f, %2.2f'%(top1.avg, top5.avg)) - -from torchvision.models.quantization.resnet import resnet18 -eager_quantized_model = resnet18(pretrained=True, quantize=True).eval() -print('Size of eager mode quantized model') -eager_quantized_model = torch.jit.script(eager_quantized_model) -print_size_of_model(eager_quantized_model) -top1, top5 = evaluate(eager_quantized_model, criterion, data_loader_test) -print('eager mode quantized model Evaluation accuracy on test dataset: %2.2f, %2.2f'%(top1.avg, top5.avg)) -eager_mode_model_file = 'resnet18_eager_mode_quantized.pth' -torch.jit.save(eager_quantized_model, saved_model_dir + eager_mode_model_file) - -###################################################################### -# We can see that the model size and accuracy of graph mode and eager mode quantized model are pretty similar. -# -# Running the model in AIBench (with single threading) gives the following result: -# -# .. code:: -# -# Scripted Float Model: -# Self CPU time total: 418.472ms -# -# Scripted Eager Mode Quantized Model: -# Self CPU time total: 177.768ms -# -# Graph Mode Quantized Model: -# Self CPU time total: 157.256ms -# -# As we can see for resnet18 both graph mode and eager mode quantized model get similar speed up over the floating point model, -# which is around 2-3x faster than the floating point model. But the actual speedup over floating point model may vary -# depending on model, device, build, input batch sizes, threading etc. -# - - diff --git a/prototype_source/ios_gpu_workflow.rst b/prototype_source/ios_gpu_workflow.rst new file mode 100644 index 000000000..3f12c3d19 --- /dev/null +++ b/prototype_source/ios_gpu_workflow.rst @@ -0,0 +1,110 @@ +(Prototype) Use iOS GPU in PyTorch +================================== + +**Author**: `Tao Xu `_ + +Introduction +------------ + +This tutorial introduces the steps to run your models on iOS GPU. We'll be using the mobilenetv2 model as an example. Since the mobile GPU features are currently in the prototype stage, you'll need to build a custom pytorch binary from source. For the time being, only a limited number of operators are supported, and certain client side APIs are subject to change in the future versions. + +Model Preparation +------------------- + +Since GPUs consume weights in a different order, the first step we need to do is to convert our TorchScript model to a GPU compatible model. This step is also known as "prepacking". To do that, we'll build a custom pytorch binary from source that includes the Metal backend. Go ahead checkout the pytorch source code from github and run the command below + +.. code:: shell + + cd PYTORCH_ROOT + USE_PYTORCH_METAL=ON python setup.py install --cmake + +The command above will build a custom pytorch binary from master. The ``install`` argument simply tells ``setup.py`` to override the existing PyTorch on your desktop. Once the build finished, open another terminal to check the PyTorch version to see if the installation was successful. As the time of writing of this recipe, the version is ``1.8.0a0+41237a4``. You might be seeing different numbers depending on when you check out the code from master, but it should be greater than 1.7.0. + +.. 
code:: python + + import torch + torch.__version__ #1.8.0a0+41237a4 + + +The next step is going to be converting the mobilenetv2 torchscript model to a Metal compatible model. We'll be leveraging the ``optimize_for_mobile`` API from the ``torch.utils`` module. As shown below + +.. code:: python + + import torch + import torchvision + from torch.utils.mobile_optimizer import optimize_for_mobile + + model = torchvision.models.mobilenet_v2(pretrained=True) + scripted_model = torch.jit.script(model) + optimized_model = optimize_for_mobile(scripted_model, backend='metal') + print(torch.jit.export_opnames(optimized_model)) + torch.jit.save(optimized_model, './mobilenetv2_metal.pt') + +Note that the ``torch.jit.export_opnames(optimized_model)`` is going to dump all the optimized operators from the ``optimized_mobile``. If everything works well, you should be able to see the following ops being printed out from the console + + +.. code:: shell + + ['aten::adaptive_avg_pool2d', + 'aten::add.Tensor', + 'aten::addmm', + 'aten::reshape', + 'aten::size.int', + 'metal::copy_to_host', + 'metal_prepack::conv2d_run'] + +Those are all the ops we need to run the mobilenetv2 model on iOS GPU. Cool! Now that you have the ``mobilenetv2_metal.pt`` saved on your disk, let's move on to the iOS part. + + +Use C++ APIs +--------------------- + +In this section, we'll be using the `HelloWorld example `_ to demonstrate how to use the C++ APIs. The first thing we need to do is to build a custom LibTorch from Source. Make sure you have deleted the **build** folder from the previous step in PyTorch root directory. Then run the command below + +.. code:: shell + + IOS_ARCH=arm64 USE_PYTORCH_METAL=1 ./scripts/build_ios.sh + +Note ``IOS_ARCH`` tells the script to build a arm64 version of Libtorch. This is because in PyTorch, Metal is only available for the iOS devices that support the Apple A9 chip or above. Once the build finished, follow the `Build PyTorch iOS libraries from source `_ section from the iOS tutorial to setup the XCode settings properly. Don't forget to copy the `./mobilenetv2_metal.pt` to your XCode project. + +Next we need to make some changes in ``TorchModule.mm`` + +.. code:: objective-c + + //#import + #import + + - (NSArray*)predictImage:(void*)imageBuffer { + torch::jit::GraphOptimizerEnabledGuard opguard(false); + at::Tensor tensor = torch::from_blob(imageBuffer, {1, 3, 224, 224}, at::kFloat).metal(); + auto outputTensor = _impl.forward({tensor}).toTensor().cpu(); + ... + } + +As you can see, we simply just call ``.metal()`` to move our input tensor from CPU to GPU, and then call ``.cpu()`` to move the result back. Internally, ``.metal()`` will copy the input data from the CPU buffer to a GPU buffer with a GPU compatible memory format. When `.cpu()` is invoked, the GPU command buffer will be flushed and synced. After `forward` finished, the final result will then be copied back from the GPU buffer back to a CPU buffer. + +The last step we have to do is to add the `Accelerate.framework` and the `MetalShaderPerformance.framework` to your xcode project. + +If everything works fine, you should be able to see the inference results on your phone. The result below was captured from an iPhone11 device + +.. code:: shell + + - timber wolf, grey wolf, gray wolf, Canis lupus + - malamute, malemute, Alaskan malamute + - Eskimo dog, husky + +You may notice that the results are slighly different from the `results `_ we got from the CPU model as shown in the iOS tutorial. 
This is because by default Metal uses fp16 rather than fp32 to compute. The precision loss is expected. + + +Conclusion +---------- + +In this tutorial, we demonstrated how to convert a mobilenetv2 model to a GPU compatible model. We walked through a HelloWorld example to show how to use the C++ APIs to run models on iOS GPU. Please be aware of that GPU feature is still under development, new operators will continue to be added. APIs are subject to change in the future versions. + +Thanks for reading! As always, we welcome any feedback, so please create an issue `here `_ if you have any. + +Learn More +---------- + +- The `Mobilenetv2 `_ from Torchvision +- To learn more about how to use ``optimize_for_mobile``, please refer to the `Mobile Perf Recipe `_ diff --git a/prototype_source/lite_interpreter.rst b/prototype_source/lite_interpreter.rst new file mode 100644 index 000000000..bb3efe44f --- /dev/null +++ b/prototype_source/lite_interpreter.rst @@ -0,0 +1,221 @@ +(Prototype) Introduce lite interpreter workflow in Android and iOS +================================================================== + +**Author**: `Chen Lai `_, `Martin Yuan `_ + +Introduction +------------ + +This tutorial introduces the steps to use lite interpreter on iOS and Android. We'll be using the ImageSegmentation demo app as an example. Since lite interpreter is currently in the prototype stage, a custom pytorch binary from source is required. + + +Android +------------------- +Get ImageSegmentation demo app in Android: https://github.com/pytorch/android-demo-app/tree/master/ImageSegmentation + +1. **Prepare model**: Prepare the lite interpreter version of model by run the script below to generate the scripted model `deeplabv3_scripted.pt` and `deeplabv3_scripted.ptl` + +.. code:: python + + import torch + + model = torch.hub.load('pytorch/vision:v0.7.0', 'deeplabv3_resnet50', pretrained=True) + model.eval() + + scripted_module = torch.jit.script(model) + # Export full jit version model (not compatible lite interpreter), leave it here for comparison + scripted_module.save("deeplabv3_scripted.pt") + # Export lite interpreter version model (compatible with lite interpreter) + scripted_module._save_for_lite_interpreter("deeplabv3_scripted.ptl") + +2. **Build libtorch lite for android**: Build libtorch for android for all 4 android abis (``armeabi-v7a``, ``arm64-v8a``, ``x86``, ``x86_64``) ``BUILD_LITE_INTERPRETER=1 ./scripts/build_pytorch_android.sh``. For example, if it will be tested on Pixel 4 emulator with ``x86``, use cmd ``BUILD_LITE_INTERPRETER=1 ./scripts/build_pytorch_android.sh x86`` to specify abi to save built time. After the build finish, it will show the library path: + + +.. code-block:: bash + + BUILD SUCCESSFUL in 55s + 134 actionable tasks: 22 executed, 112 up-to-date + + find /Users/chenlai/pytorch/android -type f -name '*aar' + + xargs ls -lah + -rw-r--r-- 1 chenlai staff 13M Feb 11 11:48 /Users/chenlai/pytorch/android/pytorch_android/build/outputs/aar/pytorch_android-release.aar + -rw-r--r-- 1 chenlai staff 36K Feb 9 16:45 /Users/chenlai/pytorch/android/pytorch_android_torchvision/build/outputs/aar/pytorch_android_torchvision-release.aar + +3. **Use the PyTorch Android libraries built from source in the ImageSegmentation app**: Create a folder `libs` in the path, the path from repository root will be `ImageSegmentation/app/libs`. Copy `pytorch_android-release` to the path ``ImageSegmentation/app/libs/pytorch_android-release.aar``. 
Copy `pytorch_android_torchvision` (downloaded from `Pytorch Android Torchvision Nightly `_) to the path ``ImageSegmentation/app/libs/pytorch_android_torchvision.aar``. Update the `dependencies` part of ``ImageSegmentation/app/build.gradle`` to + +.. code:: gradle + + dependencies { + implementation 'androidx.appcompat:appcompat:1.2.0' + implementation 'androidx.constraintlayout:constraintlayout:2.0.2' + testImplementation 'junit:junit:4.12' + androidTestImplementation 'androidx.test.ext:junit:1.1.2' + androidTestImplementation 'androidx.test.espresso:espresso-core:3.3.0' + + + implementation(name:'pytorch_android-release', ext:'aar') + implementation(name:'pytorch_android_torchvision', ext:'aar') + + implementation 'com.android.support:appcompat-v7:28.0.0' + implementation 'com.facebook.fbjni:fbjni-java-only:0.0.3' + } + +Update `all projects` part in ``ImageSegmentation/build.gradle`` to + + +.. code:: gradle + + allprojects { + repositories { + google() + jcenter() + flatDir { + dirs 'libs' + } + } + } + +4. **Update model loader api**: Update ``ImageSegmentation/app/src/main/java/org/pytorch/imagesegmentation/MainActivity.java`` by + + 4.1 Add new import: `import org.pytorch.LiteModuleLoader` + + 4.2 Replace the way to load pytorch lite model + +.. code:: java + + // mModule = Module.load(MainActivity.assetFilePath(getApplicationContext(), "deeplabv3_scripted.pt")); + mModule = LiteModuleLoader.load(MainActivity.assetFilePath(getApplicationContext(), "deeplabv3_scripted.ptl")); + +5. **Test app**: Build and run the `ImageSegmentation` app in Android Studio + +iOS +------------------- +Get ImageSegmentation demo app in iOS: https://github.com/pytorch/ios-demo-app/tree/master/ImageSegmentation + +1. **Prepare model**: Same as Android. + +2. **Build libtorch lite for iOS**: + +.. code-block:: bash + + BUILD_PYTORCH_MOBILE=1 IOS_PLATFORM=SIMULATOR BUILD_LITE_INTERPRETER=1 ./scripts/build_ios.sh + + +3. **Remove Cocoapods from the project** (this step is only needed if you ran `pod install`): + +.. code-block:: bash + + pod deintegrate + +4. **Link ImageSegmentation demo app with the custom built library**: +Open your project in XCode, go to your project Target’s **Build Phases - Link Binaries With Libraries**, click the **+** sign and add all the library files located in `build_ios/install/lib`. Navigate to the project **Build Settings**, set the value **Header Search Paths** to `build_ios/install/include` and **Library Search Paths** to `build_ios/install/lib`. +In the build settings, search for **other linker flags**. Add a custom linker flag below +``` +-all_load +``` +Finally, disable bitcode for your target by selecting the Build Settings, searching for Enable Bitcode, and set the value to **No**. + +5. **Update library and api** + + 5.1 Update ``TorchModule.mm``: To use the custom built libraries the project, replace `#import ` (in ``TorchModule.mm``) which is needed when using LibTorch via Cocoapods with the code below: + +.. code-block:: swift + + //#import + #include "ATen/ATen.h" + #include "caffe2/core/timer.h" + #include "caffe2/utils/string_utils.h" + #include "torch/csrc/autograd/grad_mode.h" + #include "torch/script.h" + #include + #include + #include + #include + #include + +.. 
code-block:: swift + + @implementation TorchModule { + @protected + // torch::jit::script::Module _impl; + torch::jit::mobile::Module _impl; + } + + - (nullable instancetype)initWithFileAtPath:(NSString*)filePath { + self = [super init]; + if (self) { + try { + _impl = torch::jit::_load_for_mobile(filePath.UTF8String); + // _impl = torch::jit::load(filePath.UTF8String); + // _impl.eval(); + } catch (const std::exception& exception) { + NSLog(@"%s", exception.what()); + return nil; + } + } + return self; + } + + +5.2 Update ``ViewController.swift`` + +.. code-block:: swift + + // if let filePath = Bundle.main.path(forResource: + // "deeplabv3_scripted", ofType: "pt"), + // let module = TorchModule(fileAtPath: filePath) { + // return module + // } else { + // fatalError("Can't find the model file!") + // } + if let filePath = Bundle.main.path(forResource: + "deeplabv3_scripted", ofType: "ptl"), + let module = TorchModule(fileAtPath: filePath) { + return module + } else { + fatalError("Can't find the model file!") + } + +6. Build and test the app in Xcode. + +How to use lite interpreter + custom build +------------------------------------------ +1. To dump the operators in your model, say `deeplabv3_scripted`, run the following lines of Python code: + +.. code-block:: python + + # Dump list of operators used by deeplabv3_scripted: + import torch, yaml + model = torch.jit.load('deeplabv3_scripted.ptl') + ops = torch.jit.export_opnames(model) + with open('deeplabv3_scripted.yaml', 'w') as output: + yaml.dump(ops, output) + +In the snippet above, you first need to load the ScriptModule. Then, use export_opnames to return a list of operator names of the ScriptModule and its submodules. Lastly, save the result in a yaml file. The yaml file can be generated for any PyTorch 1.4.0 or above version. You can do that by checking the value of `torch.__version__`. + +2. To run the build script locally with the prepared yaml list of operators, pass in the yaml file generate from the last step into the environment variable SELECTED_OP_LIST. Also in the arguments, specify BUILD_PYTORCH_MOBILE=1 as well as the platform/architechture type. + +**iOS**: Take the simulator build for example, the command should be: + +.. code-block:: bash + + SELECTED_OP_LIST=deeplabv3_scripted.yaml BUILD_PYTORCH_MOBILE=1 IOS_PLATFORM=SIMULATOR BUILD_LITE_INTERPRETER=1 ./scripts/build_ios.sh + +**Android**: Take the x86 build for example, the command should be: + +.. code-block:: bash + + SELECTED_OP_LIST=deeplabv3_scripted.yaml BUILD_LITE_INTERPRETER=1 ./scripts/build_pytorch_android.sh x86 + + +Conclusion +---------- + +In this tutorial, we demonstrated how to use lite interpreter in Android and iOS app. We walked through an Image Segmentation example to show how to dump the model, build torch library from source and use the new api to run model. Please be aware of that lite interpreter is still under development, more library size reduction will be introduced in the future. APIs are subject to change in the future versions. + +Thanks for reading! As always, we welcome any feedback, so please create an issue `here `_ if you have any. 
+ +Learn More +---------- + +- To learn more about PyTorch Mobile, please refer to `PyTorch Mobile Home Page `_ +- To learn more about Image Segmentation, please refer to the `Image Segmentation DeepLabV3 on Android Recipe `_ diff --git a/prototype_source/nnapi_mobilenetv2.rst b/prototype_source/nnapi_mobilenetv2.rst new file mode 100644 index 000000000..1e376bdc2 --- /dev/null +++ b/prototype_source/nnapi_mobilenetv2.rst @@ -0,0 +1,185 @@ +(Prototype) Convert MobileNetV2 to NNAPI +======================================== + +Introduction +------------ + +This tutorial shows how to prepare a computer vision model to use +`Android's Neural Networks API (NNAPI) `_. +NNAPI provides access to powerful and efficient computational cores +on many modern Android devices. + +PyTorch's NNAPI is currently in the "prototype" phase and only supports +a limited range of operators, but we expect to solidify the integration +and expand our operator support over time. + + +Environment +----------- + +Install PyTorch and torchvision. +This tutorial is currently incompatible with the latest trunk, +so we recommend running +``pip install --upgrade --pre --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html torch==1.8.0.dev20201106+cpu torchvision==0.9.0.dev20201107+cpu`` +until this incompatibility is corrected. + + +Model Preparation +----------------- + +First, we must prepare our model to execute with NNAPI. +This step runs on your training server or laptop. +The key conversion function to call is +``torch.backends._nnapi.prepare.convert_model_to_nnapi``, +but some extra steps are required to ensure that +the model is properly structured. +Most notably, quantizing the model is required +in order to run the model on certain accelerators. + +You can copy/paste this entire Python script and run it, +or make your own modifications. +By default, it will save the models to ``~/mobilenetv2-nnapi/``. +Please create that directory first. + +.. code:: python + + #!/usr/bin/env python + import sys + import os + import torch + import torch.utils.bundled_inputs + import torch.utils.mobile_optimizer + import torch.backends._nnapi.prepare + import torchvision.models.quantization.mobilenet + from pathlib import Path + + + # This script supports 3 modes of quantization: + # - "none": Fully floating-point model. + # - "core": Quantize the core of the model, but wrap it a + # quantizer/dequantizer pair, so the interface uses floating point. + # - "full": Quantize the model, and use quantized tensors + # for input and output. + # + # "none" maintains maximum accuracy + # "core" sacrifices some accuracy for performance, + # but maintains the same interface. + # "full" maximized performance (with the same accuracy as "core"), + # but requires the application to use quantized tensors. + # + # There is a fourth option, not supported by this script, + # where we include the quant/dequant steps as NNAPI operators. + def make_mobilenetv2_nnapi(output_dir_path, quantize_mode): + quantize_core, quantize_iface = { + "none": (False, False), + "core": (True, False), + "full": (True, True), + }[quantize_mode] + + model = torchvision.models.quantization.mobilenet.mobilenet_v2(pretrained=True, quantize=quantize_core) + model.eval() + + # Fuse BatchNorm operators in the floating point model. + # (Quantized models already have this done.) + # Remove dropout for this inference-only use case. 
+ if not quantize_core: + model.fuse_model() + assert type(model.classifier[0]) == torch.nn.Dropout + model.classifier[0] = torch.nn.Identity() + + input_float = torch.zeros(1, 3, 224, 224) + input_tensor = input_float + + # If we're doing a quantized model, we need to trace only the quantized core. + # So capture the quantizer and dequantizer, use them to prepare the input, + # and replace them with identity modules so we can trace without them. + if quantize_core: + quantizer = model.quant + dequantizer = model.dequant + model.quant = torch.nn.Identity() + model.dequant = torch.nn.Identity() + input_tensor = quantizer(input_float) + + # Many NNAPI backends prefer NHWC tensors, so convert our input to channels_last, + # and set the "nnapi_nhwc" attribute for the converter. + input_tensor = input_tensor.contiguous(memory_format=torch.channels_last) + input_tensor.nnapi_nhwc = True + + # Trace the model. NNAPI conversion only works with TorchScript models, + # and traced models are more likely to convert successfully than scripted. + with torch.no_grad(): + traced = torch.jit.trace(model, input_tensor) + nnapi_model = torch.backends._nnapi.prepare.convert_model_to_nnapi(traced, input_tensor) + + # If we're not using a quantized interface, wrap a quant/dequant around the core. + if quantize_core and not quantize_iface: + nnapi_model = torch.nn.Sequential(quantizer, nnapi_model, dequantizer) + model.quant = quantizer + model.dequant = dequantizer + # Switch back to float input for benchmarking. + input_tensor = input_float.contiguous(memory_format=torch.channels_last) + + # Optimize the CPU model to make CPU-vs-NNAPI benchmarks fair. + model = torch.utils.mobile_optimizer.optimize_for_mobile(torch.jit.script(model)) + + # Bundle sample inputs with the models for easier benchmarking. + # This step is optional. + class BundleWrapper(torch.nn.Module): + def __init__(self, mod): + super().__init__() + self.mod = mod + def forward(self, arg): + return self.mod(arg) + nnapi_model = torch.jit.script(BundleWrapper(nnapi_model)) + torch.utils.bundled_inputs.augment_model_with_bundled_inputs( + model, [(torch.utils.bundled_inputs.bundle_large_tensor(input_tensor),)]) + torch.utils.bundled_inputs.augment_model_with_bundled_inputs( + nnapi_model, [(torch.utils.bundled_inputs.bundle_large_tensor(input_tensor),)]) + + # Save both models. + model.save(output_dir_path / ("mobilenetv2-quant_{}-cpu.pt".format(quantize_mode))) + nnapi_model.save(output_dir_path / ("mobilenetv2-quant_{}-nnapi.pt".format(quantize_mode))) + + + if __name__ == "__main__": + for quantize_mode in ["none", "core", "full"]: + make_mobilenetv2_nnapi(Path(os.environ["HOME"]) / "mobilenetv2-nnapi", quantize_mode) + + +Running Benchmarks +------------------ + +Now that the models are ready, we can benchmark them on our Android devices. +See `our performance recipe `_ for details. +The best-performing models are likely to be the "fully-quantized" models: +``mobilenetv2-quant_full-cpu.pt`` and ``mobilenetv2-quant_full-nnapi.pt``. + +Because these models have bundled inputs, we can run the benchmark as follows: + +.. code:: shell + + ./speed_benchmark_torch --pthreadpool_size=1 --model=mobilenetv2-quant_full-nnapi.pt --use_bundled_input=0 --warmup=5 --iter=200 + +Adjusting increasing the thread pool size can can reduce latency, +at the cost of increased CPU usage. +Omitting that argument will use one thread per big core. +The CPU models can get improved performance (at the cost of memory usage) +by passing ``--use_caching_allocator=true``. 
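+
+For example, a CPU-only run of the fully-quantized model with the caching allocator enabled might look like the following (this reuses the same binary, bundled input, and model file described above; the flag values are only illustrative):
+
+.. code:: shell
+
+    ./speed_benchmark_torch --pthreadpool_size=1 --model=mobilenetv2-quant_full-cpu.pt --use_bundled_input=0 --warmup=5 --iter=200 --use_caching_allocator=true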
+ + +Integration +----------- + +The converted models are ordinary TorchScript models. +You can use them in your app just like any other PyTorch model. +See `https://pytorch.org/mobile/android/ `_ +for an introduction to using PyTorch on Android. + + +Learn More +---------- + +- Learn more about optimization in our + `Mobile Performance Recipe `_ +- `MobileNetV2 `_ from torchvision +- Information about `NNAPI `_ diff --git a/prototype_source/numeric_suite_tutorial.py b/prototype_source/numeric_suite_tutorial.py index 61b7c670f..df386f4ef 100644 --- a/prototype_source/numeric_suite_tutorial.py +++ b/prototype_source/numeric_suite_tutorial.py @@ -50,7 +50,7 @@ float_model.fuse_model() float_model.qconfig = torch.quantization.default_qconfig img_data = [(torch.rand(2, 3, 10, 10, dtype=torch.float), torch.randint(0, 1, (2,), dtype=torch.long)) for _ in range(2)] -qmodel = quantize(float_model, default_eval_fn, img_data, inplace=False) +qmodel = quantize(float_model, default_eval_fn, [img_data], inplace=False) ############################################################################## # 1. Compare the weights of float and quantized models @@ -124,13 +124,13 @@ def compute_error(x, y): print("\nkeys of act_compare_dict entry for conv1's output:") print(act_compare_dict['conv1.stats'].keys()) -print(act_compare_dict['conv1.stats']['float'].shape) -print(act_compare_dict['conv1.stats']['quantized'].shape) +print(act_compare_dict['conv1.stats']['float'][0].shape) +print(act_compare_dict['conv1.stats']['quantized'][0].shape) ############################################################################## # This dict can be used to compare and compute the quantization error of the activations of float and quantized models as following. for key in act_compare_dict: - print(key, compute_error(act_compare_dict[key]['float'], act_compare_dict[key]['quantized'].dequantize())) + print(key, compute_error(act_compare_dict[key]['float'][0], act_compare_dict[key]['quantized'][0].dequantize())) ############################################################################## # If we want to do the comparison for more than one input data, we can do the following. @@ -206,7 +206,7 @@ def forward(self, x): float_model.fuse_model() float_model.qconfig = torch.quantization.default_qconfig img_data = [(torch.rand(2, 3, 10, 10, dtype=torch.float), torch.randint(0, 1, (2,), dtype=torch.long)) for _ in range(2)] -qmodel = quantize(float_model, default_eval_fn, img_data, inplace=False) +qmodel = quantize(float_model, default_eval_fn, [img_data], inplace=False) ############################################################################## # In the following example we call ``compare_model_stub()`` from PyTorch Numeric Suite to compare ``QuantizableBasicBlock`` module with its float point equivalent. This API returns a dict with key corresponding to module names and each entry being a dictionary with two keys 'float' and 'quantized', containing the output tensors of quantized and its matching float shadow module. @@ -224,14 +224,14 @@ def forward(self, x): print("\nkeys of ob_dict entry for layer1.0's output:") print(ob_dict['layer1.0.stats'].keys()) -print(ob_dict['layer1.0.stats']['float'].shape) -print(ob_dict['layer1.0.stats']['quantized'].shape) +print(ob_dict['layer1.0.stats']['float'][0].shape) +print(ob_dict['layer1.0.stats']['quantized'][0].shape) ############################################################################## # This dict can be then used to compare and compute the module level quantization error. 
for key in ob_dict: - print(key, compute_error(ob_dict[key]['float'], ob_dict[key]['quantized'].dequantize())) + print(key, compute_error(ob_dict[key]['float'][0], ob_dict[key]['quantized'][0].dequantize())) ############################################################################## # If we want to do the comparison for more than one input data, we can do the following. @@ -370,7 +370,7 @@ def init_hidden(self, bsz): for key in act_compare_dict: - print(key, compute_error(act_compare_dict[key]['float'][0], act_compare_dict[key]['quantized'][0])) + print(key, compute_error(act_compare_dict[key]['float'][0][0], act_compare_dict[key]['quantized'][0][0])) ############################################################################## # @@ -405,7 +405,7 @@ def init_hidden(self, bsz): # This dict can be then used to compare and compute the module level quantization error. for key in ob_dict: - print(key, compute_error(ob_dict[key]['float'], ob_dict[key]['quantized'])) + print(key, compute_error(ob_dict[key]['float'][0], ob_dict[key]['quantized'][0])) ############################################################################## # SQNR of 40 dB is high and this is a situation where we have very good numerical alignment between the floating point and quantized model. diff --git a/prototype_source/prototype_index.rst b/prototype_source/prototype_index.rst new file mode 100644 index 000000000..46b8c4cfb --- /dev/null +++ b/prototype_source/prototype_index.rst @@ -0,0 +1,147 @@ +PyTorch Prototype Recipes +--------------------------------------------- +Prototype features are not available as part of binary distributions like PyPI or Conda (except maybe behind run-time flags). To test these features we would, depending on the feature, recommend building from master or using the nightly wheels that are made available on `pytorch.org `_. + +*Level of commitment*: We are committing to gathering high bandwidth feedback only on these features. Based on this feedback and potential further engagement between community members, we as a community will decide if we want to upgrade the level of commitment or to fail fast. + + +.. raw:: html + +
+ + +
+ + + +
+ +
+ +
+
+ +.. Add prototype tutorial cards below this line + +.. Quantization + +.. customcarditem:: + :header: FX Graph Mode Quantization User Guide + :card_description: Learn about FX Graph Mode Quantization. + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../prototype/fx_graph_mode_quant_guide.html + :tags: FX,Quantization + +.. customcarditem:: + :header: FX Graph Mode Post Training Dynamic Quantization + :card_description: Learn how to do post training dynamic quantization in graph mode based on torch.fx. + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../prototype/fx_graph_mode_ptq_dynamic.html + :tags: FX,Quantization + +.. customcarditem:: + :header: FX Graph Mode Post Training Static Quantization + :card_description: Learn how to do post training static quantization in graph mode based on torch.fx. + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../prototype/fx_graph_mode_ptq_static.html + :tags: FX,Quantization + +.. customcarditem:: + :header: Graph Mode Dynamic Quantization on BERT + :card_description: Learn how to do post training dynamic quantization with graph mode quantization on BERT models. + :image: ../_static/img/thumbnails/cropped/graph-mode-dynamic-bert.png + :link: ../prototype/graph_mode_dynamic_bert_tutorial.html + :tags: Text,Quantization + +.. customcarditem:: + :header: PyTorch Numeric Suite Tutorial + :card_description: Learn how to use the PyTorch Numeric Suite to support quantization debugging efforts. + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../prototype/numeric_suite_tutorial.html + :tags: Debugging,Quantization + +.. Mobile + +.. customcarditem:: + :header: Use iOS GPU in PyTorch + :card_description: Learn how to run your models on iOS GPU. + :image: ../_static/img/thumbnails/cropped/ios.png + :link: ../prototype/ios_gpu_workflow.html + :tags: Mobile + +.. customcarditem:: + :header: Convert MobileNetV2 to NNAPI + :card_description: Learn how to prepare a computer vision model to use Android’s Neural Networks API (NNAPI). + :image: ../_static/img/thumbnails/cropped/android.png + :link: ../prototype/nnapi_mobilenetv2.html + :tags: Mobile + +.. customcarditem:: + :header: PyTorch Vulkan Backend User Workflow + :card_description: Learn how to use the Vulkan backend on mobile GPUs. + :image: ../_static/img/thumbnails/cropped/android.png + :link: ../prototype/vulkan_workflow.html + :tags: Mobile + +.. customcarditem:: + :header: Lite Interpreter Workflow in Android and iOS + :card_description: Learn how to use the lite interpreter on iOS and Andriod devices. + :image: ../_static/img/thumbnails/cropped/mobile.png + :link: ../prototype/lite_interpreter.html + :tags: Mobile + +.. TorchScript + +.. customcarditem:: + :header: Model Freezing in TorchScript + :card_description: Freezing is the process of inlining Pytorch module parameters and attributes values into the TorchScript internal representation. + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../prototype/torchscript_freezing.html + :tags: TorchScript + +.. vmap + +.. customcarditem:: + :header: Using torch.vmap + :card_description: Learn about torch.vmap, an autovectorizer for PyTorch operations. + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../prototype/vmap_recipe.html + :tags: vmap + +.. End of tutorial card section + +.. raw:: html + +
+ + + +
+ +
+ +.. ----------------------------------------- +.. Page TOC +.. ----------------------------------------- +.. toctree:: + :hidden: + + prototype/fx_graph_mode_quant_guide.html + prototype/fx_graph_mode_ptq_dynamic.html + prototype/fx_graph_mode_ptq_static.html + prototype/graph_mode_dynamic_bert_tutorial.html + prototype/ios_gpu_workflow.html + prototype/nnapi_mobilenetv2.html + prototype/numeric_suite_tutorial.html + prototype/torchscript_freezing.html + prototype/vmap_recipe.html + prototype/vulkan_workflow.html + prototype/lite_interpreter.html diff --git a/prototype_source/vmap_recipe.py b/prototype_source/vmap_recipe.py new file mode 100644 index 000000000..6fb6e156e --- /dev/null +++ b/prototype_source/vmap_recipe.py @@ -0,0 +1,121 @@ +""" +torch.vmap +========== +This tutorial introduces torch.vmap, an autovectorizer for PyTorch operations. +torch.vmap is a prototype feature and cannot handle a number of use cases; +however, we would like to gather use cases for it to inform the design. If you +are considering using torch.vmap or think it would be really cool for something, +please contact us at https://github.com/pytorch/pytorch/issues/42368. + +So, what is vmap? +----------------- +vmap is a higher-order function. It accepts a function `func` and returns a new +function that maps `func` over some dimension of the inputs. It is highly +inspired by JAX's vmap. + +Semantically, vmap pushes the "map" into PyTorch operations called by `func`, +effectively vectorizing those operations. +""" +import torch +# NB: vmap is only available on nightly builds of PyTorch. +# You can download one at pytorch.org if you're interested in testing it out. +from torch import vmap + +#################################################################### +# The first use case for vmap is making it easier to handle +# batch dimensions in your code. One can write a function `func` +# that runs on examples and then lift it to a function that can +# take batches of examples with `vmap(func)`. `func` however +# is subject to many restrictions: +# - it must be functional (one cannot mutate a Python data structure +# inside of it), with teh exception of in-place PyTorch operations. +# - batches of examples must be provided as Tensors. This means that +# vmap doesn't handle variable-length sequences out of the box. +# +# One example of using `vmap` is to compute batched dot products. PyTorch +# doesn't provide a batched `torch.dot` API; instead of unsuccessfully +# rummaging through docs, use `vmap` to construct a new function: + +torch.dot # [D], [D] -> [] +batched_dot = torch.vmap(torch.dot) # [N, D], [N, D] -> [N] +x, y = torch.randn(2, 5), torch.randn(2, 5) +batched_dot(x, y) + +#################################################################### +# `vmap` can be helpful in hiding batch dimensions, leading to a simpler +# model authoring experience. +batch_size, feature_size = 3, 5 +weights = torch.randn(feature_size, requires_grad=True) + +# Note that model doesn't work with a batch of feature vectors because +# torch.dot must take 1D tensors. It's pretty easy to rewrite this +# to use `torch.matmul` instead, but if we didn't want to do that or if +# the code is more complicated (e.g., does some advanced indexing +# shenanigins), we can simply call `vmap`. `vmap` batches over ALL +# inputs, unless otherwise specified (with the in_dims argument, +# please see the documentation for more details). 
+def model(feature_vec): + # Very simple linear model with activation + return feature_vec.dot(weights).relu() + +examples = torch.randn(batch_size, feature_size) +result = torch.vmap(model)(examples) +expected = torch.stack([model(example) for example in examples.unbind()]) +assert torch.allclose(result, expected) + +#################################################################### +# `vmap` can also help vectorize computations that were previously difficult +# or impossible to batch. This bring us to our second use case: batched +# gradient computation. +# - https://github.com/pytorch/pytorch/issues/8304 +# - https://github.com/pytorch/pytorch/issues/23475 +# +# The PyTorch autograd engine computes vjps (vector-Jacobian products). +# Using vmap, we can compute (batched vector) - jacobian products. +# +# One example of this is computing a full Jacobian matrix (this can also be +# applied to computing a full Hessian matrix). +# Computing a full Jacobian matrix for some function f: R^N -> R^N usually +# requires N calls to `autograd.grad`, one per Jacobian row. + +# Setup +N = 5 +def f(x): + return x ** 2 + +x = torch.randn(N, requires_grad=True) +y = f(x) +basis_vectors = torch.eye(N) + +# Sequential approach +jacobian_rows = [torch.autograd.grad(y, x, v, retain_graph=True)[0] + for v in basis_vectors.unbind()] +jacobian = torch.stack(jacobian_rows) + +# Using `vmap`, we can vectorize the whole computation, computing the +# Jacobian in a single call to `autograd.grad`. +def get_vjp(v): + return torch.autograd.grad(y, x, v)[0] + +jacobian_vmap = vmap(get_vjp)(basis_vectors) +assert torch.allclose(jacobian_vmap, jacobian) + +#################################################################### +# The third main use case for vmap is computing per-sample-gradients. +# This is something that the vmap prototype cannot handle performantly +# right now. We're not sure what the API for computing per-sample-gradients +# should be, but if you have ideas, please comment in +# https://github.com/pytorch/pytorch/issues/7786. + +def model(sample, weight): + # do something... + return torch.dot(sample, weight) + +def grad_sample(sample): + return torch.autograd.functional.vjp(lambda weight: model(sample), weight)[1] + +# The following doesn't actually work in the vmap prototype. But it +# could be an API for computing per-sample-gradients. + +# batch_of_samples = torch.randn(64, 5) +# vmap(grad_sample)(batch_of_samples) diff --git a/prototype_source/vulkan_workflow.rst b/prototype_source/vulkan_workflow.rst new file mode 100644 index 000000000..c18f57ae2 --- /dev/null +++ b/prototype_source/vulkan_workflow.rst @@ -0,0 +1,243 @@ +PyTorch Vulkan Backend User Workflow +==================================== + +**Author**: `Ivan Kobzarev `_ + +Introduction +------------ +PyTorch 1.7 supports the ability to run model inference on GPUs that support the Vulkan graphics and compute API. The primary target devices are mobile GPUs on Android devices. The Vulkan backend can also be used on Linux, Mac, and Windows desktop builds to use Vulkan devices like Intel integrated GPUs. This feature is in the prototype stage and is subject to change. + +Building PyTorch with Vulkan backend +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Vulkan backend is not included by default. The main switch to include Vulkan backend is cmake option ``USE_VULKAN``, that can be set by environment variable ``USE_VULKAN``. + +To use PyTorch with Vulkan backend, we need to build it from source with additional settings. 
Check out the PyTorch source code from the GitHub master branch.

Optional usage of vulkan wrapper
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

By default, the Vulkan library will be loaded at runtime using the vulkan_wrapper library. If you specify the environment variable ``USE_VULKAN_WRAPPER=0``, libvulkan will be linked directly.

Desktop build
^^^^^^^^^^^^^

Vulkan SDK
^^^^^^^^^^
Download the VulkanSDK from https://vulkan.lunarg.com/sdk/home and set the environment variable ``VULKAN_SDK``.

Unpack the VulkanSDK to the ``VULKAN_SDK_ROOT`` folder and install the VulkanSDK following the VulkanSDK instructions for your system.

For Mac:

::

  cd $VULKAN_SDK_ROOT
  source setup-env.sh
  sudo python install_vulkan.py


Building PyTorch:

For Linux:

::

  cd PYTORCH_ROOT
  USE_VULKAN=1 USE_VULKAN_SHADERC_RUNTIME=1 USE_VULKAN_WRAPPER=0 python setup.py install

For Mac:

::

  cd PYTORCH_ROOT
  USE_VULKAN=1 USE_VULKAN_SHADERC_RUNTIME=1 USE_VULKAN_WRAPPER=0 MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py install

After a successful build, open another terminal and verify the version of the installed PyTorch.

::

  import torch
  print(torch.__version__)

At the time of writing of this recipe, the version is 1.8.0a0+41237a4. You might see different numbers depending on when you check out the code from master, but it should be greater than 1.7.0.


Android build
^^^^^^^^^^^^^

To build LibTorch for Android with the Vulkan backend for a specific ``ANDROID_ABI``:

::

  cd PYTORCH_ROOT
  ANDROID_ABI=arm64-v8a USE_VULKAN=1 sh ./scripts/build_android.sh


To prepare pytorch_android aars that you can use directly in your app:

::

  cd $PYTORCH_ROOT
  USE_VULKAN=1 sh ./scripts/build_pytorch_android.sh


Model preparation
-----------------

Install torchvision and get the default pretrained float model.

::

  pip install torchvision

Python script to save the pretrained mobilenet_v2 to a file:

::

  import torch
  import torchvision

  model = torchvision.models.mobilenet_v2(pretrained=True)
  model.eval()
  script_model = torch.jit.script(model)
  torch.jit.save(script_model, "mobilenet2.pt")

The PyTorch 1.7 Vulkan backend supports only 32-bit float operators. The default model needs an additional step that fuses operators:

::

  from torch.utils.mobile_optimizer import optimize_for_mobile
  script_model_vulkan = optimize_for_mobile(script_model, backend='vulkan')
  torch.jit.save(script_model_vulkan, "mobilenet2-vulkan.pt")

The resulting model can be used only on the Vulkan backend, as it contains operators specific to that backend.

Using Vulkan backend in code
----------------------------

C++ API
-------

::

  at::is_vulkan_available()
  auto tensor = at::rand({1, 2, 2, 3}, at::device(at::kCPU).dtype(at::kFloat));
  auto tensor_vulkan = tensor.vulkan();
  auto module = torch::jit::load("$PATH");
  auto tensor_output_vulkan = module.forward(inputs).toTensor();
  auto tensor_output = tensor_output_vulkan.cpu();

The ``at::is_vulkan_available()`` function tries to initialize the Vulkan backend; it returns true if a Vulkan device is successfully found and a context is created, and false otherwise.

The ``.vulkan()`` function called on a Tensor will copy the tensor to the Vulkan device; for operators called with this tensor as input, the operator will run on the Vulkan device, and its output will be on the Vulkan device.

The ``.cpu()`` function called on a Vulkan tensor will copy its data to a CPU tensor (the default device).

Operators called with a tensor on a Vulkan device as an input will be executed on the Vulkan device. If an operator is not supported for the Vulkan backend, an exception will be thrown.

List of supported operators:

::

  _adaptive_avg_pool2d
  _cat
  add.Scalar
  add.Tensor
  add_.Tensor
  addmm
  avg_pool2d
  clamp
  convolution
  empty.memory_format
  empty_strided
  hardtanh_
  max_pool2d
  mean.dim
  mm
  mul.Scalar
  relu_
  reshape
  select.int
  slice.Tensor
  transpose.int
  transpose_
  unsqueeze
  upsample_nearest2d
  view

These operators are enough to run torchvision models for image classification on the Vulkan backend.


Python API
----------

``torch.is_vulkan_available()`` is exposed to the Python API.

``tensor.to(device='vulkan')`` works like ``.vulkan()``, moving the tensor to the Vulkan device.

At the moment of writing this tutorial, ``.vulkan()`` itself is not exposed to the Python API, but it is planned to be.

Android Java API
----------------

For the Android Java API, to run a model on the Vulkan backend we have to specify this during model loading:

::

  import org.pytorch.Device;
  Module module = Module.load("$PATH", Device.VULKAN);
  FloatBuffer buffer = Tensor.allocateFloatBuffer(1 * 3 * 224 * 224);
  Tensor inputTensor = Tensor.fromBlob(buffer, new int[]{1, 3, 224, 224});
  Tensor outputTensor = module.forward(IValue.from(inputTensor)).toTensor();

In this case, all inputs will be transparently copied from the CPU to the Vulkan device, the model will be run on the Vulkan device, and the output will be transparently copied back to the CPU.

An example of using the Vulkan backend can be found in the test application within the PyTorch repository:
https://github.com/pytorch/pytorch/blob/master/android/test_app/app/src/main/java/org/pytorch/testapp/MainActivity.java#L133

Building android test app with Vulkan
-------------------------------------

1. Build pytorch android with Vulkan backend for all android ABIs
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

::

  cd $PYTORCH_ROOT
  USE_VULKAN=1 sh ./scripts/build_pytorch_android.sh

Or, if you need only a specific ABI, you can set it as an argument:

::

  cd $PYTORCH_ROOT
  USE_VULKAN=1 sh ./scripts/build_pytorch_android.sh $ANDROID_ABI

2. Add vulkan model to test application assets
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Add the prepared model ``mobilenet2-vulkan.pt`` to the test application assets:

::

  cp mobilenet2-vulkan.pt $PYTORCH_ROOT/android/test_app/app/src/main/assets/


3. Build and install the test application to a connected Android device
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

::

  cd $PYTORCH_ROOT
  gradle -p android test_app:installMbvulkanLocalBaseDebug

After successful installation, the application with the name 'MBQ' can be launched on the device.


Testing models without uploading to android device
--------------------------------------------------

Software implementations of Vulkan (e.g. https://swiftshader.googlesource.com/SwiftShader) can be used to test whether a model can be run using the PyTorch Vulkan backend (e.g. to check that all of the model's operators are supported).
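As a quick end-to-end check on such a setup, the following minimal Python sketch (not part of the workflow above; it assumes a local PyTorch build with ``USE_VULKAN=1`` and the ``mobilenet2-vulkan.pt`` file produced in the Model preparation section) verifies that the backend initializes and that the optimized model runs:

::

  import torch

  # Requires a PyTorch build with USE_VULKAN=1; a software implementation such as
  # SwiftShader can stand in for a physical Vulkan device.
  assert torch.is_vulkan_available()

  model = torch.jit.load("mobilenet2-vulkan.pt")
  x = torch.rand(1, 3, 224, 224)

  # Move the input to the Vulkan device and run the model there.
  # Unsupported operators would raise an exception at this point.
  y = model(x.to(device="vulkan"))

  # Copy the result back to the CPU for inspection.
  print(y.cpu().shape)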
diff --git a/recipes_source/cuda_rpc.rst b/recipes_source/cuda_rpc.rst new file mode 100644 index 000000000..0114664d5 --- /dev/null +++ b/recipes_source/cuda_rpc.rst @@ -0,0 +1,147 @@ +Direct Device-to-Device Communication with TensorPipe CUDA RPC +============================================================== + +.. note:: Direct device-to-device RPC (CUDA RPC) is introduced in PyTorch 1.8 + as a prototype feature. This API is subject to change. + +In this recipe, you will learn: + +- The high-level idea of CUDA RPC. +- How to use CUDA RPC. + + +Requirements +------------ + +- PyTorch 1.8+ +- `Getting Started With Distributed RPC Framework `_ + + +What is CUDA RPC? +------------------------------------ + +CUDA RPC supports directly sending Tensors from local CUDA memory to remote +CUDA memory. Prior to v1.8 release, PyTorch RPC only accepts CPU Tensors. As a +result, when an application needs to send a CUDA Tensor through RPC, it has +to first move the Tensor to CPU on the caller, send it via RPC, and then move +it to the destination device on the callee, which incurs both unnecessary +synchronizations and D2H and H2D copies. Since v1.8, RPC allows users to +configure a per-process global device map using the +`set_device_map `_ +API, specifying how to map local devices to remote devices. More specifically, +if ``worker0``'s device map has an entry ``"worker1" : {"cuda:0" : "cuda:1"}``, +all RPC arguments on ``"cuda:0"`` from ``worker0`` will be directly sent to +``"cuda:1"`` on ``worker1``. The response of an RPC will use the inverse of +the caller device map, i.e., if ``worker1`` returns a Tensor on ``"cuda:1"``, +it will be directly sent to ``"cuda:0"`` on ``worker0``. All intended +device-to-device direct communication must be specified in the per-process +device map. Otherwise, only CPU tensors are allowed. + +Under the hood, PyTorch RPC relies on `TensorPipe `_ +as the communication backend. PyTorch RPC extracts all Tensors from each +request or response into a list and packs everything else into a binary +payload. Then, TensorPipe will automatically choose a communication channel +for each Tensor based on Tensor device type and channel availability on both +the caller and the callee. Existing TensorPipe channels cover NVLink, InfiniBand, +SHM, CMA, TCP, etc. + +How to use CUDA RPC? +--------------------------------------- + +The code below shows how to use CUDA RPC. The model contains two linear layers +and is split into two shards. The two shards are placed on ``worker0`` and +``worker1`` respectively, and ``worker0`` serves as the master that drives the +forward and backward passes. Note that we intentionally skipped +`DistributedOptimizer `_ +to highlight the performance improvements when using CUDA RPC. The experiment +repeats the forward and backward passes 10 times and measures the total +execution time. It compares using CUDA RPC against manually staging to CPU +memory and using CPU RPC. 
+ + +:: + + import torch + import torch.distributed.autograd as autograd + import torch.distributed.rpc as rpc + import torch.multiprocessing as mp + import torch.nn as nn + + import os + import time + + + class MyModule(nn.Module): + def __init__(self, device, comm_mode): + super().__init__() + self.device = device + self.linear = nn.Linear(1000, 1000).to(device) + self.comm_mode = comm_mode + + def forward(self, x): + # x.to() is a no-op if x is already on self.device + y = self.linear(x.to(self.device)) + return y.cpu() if self.comm_mode == "cpu" else y + + def parameter_rrefs(self): + return [rpc.RRef(p) for p in self.parameters()] + + + def measure(comm_mode): + # local module on "worker0/cuda:0" + lm = MyModule("cuda:0", comm_mode) + # remote module on "worker1/cuda:1" + rm = rpc.remote("worker1", MyModule, args=("cuda:1", comm_mode)) + # prepare random inputs + x = torch.randn(1000, 1000).cuda(0) + + tik = time.time() + for _ in range(10): + with autograd.context() as ctx: + y = rm.rpc_sync().forward(lm(x)) + autograd.backward(ctx, [y.sum()]) + # synchronize on "cuda:0" to make sure that all pending CUDA ops are + # included in the measurements + torch.cuda.current_stream("cuda:0").synchronize() + tok = time.time() + print(f"{comm_mode} RPC total execution time: {tok - tik}") + + + def run_worker(rank): + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '29500' + options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=128) + + if rank == 0: + options.set_device_map("worker1", {0: 1}) + rpc.init_rpc( + f"worker{rank}", + rank=rank, + world_size=2, + rpc_backend_options=options + ) + measure(comm_mode="cpu") + measure(comm_mode="cuda") + else: + rpc.init_rpc( + f"worker{rank}", + rank=rank, + world_size=2, + rpc_backend_options=options + ) + + # block until all rpcs finish + rpc.shutdown() + + + if __name__=="__main__": + world_size = 2 + mp.spawn(run_worker, nprocs=world_size, join=True) + +Outputs are displayed below, which shows that CUDA RPC can help to achieve +34X speed up compared to CPU RPC in this experiment. + +:: + + cpu RPC total execution time: 2.3145179748535156 Seconds + cuda RPC total execution time: 0.06867480278015137 Seconds diff --git a/recipes_source/distributed_rpc_profiling.rst b/recipes_source/distributed_rpc_profiling.rst new file mode 100644 index 000000000..51f5a4ca8 --- /dev/null +++ b/recipes_source/distributed_rpc_profiling.rst @@ -0,0 +1,433 @@ +Profiling PyTorch RPC-Based Workloads +====================================== + +In this recipe, you will learn: + +- An overview of the `Distributed RPC Framework`_. +- An overview of the `PyTorch Profiler`_. +- How to use the profiler to profile RPC-based workloads. +- A short example showcasing how to use the profiler to tune RPC parameters. + +Requirements +------------ + +- PyTorch 1.6+ + +The instructions for installing PyTorch are +available at `pytorch.org`_. + +What is the Distributed RPC Framework? +--------------------------------------- + +The **Distributed RPC Framework** provides mechanisms for multi-machine model +training through a set of primitives to allow for remote communication, and a +higher-level API to automatically differentiate models split across several machines. +For this recipe, it would be helpful to be familiar with the `Distributed RPC Framework`_ +as well as the `RPC Tutorials`_. + +What is the PyTorch Profiler? 
+--------------------------------------- +The profiler is a context manager based API that allows for on-demand profiling of +operators in a model's workload. The profiler can be used to analyze various aspects +of a model including execution time, operators invoked, and memory consumption. For a +detailed tutorial on using the profiler to profile a single-node model, please see the +`Profiler Recipe`_. + + + +How to use the Profiler for RPC-based workloads +----------------------------------------------- + +The profiler supports profiling of calls made of RPC and allows the user to have a +detailed view into the operations that take place on different nodes. To demonstrate an +example of this, let's first set up the RPC framework. The below code snippet will initialize +two RPC workers on the same host, named ``worker0`` and ``worker1`` respectively. The workers will +be spawned as subprocesses, and we set some environment variables required for proper +initialization. + +:: + + import torch + import torch.distributed.rpc as rpc + import torch.autograd.profiler as profiler + import torch.multiprocessing as mp + import os + import logging + import sys + + logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) + logger = logging.getLogger() + + def random_tensor(): + return torch.rand((3, 3), requires_grad=True) + + + def worker(rank, world_size): + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "29500" + worker_name = f"worker{rank}" + + # Initialize RPC framework. + rpc.init_rpc( + name=worker_name, + rank=rank, + world_size=world_size + ) + logger.debug(f"{worker_name} successfully initialized RPC.") + + pass # to be continued below + + logger.debug(f"Rank {rank} waiting for workers and shutting down RPC") + rpc.shutdown() + logger.debug(f"Rank {rank} shutdown RPC") + + + if __name__ == '__main__': + # Run 2 RPC workers. + world_size = 2 + mp.spawn(worker, args=(world_size,), nprocs=world_size) + +Running the above program should present you with the following output: + +:: + + DEBUG:root:worker1 successfully initialized RPC. + DEBUG:root:worker0 successfully initialized RPC. + DEBUG:root:Rank 0 waiting for workers and shutting down RPC + DEBUG:root:Rank 1 waiting for workers and shutting down RPC + DEBUG:root:Rank 1 shutdown RPC + DEBUG:root:Rank 0 shutdown RPC + +Now that we have a skeleton setup of our RPC framework, we can move on to +sending RPCs back and forth and using the profiler to obtain a view of what's +happening under the hood. Let's add to the above ``worker`` function: + +:: + + def worker(rank, world_size): + # Above code omitted... + if rank == 0: + dst_worker_rank = (rank + 1) % world_size + dst_worker_name = f"worker{dst_worker_rank}" + t1, t2 = random_tensor(), random_tensor() + # Send and wait RPC completion under profiling scope. + with profiler.profile() as prof: + fut1 = rpc.rpc_async(dst_worker_name, torch.add, args=(t1, t2)) + fut2 = rpc.rpc_async(dst_worker_name, torch.mul, args=(t1, t2)) + # RPCs must be awaited within profiling scope. + fut1.wait() + fut2.wait() + + print(prof.key_averages().table()) + +The aforementioned code creates 2 RPCs, specifying ``torch.add`` and ``torch.mul``, respectively, +to be run with two random input tensors on worker 1. Since we use the ``rpc_async`` API, +we are returned a ``torch.futures.Future`` object, which must be awaited for the result +of the computation. 
Note that this wait must take place within the scope created by +the profiling context manager in order for the RPC to be accurately profiled. Running +the code with this new worker function should result in the following output: + +:: + + # Some columns are omitted for brevity, exact output subject to randomness + ---------------------------------------------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- + Name Self CPU total % Self CPU total CPU total % CPU total CPU time avg Number of Calls Node ID + ---------------------------------------------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- + rpc_async#aten::add(worker0 -> worker1) 0.00% 0.000us 0 20.462ms 20.462ms 1 0 + rpc_async#aten::mul(worker0 -> worker1) 0.00% 0.000us 0 5.712ms 5.712ms 1 0 + rpc_async#aten::mul(worker0 -> worker1)#remote_op: mul 1.84% 206.864us 2.69% 302.162us 151.081us 2 1 + rpc_async#aten::add(worker0 -> worker1)#remote_op: add 1.41% 158.501us 1.57% 176.924us 176.924us 1 1 + rpc_async#aten::mul(worker0 -> worker1)#remote_op: output_nr 0.04% 4.980us 0.04% 4.980us 2.490us 2 1 + rpc_async#aten::mul(worker0 -> worker1)#remote_op: is_leaf 0.07% 7.806us 0.07% 7.806us 1.952us 4 1 + rpc_async#aten::add(worker0 -> worker1)#remote_op: empty 0.16% 18.423us 0.16% 18.423us 18.423us 1 1 + rpc_async#aten::mul(worker0 -> worker1)#remote_op: empty 0.14% 15.712us 0.14% 15.712us 15.712us 1 1 + ---------------------------------------------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- + Self CPU time total: 11.237ms + +Here we can see that the profiler has profiled our ``rpc_async`` calls made to ``worker1`` +from ``worker0``. In particular, the first 2 entries in the table show details (such as +the operator name, originating worker, and destination worker) about each RPC call made +and the ``CPU total`` column indicates the end-to-end latency of the RPC call. + +We also have visibility into the actual operators invoked remotely on worker 1 due to RPC. +We can see operations that took place on ``worker1`` by checking the ``Node ID`` column. For +example, we can interpret the row with name ``rpc_async#aten::mul(worker0 -> worker1)#remote_op: mul`` +as a ``mul`` operation taking place on the remote node, as a result of the RPC sent to ``worker1`` +from ``worker0``, specifying ``worker1`` to run the builtin ``mul`` operator on the input tensors. +Note that names of remote operations are prefixed with the name of the RPC event that resulted +in them. For example, remote operations corresponding to the ``rpc.rpc_async(dst_worker_name, torch.add, args=(t1, t2))`` +call are prefixed with ``rpc_async#aten::mul(worker0 -> worker1)``. + +We can also use the profiler to gain insight into user-defined functions that are executed over RPC. +For example, let's add the following to the above ``worker`` function: + +:: + + # Define somewhere outside of worker() func. 
+ def udf_with_ops(): + import time + time.sleep(1) + t1, t2 = random_tensor(), random_tensor() + torch.add(t1, t2) + torch.mul(t1, t2) + + def worker(rank, world_size): + # Above code omitted + with profiler.profile() as p: + fut = rpc.rpc_async(dst_worker_name, udf_with_ops) + fut.wait() + print(p.key_averages().table()) + +The above code creates a user-defined function that sleeps for 1 second, and then executes various +operators. Similar to what we've done above, we send an RPC to the remote worker, specifying it to +run our user-defined function. Running this code should result in the following output: + +:: + + # Exact output subject to randomness + -------------------------------------------------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- + Name Self CPU total % Self CPU total CPU total % CPU total CPU time avg Number of Calls Node ID + -------------------------------------------------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- + rpc_async#udf_with_ops(worker0 -> worker1) 0.00% 0.000us 0 1.008s 1.008s 1 0 + rpc_async#udf_with_ops(worker0 -> worker1)#remote_op: rand 12.58% 80.037us 47.09% 299.589us 149.795us 2 1 + rpc_async#udf_with_ops(worker0 -> worker1)#remote_op: empty 15.40% 98.013us 15.40% 98.013us 24.503us 4 1 + rpc_async#udf_with_ops(worker0 -> worker1)#remote_op: uniform_ 22.85% 145.358us 23.87% 151.870us 75.935us 2 1 + rpc_async#udf_with_ops(worker0 -> worker1)#remote_op: is_complex 1.02% 6.512us 1.02% 6.512us 3.256us 2 1 + rpc_async#udf_with_ops(worker0 -> worker1)#remote_op: add 25.80% 164.179us 28.43% 180.867us 180.867us 1 1 + rpc_async#udf_with_ops(worker0 -> worker1)#remote_op: mul 20.48% 130.293us 31.43% 199.949us 99.975us 2 1 + rpc_async#udf_with_ops(worker0 -> worker1)#remote_op: output_nr 0.71% 4.506us 0.71% 4.506us 2.253us 2 1 + rpc_async#udf_with_ops(worker0 -> worker1)#remote_op: is_leaf 1.16% 7.367us 1.16% 7.367us 1.842us 4 1 + -------------------------------------------------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- + +Here we can see that the user-defined function has successfully been profiled with its name +``(rpc_async#udf_with_ops(worker0 -> worker1))``, and has the CPU total time we would roughly expect +(slightly greater than 1s given the ``sleep``). Similar to the above profiling output, we can see the +remote operators that have been executed on worker 1 as part of executing this RPC request. + +In addition, we can visualize remote execution using the tracing functionality provided by the profiler. +Let's add the following code to the above ``worker`` function: + +:: + + def worker(rank, world_size): + # Above code omitted + # Will generate trace for above profiling output + trace_file = "/tmp/trace.json" + prof.export_chrome_trace(trace_file) + logger.debug(f"Wrote trace to {trace_file}") + +Now, we can load the trace file in Chrome (``chrome://tracing``). We should see output similar to +the following: + +.. image:: ../_static/img/rpc_trace_img.png + :scale: 25 % + +As we can see, we have traced our RPC requests and can also visualize traces of the remote operations, +in this case, given in the trace row for ``node_id: 1``. 
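If you prefer to post-process the profiler output programmatically rather than read the full table, a small helper like the one below can be used. This is an optional sketch, not part of the recipe; it relies on the ``#remote_op:`` marker that, as shown in the tables above, the RPC framework appends to the names of remotely executed operations:

::

    def print_remote_ops(prof):
        # key_averages() aggregates events by name; times are reported in microseconds.
        for evt in prof.key_averages():
            if "#remote_op:" in evt.key:
                print(f"{evt.key}: {evt.cpu_time_total:.1f}us over {evt.count} call(s)")

    # For example, call print_remote_ops(prof) right after the profiling scope above.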
+ + +Example: Using profiler to tune RPC initialization parameters +-------------------------------------------------------------- + +The following exercise is intended to be a simple example into how one can use statistics and traces +from the profiler to guide tuning RPC initialization parameters. In particular, we will focus on tuning +the ``num_worker_threads`` parameter used during RPC initialization. First, we modify our ``rpc.init_rpc`` +call to the following: + +:: + + # Initialize RPC framework. + num_worker_threads = 1 + rpc.init_rpc( + name=worker_name, + rank=rank, + world_size=world_size, + rpc_backend_options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=num_worker_threads) + ) + +This will initialize the [TensorPipe RPC backend](https://pytorch.org/docs/stable/rpc.html#tensorpipe-backend) with only one thread for processing RPC requests. Next, add +the following function somewhere outside of the ``worker`` main function: + +:: + + def num_workers_udf_with_ops(): + t = torch.randn((100, 100)) + for i in range(10): + t.mul(t) + t.add(t) + t = t.relu() + t = t.sigmoid() + return t + +This function is mainly intended to be a dummy CPU-intensive function for demonstration purposes. Next, we add the +following RPC and profiling code to our main ``worker`` function: + +:: + + with profiler.profile() as p: + futs = [] + for i in range(4): + fut = rpc.rpc_async(dst_worker_name, num_workers_udf_with_ops) + futs.append(fut) + for f in futs: + f.wait() + + print(p.key_averages().table()) + + trace_file = "/tmp/trace.json" + # Export the trace. + p.export_chrome_trace(trace_file) + logger.debug(f"Wrote trace to {trace_file}") + +Running the code should return the following profiling statistics (exact output subject to randomness): + +:: + + ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls Node ID + ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ + aten::zeros 0.33% 143.557us 0.47% 203.125us 50.781us 4 0 + aten::empty 0.24% 101.487us 0.24% 101.487us 12.686us 8 0 + aten::zero_ 0.04% 17.758us 0.04% 17.758us 4.439us 4 0 + rpc_async#num_workers_udf_with_ops(worker0 -> worker... 0.00% 0.000us 0 189.757ms 47.439ms 4 0 + # additional columns omitted for brevity + ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ + +We can see that there were 4 RPC calls as expected taking a total of 190ms. Let's now tune the ``num_worker_threads`` +parameter we set earlier, by changing it to ``num_worker_threads = 8``. Running the code with that change should return +the following profiling statistics (exact output subject to randomness): + +:: + + ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls Node ID + ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ + aten::zeros 0.31% 127.320us 0.53% 217.203us 54.301us 4 0 + aten::empty 0.27% 113.529us 0.27% 113.529us 14.191us 8 0 + aten::zero_ 0.04% 18.032us 0.04% 18.032us 4.508us 4 0 + rpc_async#num_workers_udf_with_ops(worker0 -> worker... 
0.00% 0.000us 0 94.776ms 23.694ms 4 0 + + +We see a clear ~2x speedup, and hypothesize that this speedup is due to exploiting parallelism on the server due +to the additional cores available. However, how can we ensure that this speedup is due to the increase in cores? +Taking a look at the trace visualization helps with this. Below is the trace when we set ``num_worker_threads=1``: + +.. image:: ../_static/img/oneworker.png + :scale: 25 % + +Focusing on the trace for ``node 1``, we can see that the RPCs are ran serially on the server. + +Next, the following is the trace where we set ``num_worker_threads=8``: + +.. image:: ../_static/img/8_workers.png + :scale: 25 % + +Based on the latter trace, we can see ``node 1`` was able to execute the RPCs in parallel on the server, due to having additional +worker threads. To summarize, we were able to leverage both the profiler's output report and trace to pick an appropriate +``num_worker_threads`` parameter for RPC initialization in this simple exercise. + + +Putting it all together, we have the following code for this recipe: + +:: + + import torch + import torch.distributed.rpc as rpc + import torch.autograd.profiler as profiler + import torch.multiprocessing as mp + import os + import logging + import sys + + logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) + logger = logging.getLogger() + + def random_tensor(): + return torch.rand((3, 3), requires_grad=True) + + def udf_with_ops(): + import time + time.sleep(1) + t1, t2 = random_tensor(), random_tensor() + torch.add(t1, t2) + torch.mul(t1, t2) + + def num_workers_udf_with_ops(): + t = torch.randn((100, 100)) + for i in range(10): + t.mul(t) + t.add(t) + t = t.relu() + t = t.sigmoid() + return t + + def worker(rank, world_size): + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "29500" + worker_name = f"worker{rank}" + + # Initialize RPC framework. + num_worker_threads =8 + rpc.init_rpc( + name=worker_name, + rank=rank, + world_size=world_size, + rpc_backend_options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=num_worker_threads), + ) + logger.debug(f"{worker_name} successfully initialized RPC.") + + if rank == 0: + dst_worker_rank = (rank + 1) % world_size + dst_worker_name = f"worker{dst_worker_rank}" + t1, t2 = random_tensor(), random_tensor() + # Send and wait RPC completion under profiling scope. + with profiler.profile() as prof: + fut1 = rpc.rpc_async(dst_worker_name, torch.add, args=(t1, t2)) + fut2 = rpc.rpc_async(dst_worker_name, torch.mul, args=(t1, t2)) + # RPCs must be awaited within profiling scope. + fut1.wait() + fut2.wait() + print(prof.key_averages().table()) + + with profiler.profile() as p: + futs = [] + for i in range(4): + fut = rpc.rpc_async(dst_worker_name, num_workers_udf_with_ops) + futs.append(fut) + for f in futs: + f.wait() + + print(p.key_averages().table()) + + trace_file = "/tmp/trace.json" + # Export the trace. + p.export_chrome_trace(trace_file) + logger.debug(f"Wrote trace to {trace_file}") + + + logger.debug(f"Rank {rank} waiting for workers and shutting down RPC") + rpc.shutdown() + logger.debug(f"Rank {rank} shutdown RPC") + + + + if __name__ == '__main__': + # Run 2 RPC workers. + world_size = 2 + mp.spawn(worker, args=(world_size,), nprocs=world_size) + + +Learn More +------------------- + +- `pytorch.org`_ for installation instructions, and more documentation + and tutorials. +- `Distributed RPC Framework`_ for RPC framework and API reference. +- `Full profiler documentation`_ for profiler documentation. + +.. 
_pytorch.org: https://pytorch.org/ +.. _Full profiler documentation: https://pytorch.org/docs/stable/autograd.html#profiler +.. _Pytorch Profiler: https://pytorch.org/docs/stable/autograd.html#profiler +.. _Distributed RPC Framework: https://pytorch.org/docs/stable/rpc.html +.. _RPC Tutorials: https://pytorch.org/tutorials/intermediate/rpc_tutorial.html +.. _Profiler Recipe: https://pytorch.org/tutorials/recipes/recipes/profiler.html diff --git a/recipes_source/fuse.rst b/recipes_source/fuse.rst new file mode 100644 index 000000000..e4dfa36bc --- /dev/null +++ b/recipes_source/fuse.rst @@ -0,0 +1,157 @@ +Fuse Modules Recipe +===================================== + +This recipe demonstrates how to fuse a list of PyTorch modules into a single module and how to do the performance test to compare the fused model with its non-fused version. + +Introduction +------------ + +Before quantization is applied to a model to reduce its size and memory footprint (see `Quantization Recipe `_ for details on quantization), the list of modules in the model may be fused first into a single module. Fusion is optional, but it may save on memory access, make the model run faster, and improve its accuracy. + + +Pre-requisites +-------------- + +PyTorch 1.6.0 or 1.7.0 + +Steps +-------------- + +Follow the steps below to fuse an example model, quantize it, script it, optimize it for mobile, save it and test it with the Android benchmark tool. + +1. Define the Example Model +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Use the same example model defined in the `PyTorch Mobile Performance Recipes `_: + +:: + + import torch + from torch.utils.mobile_optimizer import optimize_for_mobile + + class AnnotatedConvBnReLUModel(torch.nn.Module): + def __init__(self): + super(AnnotatedConvBnReLUModel, self).__init__() + self.conv = torch.nn.Conv2d(3, 5, 3, bias=False).to(dtype=torch.float) + self.bn = torch.nn.BatchNorm2d(5).to(dtype=torch.float) + self.relu = torch.nn.ReLU(inplace=True) + self.quant = torch.quantization.QuantStub() + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + x.contiguous(memory_format=torch.channels_last) + x = self.quant(x) + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + x = self.dequant(x) + return x + + +2. 
Generate Two Models with and without `fuse_modules` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Add the following code below the model definition above and run the script: + +:: + + model = AnnotatedConvBnReLUModel() + print(model) + + def prepare_save(model, fused): + model.qconfig = torch.quantization.get_default_qconfig('qnnpack') + torch.quantization.prepare(model, inplace=True) + torch.quantization.convert(model, inplace=True) + torchscript_model = torch.jit.script(model) + torchscript_model_optimized = optimize_for_mobile(torchscript_model) + torch.jit.save(torchscript_model_optimized, "model.pt" if not fused else "model_fused.pt") + + prepare_save(model, False) + + model = AnnotatedConvBnReLUModel() + model_fused = torch.quantization.fuse_modules(model, [['bn', 'relu']], inplace=False) + print(model_fused) + + prepare_save(model_fused, True) + + +The graphs of the original model and its fused version will be printed as follows: + +:: + + AnnotatedConvBnReLUModel( + (conv): Conv2d(3, 5, kernel_size=(3, 3), stride=(1, 1), bias=False) + (bn): BatchNorm2d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (relu): ReLU(inplace=True) + (quant): QuantStub() + (dequant): DeQuantStub() + ) + + AnnotatedConvBnReLUModel( + (conv): Conv2d(3, 5, kernel_size=(3, 3), stride=(1, 1), bias=False) + (bn): BNReLU2d( + (0): BatchNorm2d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (1): ReLU(inplace=True) + ) + (relu): Identity() + (quant): QuantStub() + (dequant): DeQuantStub() + ) + +In the second fused model output, the first item `bn` in the list is replaced with the fused module, and the rest of the modules (`relu` in this example) is replaced with identity. In addition, the non-fused and fused versions of the model `model.pt` and `model_fused.pt` are generated. + +3. Build the Android benchmark Tool +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Get the PyTorch source and build the Android benchmark tool as follows: + +:: + + git clone --recursive https://github.com/pytorch/pytorch + cd pytorch + git submodule update --init --recursive + BUILD_PYTORCH_MOBILE=1 ANDROID_ABI=arm64-v8a ./scripts/build_android.sh -DBUILD_BINARY=ON + + +This will generate the Android benchmark binary `speed_benchmark_torch` in the `build_android/bin` folder. + +4. Test Compare the Fused and Non-Fused Models +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Connect your Android device, then copy `speed_benchmark_torch` and the model files and run the benchmark tool on them: + +:: + + adb push build_android/bin/speed_benchmark_torch /data/local/tmp + adb push model.pt /data/local/tmp + adb push model_fused.pt /data/local/tmp + adb shell "/data/local/tmp/speed_benchmark_torch --model=/data/local/tmp/model.pt" --input_dims="1,3,224,224" --input_type="float" + adb shell "/data/local/tmp/speed_benchmark_torch --model=/data/local/tmp/model_fused.pt" --input_dims="1,3,224,224" --input_type="float" + + +The results from the last two commands should be like: + +:: + + Main run finished. Microseconds per iter: 6189.07. Iters per second: 161.575 + +and + +:: + + Main run finished. Microseconds per iter: 6216.65. Iters per second: 160.858 + +For this example model, there is no much performance difference between the fused and non-fused models. But the similar steps can be used to fuse and prepare a real deep model and test to see the performance improvement. 
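Independently of the speed comparison, it can also be worth confirming that fusion does not change the model's numerics. The snippet below is an optional check, not part of the steps above; it reuses the ``AnnotatedConvBnReLUModel`` from Step 1 and compares the float model against a fused copy in eval mode (before any quantization is applied, the ``QuantStub`` and ``DeQuantStub`` act as identities):

::

    import copy
    import torch

    # Fusing BatchNorm and ReLU should not change the outputs of the float model.
    m = AnnotatedConvBnReLUModel().eval()
    m_fused = torch.quantization.fuse_modules(copy.deepcopy(m), [['bn', 'relu']], inplace=False)

    x = torch.rand(1, 3, 224, 224)
    with torch.no_grad():
        assert torch.allclose(m(x), m_fused(x), atol=1e-6)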
Keep in mind that currently `torch.quantization.fuse_modules` only fuses the following sequence of modules: + +* conv, bn +* conv, bn, relu +* conv, relu +* linear, relu +* bn, relu + +If any other sequence list is provided to the `fuse_modules` call, it will simply be ignored. + +Learn More +--------------- + +See `here `_ for the official documentation of `torch.quantization.fuse_modules`. diff --git a/recipes_source/mobile_perf.rst b/recipes_source/mobile_perf.rst index e4d432b42..2e7e7c17f 100644 --- a/recipes_source/mobile_perf.rst +++ b/recipes_source/mobile_perf.rst @@ -72,7 +72,7 @@ Code your model: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Do not be confused that fuse_modules is in the quantization package. -It works for all ``torcn.nn.Module``. +It works for all ``torch.nn.Module``. ``torch.quantization.fuse_modules`` fuses a list of modules into a single module. It fuses only the following sequence of modules: @@ -237,7 +237,7 @@ Now we are ready to benchmark your model: :: - adb shell "/data/local/tmp/speed_benchmark_torch --model="/data/local/tmp/model.pt" --input_dims="1,3,224,224" --input_type="float" + adb shell "/data/local/tmp/speed_benchmark_torch --model=/data/local/tmp/model.pt" --input_dims="1,3,224,224" --input_type="float" ----- output ----- Starting benchmark. Running warmup runs. @@ -250,7 +250,7 @@ iOS - Benchmarking Setup For iOS, we'll be using our `TestApp `_ as the benchmarking tool. -To begin with, let's apply the ``optimize_for_mobile`` method to our python script located at `TestApp/benchmark/trace_mode.py `_. Simply modify the code as below. +To begin with, let's apply the ``optimize_for_mobile`` method to our python script located at `TestApp/benchmark/trace_model.py `_. Simply modify the code as below. :: diff --git a/recipes_source/model_preparation_android.rst b/recipes_source/model_preparation_android.rst new file mode 100644 index 000000000..55ef7d973 --- /dev/null +++ b/recipes_source/model_preparation_android.rst @@ -0,0 +1,85 @@ +Model Preparation for Android Recipe +===================================== + +This recipe demonstrates how to prepare a PyTorch MobileNet v2 image classification model for Android apps, and how to set up Android projects to use the mobile-ready model file. + +Introduction +----------------- + +After a PyTorch model is trained or a pre-trained model is made available, it is normally not ready to be used in mobile apps yet. It needs to be quantized (see the `Quantization Recipe `_), converted to TorchScript so Android apps can load it, and optimized for mobile apps. Furthermore, Android apps need to be set up correctly to enable the use of PyTorch Mobile libraries, before they can load and use the model for inference. + +Pre-requisites +----------------- + +PyTorch 1.6.0 or 1.7.0 + +torchvision 0.6.0 or 0.7.0 + +Android Studio 3.5.1 or above with NDK installed + +Steps +----------------- + +1. Get Pretrained and Quantized MobileNet v2 Model +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To get the MobileNet v2 quantized model, simply do: + +:: + + import torchvision + + model_quantized = torchvision.models.quantization.mobilenet_v2(pretrained=True, quantize=True) + +2. 
Script and Optimize the Model for Mobile Apps +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Use either the `script` or `trace` method to convert the quantized model to the TorchScript format: + +:: + + import torch + + dummy_input = torch.rand(1, 3, 224, 224) + torchscript_model = torch.jit.trace(model_quantized, dummy_input) + +or + +:: + + torchscript_model = torch.jit.script(model_quantized) + + +.. warning:: + The `trace` method only scripts the code path executed during the trace, so it will not work properly for models that include decision branches. See the `Script and Optimize for Mobile Recipe `_ for more details. + +Then optimize the TorchScript formatted model for mobile and save it: + +:: + + from torch.utils.mobile_optimizer import optimize_for_mobile + torchscript_model_optimized = optimize_for_mobile(torchscript_model) + torch.jit.save(torchscript_model_optimized, "mobilenetv2_quantized.pt") + +With the total 7 or 8 (depending on if the `script` or `trace` method is called to get the TorchScript format of the model) lines of code in the two steps above, we have a model ready to be added to mobile apps. + +3. Add the Model and PyTorch Library on Android +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* In your current or a new Android Studio project, open the build.gradle file, and add the following two lines (the second one is required only if you plan to use a TorchVision model): + +:: + + implementation 'org.pytorch:pytorch_android:1.6.0' + implementation 'org.pytorch:pytorch_android_torchvision:1.6.0' + +* Drag and drop the model file `mobilenetv2_quantized.pt` to your project's assets folder. + +That's it! Now you can build your Android app with the PyTorch library and the model ready to use. To actually write code to use the model, refer to the PyTorch Mobile `Android Quickstart with a HelloWorld Example `_ and `Android Hackathon Example `_. + +Learn More +----------------- + +1. `PyTorch Mobile site `_ + +2. `Introduction to TorchScript `_ diff --git a/recipes_source/model_preparation_ios.rst b/recipes_source/model_preparation_ios.rst new file mode 100644 index 000000000..2fbacd7fa --- /dev/null +++ b/recipes_source/model_preparation_ios.rst @@ -0,0 +1,95 @@ +Model Preparation for iOS Recipe +===================================== + +This recipe demonstrates how to prepare a PyTorch MobileNet v2 image classification model for iOS apps, and how to set up an iOS project to use the mobile-ready model file. + +Introduction +----------------- + +After a PyTorch model is trained or a pre-trained model is made available, it is normally not ready to be used in mobile apps yet. It needs to be quantized (see `Quantization Recipe `_ for more details), converted to TorchScript so iOS apps can load it and optimized for mobile apps (see `Script and Optimize for Mobile Recipe `_). Furthermore, iOS apps need to be set up correctly to enable the use of PyTorch Mobile libraries, before they can load and use the model for inference. + +Pre-requisites +----------------- + +PyTorch 1.6.0 or 1.7.0 + +torchvision 0.6.0 or 0.7.0 + +Xcode 11 or 12 + +Steps +----------------- + +1. Get Pretrained and Quantized MobileNet v2 Model +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To get the MobileNet v2 quantized model, simply do: + +:: + + import torchvision + + model_quantized = torchvision.models.quantization.mobilenet_v2(pretrained=True, quantize=True) + +2. 
Script and Optimize the Model for Mobile Apps +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Use either the script or trace method to convert the quantized model to the TorchScript format: + +:: + + import torch + + dummy_input = torch.rand(1, 3, 224, 224) + torchscript_model = torch.jit.trace(model_quantized, dummy_input) + +or + +:: + + torchscript_model = torch.jit.script(model_quantized) + +.. warning:: + The `trace` method only scripts the code path executed during the trace, so it will not work properly for models that include decision branches. See the `Script and Optimize for Mobile Recipe `_ for more details. + + +Then optimize the TorchScript formatted model for mobile and save it: + +:: + + from torch.utils.mobile_optimizer import optimize_for_mobile + torchscript_model_optimized = optimize_for_mobile(torchscript_model) + torch.jit.save(torchscript_model_optimized, "mobilenetv2_quantized.pt") + +With the total 7 or 8 (depending on if the script or trace method is called to get the TorchScript format of the model) lines of code in the two steps above, we have a model ready to be added to mobile apps. + +3. Add the Model and PyTorch Library on iOS +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To use the mobile-ready model `mobilenetv2_quantized.pt` in an iOS app, either create a new Xcode project or in your existing Xcode project, then follow the steps below: + +* Open a Mac Terminal, cd to your iOS app's project folder; + +* If your iOS app does not use Cocoapods yet, run `pod init` first to generate the `Podfile` file. + +* Edit `Podfile` either from Xcode or any editor, and add the following line under the target: + +:: + + pod 'LibTorch', '~>1.6.1' + +* Run `pod install` from the Terminal and then open your project's xcworkspace file; + +* Save the two files `TorchModule.h` and `TorchModule.mm` from `here `_ and drag and drop them to your project. If your project is Swift based, a message box with the title "Would you like to configure an Objective-C bridging header?" will show up; click the "Create Bridging Header" button to create a Swift to Objective-c bridging header file, and add `#import "TorchModule.h"` to the header file `-Bridging-Header.h`; + +* Drag and drop the model file `mobilenetv2_quantized.pt` to the project. + +After these steps, you can successfully build and run your Xcode project. To actually write code to use the model, refer to the PyTorch Mobile `iOS Code Walkthrough `_ and two complete ready-to-run sample iOS apps `HelloWorld `_ and `iOS Hackathon Example `_. + + +Learn More +----------------- + +1. `PyTorch Mobile site `_ + +2. `Introduction to TorchScript `_ diff --git a/recipes_source/ptmobile_recipes_summary.rst b/recipes_source/ptmobile_recipes_summary.rst new file mode 100644 index 000000000..cddee940f --- /dev/null +++ b/recipes_source/ptmobile_recipes_summary.rst @@ -0,0 +1,40 @@ +Summary of PyTorch Mobile Recipes +===================================== + +This summary provides a top level overview of recipes for PyTorch Mobile to help developers choose which recipes to follow for their PyTorch-powered mobile app development. + +Introduction +---------------- + +When a PyTorch model is trained or retrained, or when a pre-trained model is available, for mobile deployment, follow the the recipes outlined in this summary so mobile apps can successfully use the model. 
+ +Pre-requisites +---------------- + +PyTorch 1.6.0 or 1.7.0 + +(Optional) torchvision 0.6.0 or 0.7.0 + +For iOS development: Xcode 11 or 12 + +For Android development: Android Studio 3.5.1 or above (with NDK installed); or Android SDK, NDK, Gradle, JDK. + +New Recipes for PyTorch Mobile +-------------------------------- + +* (Recommended) To fuse a list of PyTorch modules into a single module to reduce the model size before quantization, read the `Fuse Modules recipe `_. + +* (Recommended) To reduce the model size and make it run faster without losing much on accuracy, read the `Quantization Recipe `_. + +* (Must) To convert the model to TorchScript and (optionally) optimize it for mobile apps, read the `Script and Optimize for Mobile Recipe `_. + +* (Must for iOS development) To add the model in an iOS project and use the PyTorch pod for iOS, read the `Model Preparation for iOS Recipe `_. + +* (Must for Android development) To add the model in an Android project and use the PyTorch library for Android, read the `Model Preparation for Android Recipe `_. + + +Learn More +----------------- + +1. `PyTorch Mobile site `_ +2. `PyTorch Mobile Performance Recipes `_ diff --git a/recipes_source/quantization.rst b/recipes_source/quantization.rst new file mode 100644 index 000000000..7c04fbdb0 --- /dev/null +++ b/recipes_source/quantization.rst @@ -0,0 +1,135 @@ +Quantization Recipe +===================================== + +This recipe demonstrates how to quantize a PyTorch model so it can run with reduced size and faster inference speed with about the same accuracy as the original model. Quantization can be applied to both server and mobile model deployment, but it can be especially important or even critical on mobile, because a non-quantized model's size may exceed the limit that an iOS or Android app allows for, cause the deployment or OTA update to take too much time, and make the inference too slow for a good user experience. + +Introduction +------------ + +Quantization is a technique that converts 32-bit floating point numbers in the model parameters to 8-bit integers. With quantization, the model size and memory footprint can be reduced to about 1/4 of the original, and the inference can be made about 2-4 times faster, while the accuracy stays about the same. + +Overall, there are three approaches or workflows to quantize a model: post training dynamic quantization, post training static quantization, and quantization aware training. But if the model you want to use already has a quantized version, you can use it directly without going through any of the three workflows above. For example, the `torchvision` library already includes quantized versions of models such as MobileNet v2, ResNet 18, ResNet 50, Inception v3, and GoogleNet, among others. So we will treat using a pre-quantized model as a fourth workflow, albeit a simple one. + +.. note:: + The quantization support is available for a limited set of operators. See `this `_ for more information. + +Pre-requisites +----------------- + +PyTorch 1.6.0 or 1.7.0 + +torchvision 0.6.0 or 0.7.0 + +Workflows +------------ + +Use one of the four workflows below to quantize a model. + +1. 
Use Pretrained Quantized MobileNet v2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To get the MobileNet v2 quantized model, simply do: + +:: + + import torchvision + model_quantized = torchvision.models.quantization.mobilenet_v2(pretrained=True, quantize=True) + + +To compare the size difference of a non-quantized MobileNet v2 model with its quantized version: + +:: + + model = torchvision.models.mobilenet_v2(pretrained=True) + + import os + import torch + + def print_model_size(mdl): + torch.save(mdl.state_dict(), "tmp.pt") + print("%.2f MB" %(os.path.getsize("tmp.pt")/1e6)) + os.remove('tmp.pt') + + print_model_size(model) + print_model_size(model_quantized) + + +The outputs will be: + +:: + + 14.27 MB + 3.63 MB + +2. Post Training Dynamic Quantization +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To apply Dynamic Quantization, which converts all the weights in a model from 32-bit floating numbers to 8-bit integers but doesn't convert the activations to int8 till just before performing the computation on the activations, simply call `torch.quantization.quantize_dynamic`: + +:: + + model_dynamic_quantized = torch.quantization.quantize_dynamic( + model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8 + ) + +where `qconfig_spec` specifies the list of submodule names in `model` to apply quantization to. + +.. warning:: An important limitation of Dynamic Quantization, while it is the easiest workflow if you do not have a pre-trained quantized model ready for use, is that it currently only supports `nn.Linear` and `nn.LSTM` in `qconfig_spec`, meaning that you will have to use Static Quantization or Quantization Aware Training, to be discussed later, to quantize other modules such as `nn.Conv2d`. + +The full documentation of the `quantize_dynamic` API call is `here `_. Three other examples of using the post training dynamic quantization are `the Bert example `_, `an LSTM model example `_, and another `demo LSTM example `_. + +3. Post Training Static Quantization +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This method converts both the weights and the activations to 8-bit integers beforehand so there won't be on-the-fly conversion on the activations during the inference, as the dynamic quantization does, hence improving the performance significantly. + +To apply static quantization on a model, run the following code: + +:: + + backend = "qnnpack" + model.qconfig = torch.quantization.get_default_qconfig(backend) + torch.backends.quantized.engine = backend + model_static_quantized = torch.quantization.prepare(model, inplace=False) + model_static_quantized = torch.quantization.convert(model_static_quantized, inplace=False) + +After this, running `print_model_size(model_static_quantized)` shows the static quantized model is `3.98MB`. + +A complete model definition and static quantization example is `here `_. A dedicated static quantization tutorial is `here `_. + +.. note:: + To make the model run on mobile devices which normally have arm architecture, you need to use `qnnpack` for `backend`; to run the model on computer with x86 architecture, use `fbgemm`. + +4. Quantization Aware Training +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Quantization aware training inserts fake quantization to all the weights and activations during the model training process and results in higher inference accuracy than the post-training quantization methods. It is typically used in CNN models. 
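+
+As a rough sketch of what such a model definition can look like (the class name and layers here are illustrative assumptions, not a model from this recipe), the computation is wrapped between a quantization stub at the input and a dequantization stub at the output, as described next:
+
+::
+
+    import torch
+
+    class QATReadyModel(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.quant = torch.quantization.QuantStub()      # converts float tensors to quantized at the input
+            self.conv = torch.nn.Conv2d(3, 16, 3)
+            self.relu = torch.nn.ReLU()
+            self.dequant = torch.quantization.DeQuantStub()  # converts quantized tensors back to float at the output
+
+        def forward(self, x):
+            x = self.quant(x)
+            x = self.relu(self.conv(x))
+            return self.dequant(x)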
+ +To enable a model for quantization aware training, define in the `__init__` method of the model definition a `QuantStub` and a `DeQuantStub` to convert tensors from floating point to quantized type and vice versa: + +:: + + self.quant = torch.quantization.QuantStub() + self.dequant = torch.quantization.DeQuantStub() + +Then, at the beginning and the end of the `forward` method of the model definition, call `x = self.quant(x)` and `x = self.dequant(x)`. + +To run quantization aware training, use the following code snippet: + +:: + + model.qconfig = torch.quantization.get_default_qat_qconfig(backend) + model_qat = torch.quantization.prepare_qat(model, inplace=False) + # quantization aware training goes here + model_qat = torch.quantization.convert(model_qat.eval(), inplace=False) + +For more detailed examples of quantization aware training, see `here `_ and `here `_. + +A pre-trained quantized model can also be used for quantization aware transfer learning, using the same `quant` and `dequant` calls shown above. See `here `_ for a complete example. + +After a quantized model is generated using one of the workflows above, it needs to be further converted to the `TorchScript` format and then optimized for mobile apps before it can run on mobile devices. See the `Script and Optimize for Mobile recipe `_ for details. + +Learn More +----------------- + +For more information on the different workflows of quantization, see `here `_ and `here `_. diff --git a/recipes_source/recipes/README.txt b/recipes_source/recipes/README.txt index f93ee92c2..a182b0a11 100644 --- a/recipes_source/recipes/README.txt +++ b/recipes_source/recipes/README.txt @@ -56,3 +56,7 @@ PyTorch Recipes 14. mobile_perf.py PyTorch Mobile Performance Recipes https://pytorch.org/tutorials/recipes/mobile_perf.html + +15. amp_recipe.py + Automatic Mixed Precision + https://pytorch.org/tutorials/recipes/amp_recipe.html diff --git a/recipes_source/recipes/amp_recipe.py b/recipes_source/recipes/amp_recipe.py new file mode 100644 index 000000000..c1ec52a38 --- /dev/null +++ b/recipes_source/recipes/amp_recipe.py @@ -0,0 +1,325 @@ +# -*- coding: utf-8 -*- +""" +Automatic Mixed Precision +************************* +**Author**: `Michael Carilli `_ + +`torch.cuda.amp `_ provides convenience methods for mixed precision, +where some operations use the ``torch.float32`` (``float``) datatype and other operations +use ``torch.float16`` (``half``). Some ops, like linear layers and convolutions, +are much faster in ``float16``. Other ops, like reductions, often require the dynamic +range of ``float32``. Mixed precision tries to match each op to its appropriate datatype, +which can reduce your network's runtime and memory footprint. + +Ordinarily, "automatic mixed precision training" uses `torch.cuda.amp.autocast `_ and +`torch.cuda.amp.GradScaler `_ together. + +This recipe measures the performance of a simple network in default precision, +then walks through adding ``autocast`` and ``GradScaler`` to run the same network in +mixed precision with improved performance. + +You may download and run this recipe as a standalone Python script. +The only requirements are PyTorch 1.6+ and a CUDA-capable GPU. + +Mixed precision primarily benefits Tensor Core-enabled architectures (Volta, Turing, Ampere). +This recipe should show significant (2-3X) speedup on those architectures. +On earlier architectures (Kepler, Maxwell, Pascal), you may observe a modest speedup. +Run ``nvidia-smi`` to display your GPU's architecture.
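+
+If you prefer to check from Python instead of ``nvidia-smi``, one rough approach (a sketch, assuming a CUDA device is available) is to inspect the device's compute capability; 7.0 (Volta) or higher generally indicates Tensor Core support::
+
+    import torch
+    major, minor = torch.cuda.get_device_capability()
+    print(f"Compute capability: {major}.{minor}")  # 7.0 or higher -> Tensor Core-enabled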
+""" + +import torch, time, gc + +# Timing utilities +start_time = None + +def start_timer(): + global start_time + gc.collect() + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + torch.cuda.synchronize() + start_time = time.time() + +def end_timer_and_print(local_msg): + torch.cuda.synchronize() + end_time = time.time() + print("\n" + local_msg) + print("Total execution time = {:.3f} sec".format(end_time - start_time)) + print("Max memory used by tensors = {} bytes".format(torch.cuda.max_memory_allocated())) + +########################################################## +# A simple network +# ---------------- +# The following sequence of linear layers and ReLUs should show a speedup with mixed precision. + +def make_model(in_size, out_size, num_layers): + layers = [] + for _ in range(num_layers - 1): + layers.append(torch.nn.Linear(in_size, in_size)) + layers.append(torch.nn.ReLU()) + layers.append(torch.nn.Linear(in_size, out_size)) + return torch.nn.Sequential(*tuple(layers)).cuda() + +########################################################## +# ``batch_size``, ``in_size``, ``out_size``, and ``num_layers`` are chosen to be large enough to saturate the GPU with work. +# Typically, mixed precision provides the greatest speedup when the GPU is saturated. +# Small networks may be CPU bound, in which case mixed precision won't improve performance. +# Sizes are also chosen such that linear layers' participating dimensions are multiples of 8, +# to permit Tensor Core usage on Tensor Core-capable GPUs (see :ref:`Troubleshooting` below). +# +# Exercise: Vary participating sizes and see how the mixed precision speedup changes. + +batch_size = 512 # Try, for example, 128, 256, 513. +in_size = 4096 +out_size = 4096 +num_layers = 3 +num_batches = 50 +epochs = 3 + +# Creates data in default precision. +# The same data is used for both default and mixed precision trials below. +# You don't need to manually change inputs' dtype when enabling mixed precision. +data = [torch.randn(batch_size, in_size, device="cuda") for _ in range(num_batches)] +targets = [torch.randn(batch_size, out_size, device="cuda") for _ in range(num_batches)] + +loss_fn = torch.nn.MSELoss().cuda() + +########################################################## +# Default Precision +# ----------------- +# Without ``torch.cuda.amp``, the following simple network executes all ops in default precision (``torch.float32``): + +net = make_model(in_size, out_size, num_layers) +opt = torch.optim.SGD(net.parameters(), lr=0.001) + +start_timer() +for epoch in range(epochs): + for input, target in zip(data, targets): + output = net(input) + loss = loss_fn(output, target) + loss.backward() + opt.step() + opt.zero_grad() # set_to_none=True here can modestly improve performance +end_timer_and_print("Default precision:") + +########################################################## +# Adding autocast +# --------------- +# Instances of `torch.cuda.amp.autocast `_ +# serve as context managers that allow regions of your script to run in mixed precision. +# +# In these regions, CUDA ops run in a dtype chosen by autocast +# to improve performance while maintaining accuracy. +# See the `Autocast Op Reference `_ +# for details on what precision autocast chooses for each op, and under what circumstances. + +for epoch in range(0): # 0 epochs, this section is for illustration only + for input, target in zip(data, targets): + # Runs the forward pass under autocast. 
+ with torch.cuda.amp.autocast(): + output = net(input) + # output is float16 because linear layers autocast to float16. + assert output.dtype is torch.float16 + + loss = loss_fn(output, target) + # loss is float32 because mse_loss layers autocast to float32. + assert loss.dtype is torch.float32 + + # Exits autocast before backward(). + # Backward passes under autocast are not recommended. + # Backward ops run in the same dtype autocast chose for corresponding forward ops. + loss.backward() + opt.step() + opt.zero_grad() # set_to_none=True here can modestly improve performance + +########################################################## +# Adding GradScaler +# ----------------- +# `Gradient scaling `_ +# helps prevent gradients with small magnitudes from flushing to zero +# ("underflowing") when training with mixed precision. +# +# `torch.cuda.amp.GradScaler `_ +# performs the steps of gradient scaling conveniently. + +# Constructs scaler once, at the beginning of the convergence run, using default args. +# If your network fails to converge with default GradScaler args, please file an issue. +# The same GradScaler instance should be used for the entire convergence run. +# If you perform multiple convergence runs in the same script, each run should use +# a dedicated fresh GradScaler instance. GradScaler instances are lightweight. +scaler = torch.cuda.amp.GradScaler() + +for epoch in range(0): # 0 epochs, this section is for illustration only + for input, target in zip(data, targets): + with torch.cuda.amp.autocast(): + output = net(input) + loss = loss_fn(output, target) + + # Scales loss. Calls backward() on scaled loss to create scaled gradients. + scaler.scale(loss).backward() + + # scaler.step() first unscales the gradients of the optimizer's assigned params. + # If these gradients do not contain infs or NaNs, optimizer.step() is then called, + # otherwise, optimizer.step() is skipped. + scaler.step(opt) + + # Updates the scale for next iteration. + scaler.update() + + opt.zero_grad() # set_to_none=True here can modestly improve performance + +########################################################## +# All together: "Automatic Mixed Precision" +# ------------------------------------------ +# (The following also demonstrates ``enabled``, an optional convenience argument to ``autocast`` and ``GradScaler``. +# If False, ``autocast`` and ``GradScaler``\ 's calls become no-ops. +# This allows switching between default precision and mixed precision without if/else statements.) + +use_amp = True + +net = make_model(in_size, out_size, num_layers) +opt = torch.optim.SGD(net.parameters(), lr=0.001) +scaler = torch.cuda.amp.GradScaler(enabled=use_amp) + +start_timer() +for epoch in range(epochs): + for input, target in zip(data, targets): + with torch.cuda.amp.autocast(enabled=use_amp): + output = net(input) + loss = loss_fn(output, target) + scaler.scale(loss).backward() + scaler.step(opt) + scaler.update() + opt.zero_grad() # set_to_none=True here can modestly improve performance +end_timer_and_print("Mixed precision:") + +########################################################## +# Inspecting/modifying gradients (e.g., clipping) +# -------------------------------------------------------- +# All gradients produced by ``scaler.scale(loss).backward()`` are scaled. If you wish to modify or inspect +# the parameters' ``.grad`` attributes between ``backward()`` and ``scaler.step(optimizer)``, you should +# unscale them first using `scaler.unscale_(optimizer) `_. 
+ +for epoch in range(0): # 0 epochs, this section is for illustration only + for input, target in zip(data, targets): + with torch.cuda.amp.autocast(): + output = net(input) + loss = loss_fn(output, target) + scaler.scale(loss).backward() + + # Unscales the gradients of optimizer's assigned params in-place + scaler.unscale_(opt) + + # Since the gradients of optimizer's assigned params are now unscaled, clips as usual. + # You may use the same value for max_norm here as you would without gradient scaling. + torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=0.1) + + scaler.step(opt) + scaler.update() + opt.zero_grad() # set_to_none=True here can modestly improve performance + +########################################################## +# Saving/Resuming +# ---------------- +# To save/resume Amp-enabled runs with bitwise accuracy, use +# `scaler.state_dict `_ and +# `scaler.load_state_dict `_. +# +# When saving, save the scaler state dict alongside the usual model and optimizer state dicts. +# Do this either at the beginning of an iteration before any forward passes, or at the end of +# an iteration after ``scaler.update()``. + +checkpoint = {"model": net.state_dict(), + "optimizer": opt.state_dict(), + "scaler": scaler.state_dict()} +# Write checkpoint as desired, e.g., +# torch.save(checkpoint, "filename") + +########################################################## +# When resuming, load the scaler state dict alongside the model and optimizer state dicts. + +# Read checkpoint as desired, e.g., +# dev = torch.cuda.current_device() +# checkpoint = torch.load("filename", +# map_location = lambda storage, loc: storage.cuda(dev)) +net.load_state_dict(checkpoint["model"]) +opt.load_state_dict(checkpoint["optimizer"]) +scaler.load_state_dict(checkpoint["scaler"]) + +########################################################## +# If a checkpoint was created from a run *without* Amp, and you want to resume training *with* Amp, +# load model and optimizer states from the checkpoint as usual. The checkpoint won't contain a saved scaler state, so +# use a fresh instance of ``GradScaler``. +# +# If a checkpoint was created from a run *with* Amp and you want to resume training *without* Amp, +# load model and optimizer states from the checkpoint as usual, and ignore the saved scaler state. + +########################################################## +# Inference/Evaluation +# -------------------- +# ``autocast`` may be used by itself to wrap inference or evaluation forward passes. ``GradScaler`` is not necessary. + +########################################################## +# .. _advanced-topics: +# +# Advanced topics +# --------------- +# See the `Automatic Mixed Precision Examples `_ for advanced use cases including: +# +# * Gradient accumulation +# * Gradient penalty/double backward +# * Networks with multiple models, optimizers, or losses +# * Multiple GPUs (``torch.nn.DataParallel`` or ``torch.nn.parallel.DistributedDataParallel``) +# * Custom autograd functions (subclasses of ``torch.autograd.Function``) +# +# If you perform multiple convergence runs in the same script, each run should use +# a dedicated fresh GradScaler instance. GradScaler instances are lightweight. +# +# If you're registering a custom C++ op with the dispatcher, see the +# `autocast section `_ +# of the dispatcher tutorial. + +########################################################## +# .. _troubleshooting: +# +# Troubleshooting +# --------------- +# Speedup with Amp is minor +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. 
Your network may fail to saturate the GPU(s) with work, and is therefore CPU bound. Amp's effect on GPU performance +# won't matter. +# +# * A rough rule of thumb to saturate the GPU is to increase batch and/or network size(s) +# as much as you can without running OOM. +# * Try to avoid excessive CPU-GPU synchronization (``.item()`` calls, or printing values from CUDA tensors). +# * Try to avoid sequences of many small CUDA ops (coalesce these into a few large CUDA ops if you can). +# 2. Your network may be GPU compute bound (lots of matmuls/convolutions) but your GPU does not have Tensor Cores. +# In this case a reduced speedup is expected. +# 3. Matmul dimensions are not Tensor Core-friendly. Make sure matmuls' participating sizes are multiples of 8. +# (For NLP models with encoders/decoders, this can be subtle. Also, convolutions used to have similar size constraints +# for Tensor Core use, but for CuDNN versions 7.3 and later, no such constraints exist. See +# `here `_ for guidance.) +# +# Loss is inf/NaN +# ~~~~~~~~~~~~~~~ +# First, check if your network fits an :ref:`advanced use case`. +# See also `Prefer binary_cross_entropy_with_logits over binary_cross_entropy `_. +# +# If you're confident your Amp usage is correct, you may need to file an issue, but before doing so, it's helpful to gather the following information: +# +# 1. Disable ``autocast`` or ``GradScaler`` individually (by passing ``enabled=False`` to their constructor) and see if infs/NaNs persist. +# 2. If you suspect part of your network (e.g., a complicated loss function) overflows , run that forward region in ``float32`` +# and see if infs/NaNs persist. +# `The autocast docstring `_'s last code snippet +# shows forcing a subregion to run in ``float32`` (by locally disabling autocast and casting the subregion's inputs). +# +# Type mismatch error (may manifest as CUDNN_STATUS_BAD_PARAM) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Autocast tries to cover all ops that benefit from or require casting. +# `Ops that receive explicit coverage `_ +# are chosen based on numerical properties, but also on experience. +# If you see a type mismatch error in an autocast-enabled forward region or a backward pass following that region, +# it's possible autocast missed an op. +# +# Please file an issue with the error backtrace. ``export TORCH_SHOW_CPP_STACKTRACES=1`` before running your script to provide +# fine-grained information on which backend op is failing. diff --git a/recipes_source/recipes/benchmark.py b/recipes_source/recipes/benchmark.py new file mode 100644 index 000000000..bd2e28e96 --- /dev/null +++ b/recipes_source/recipes/benchmark.py @@ -0,0 +1,892 @@ +""" +PyTorch Benchmark +==================================== +This recipe provides a quick-start guide to using PyTorch +``benchmark`` module to measure and compare code performance. + +Introduction +------------ +Benchmarking is an important step in writing code. It helps +us validate that our code meets performance expectations, +compare different approaches to solving the same problem and +prevent performance regressions. + +There are many options when it comes to benchmarking PyTorch code +including the Python builtin ``timeit`` module. However, benchmarking +PyTorch code has many caveats that can be easily overlooked such as +managing the number of threads and synchronizing CUDA devices. Moreover, +generating Tensor inputs for benchmarking can be quite tedious. 
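+
+For example, a hand-rolled timing loop (a sketch of the pitfall, assuming a CUDA device) can easily measure only the kernel launch rather than the actual GPU work unless the device is synchronized first::
+
+    import time
+    import torch
+
+    x = torch.randn(1000, 1000, device='cuda')
+    start = time.time()
+    y = x @ x                   # the matmul is queued asynchronously on the GPU
+    torch.cuda.synchronize()    # without this, the measured time is misleading
+    print(f"{(time.time() - start) * 1e6:.1f} us")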
+ +This recipe demonstrates how to use PyTorch ``benchmark`` module to avoid +common mistakes while making it easier to compare performance of +different code, generate input for benchmarking and more. + +Setup +----- +Before we begin, install ``torch`` if it isn’t already available. + +:: + + pip install torch + +""" + + +###################################################################### +# Steps +# ----- +# +# 1. Defining functions to benchmark +# 2. Benchmarking with ``timeit.Timer`` +# 3. Benchmarking with ``torch.utils.benchmark.Timer`` +# 4. Benchmarking with `Blocked Autorange` +# 5. Comparing benchmark results +# 6. Saving/Loading benchmark results +# 7. Generating inputs with `Fuzzed Parameters` +# 8. Collecting instruction counts with `Callgrind` +# +# 1. Defining functions to benchmark +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# As of the time of this writing, `torch.dot `__ +# does not support batched mode, so we will compare two approaches to +# implementing it using existing ``torch`` operators: one approach uses a +# combination of ``mul`` and ``sum`` while the other reduces the problem to ``bmm``. +# + +import torch + + +def batched_dot_mul_sum(a, b): + '''Computes batched dot by multiplying and summing''' + return a.mul(b).sum(-1) + + +def batched_dot_bmm(a, b): + '''Computes batched dot by reducing to bmm''' + a = a.reshape(-1, 1, a.shape[-1]) + b = b.reshape(-1, b.shape[-1], 1) + return torch.bmm(a, b).flatten(-3) + + +# Input for benchmarking +x = torch.randn(10000, 64) + +# Ensure that both functions compute the same output +assert batched_dot_mul_sum(x, x).allclose(batched_dot_bmm(x, x)) + + +###################################################################### +# 2. Benchmarking with ``timeit.Timer`` +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# First, let's benchmark the code using Python's builtin ``timeit`` module. +# We keep the benchmark code simple here so we can compare the defaults +# of ``timeit`` and ``torch.utils.benchmark``. +# + +import timeit + +t0 = timeit.Timer( + stmt='batched_dot_mul_sum(x, x)', + setup='from __main__ import batched_dot_mul_sum', + globals={'x': x}) + +t1 = timeit.Timer( + stmt='batched_dot_bmm(x, x)', + setup='from __main__ import batched_dot_bmm', + globals={'x': x}) + +print(f'mul_sum(x, x): {t0.timeit(100) / 100 * 1e6:>5.1f} us') +print(f'bmm(x, x): {t1.timeit(100) / 100 * 1e6:>5.1f} us') + +###################################################################### +# .. code-block:: none +# :caption: Output +# +# mul_sum(x, x): 111.6 us +# bmm(x, x): 70.0 us +# + + +###################################################################### +# 3. Benchmarking with ``torch.utils.benchmark.Timer`` +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# PyTorch ``benchmark`` module was designed to be familiar to those who +# have used the ``timeit`` module before. However, its defaults make it +# easier and safer to use for benchmarking PyTorch code. Let's first +# compare the same basic API as above. +# + +import torch.utils.benchmark as benchmark + +t0 = benchmark.Timer( + stmt='batched_dot_mul_sum(x, x)', + setup='from __main__ import batched_dot_mul_sum', + globals={'x': x}) + +t1 = benchmark.Timer( + stmt='batched_dot_bmm(x, x)', + setup='from __main__ import batched_dot_bmm', + globals={'x': x}) + +print(t0.timeit(100)) +print(t1.timeit(100)) + +###################################################################### +# .. 
code-block:: none +# :caption: Output +# +# +# batched_dot_mul_sum(x, x) +# setup: from __main__ import batched_dot_mul_sum +# 379.29 us +# 1 measurement, 100 runs , 1 thread +# +# batched_dot_bmm(x, x) +# setup: from __main__ import batched_dot_bmm +# 716.42 us +# 1 measurement, 100 runs , 1 thread +# + +###################################################################### +# Even though the APIs are the same for the basic functionality, there +# are some important differences. ``benchmark.Timer.timeit()`` returns the +# time per run as opposed to the total runtime like ``timeit.Timer.timeit()`` +# does. PyTorch ``benchmark`` module also provides formatted string +# representations for printing the results. +# +# Another important difference, and the reason why the results diverge +# is that PyTorch benchmark module runs in a single thread by default. +# We can change the number of threads with the num_threads arg. +# +# ``torch.utils.benchmark.Timer`` takes several additional arguments +# including: `label`, `sub_label`, `description` and `env` which change +# the ``__repr__`` of the measurement object returned and are used for +# grouping the results (more on this later). +# + +num_threads = torch.get_num_threads() +print(f'Benchmarking on {num_threads} threads') + +t0 = benchmark.Timer( + stmt='batched_dot_mul_sum(x, x)', + setup='from __main__ import batched_dot_mul_sum', + globals={'x': x}, + num_threads=num_threads, + label='Multithreaded batch dot', + sub_label='Implemented using mul and sum') + +t1 = benchmark.Timer( + stmt='batched_dot_bmm(x, x)', + setup='from __main__ import batched_dot_bmm', + globals={'x': x}, + num_threads=num_threads, + label='Multithreaded batch dot', + sub_label='Implemented using bmm') + +print(t0.timeit(100)) +print(t1.timeit(100)) + +###################################################################### +# .. code-block:: none +# :caption: Output +# +# Benchmarking on 40 threads +# +# Multithreaded batch dot: Implemented using mul and sum +# setup: from __main__ import batched_dot_mul_sum +# 118.47 us +# 1 measurement, 100 runs , 40 threads +# +# Multithreaded batch dot: Implemented using bmm +# setup: from __main__ import batched_dot_bmm +# 68.21 us +# 1 measurement, 100 runs , 40 threads + +###################################################################### +# Running ``benchmark`` with all threads available gives similar results +# as the ``timeit`` module. More importantly, which version is faster +# depends on how many threads we run the code with. This is why it's +# important to benchmark the code with thread settings that are +# representative of real use cases. Another important thing to remember +# is to synchronize CPU and CUDA when benchmarking on the GPU. Let's run +# the above benchmarks again on a CUDA tensor and see what happens. +# + +x = torch.randn(10000, 1024, device='cuda') + +t0 = timeit.Timer( + stmt='batched_dot_mul_sum(x, x)', + setup='from __main__ import batched_dot_mul_sum', + globals={'x': x}) + +t1 = timeit.Timer( + stmt='batched_dot_bmm(x, x)', + setup='from __main__ import batched_dot_bmm', + globals={'x': x}) + +# Ran each twice to show difference before/after warmup +print(f'mul_sum(x, x): {t0.timeit(100) / 100 * 1e6:>5.1f} us') +print(f'mul_sum(x, x): {t0.timeit(100) / 100 * 1e6:>5.1f} us') +print(f'bmm(x, x): {t1.timeit(100) / 100 * 1e6:>5.1f} us') +print(f'bmm(x, x): {t1.timeit(100) / 100 * 1e6:>5.1f} us') + +###################################################################### +# .. 
code-block:: none +# :caption: Output +# +# mul_sum(x, x): 27.6 us +# mul_sum(x, x): 25.3 us +# bmm(x, x): 2775.5 us +# bmm(x, x): 22.4 us +# + +t0 = benchmark.Timer( + stmt='batched_dot_mul_sum(x, x)', + setup='from __main__ import batched_dot_mul_sum', + globals={'x': x}) + +t1 = benchmark.Timer( + stmt='batched_dot_bmm(x, x)', + setup='from __main__ import batched_dot_bmm', + globals={'x': x}) + +# Run only once since benchmark module does warmup for us +print(t0.timeit(100)) +print(t1.timeit(100)) + +###################################################################### +# .. code-block:: none +# :caption: Output +# +# +# batched_dot_mul_sum(x, x) +# setup: from __main__ import batched_dot_mul_sum +# 232.93 us +# 1 measurement, 100 runs , 1 thread +# +# batched_dot_bmm(x, x) +# setup: from __main__ import batched_dot_bmm +# 181.04 us +# 1 measurement, 100 runs , 1 thread +# + +###################################################################### +# The results reveal something interesting. The first run of the ``bmm`` +# version using the ``timeit`` module takes much longer than the second +# run. This is because ``bmm`` calls into `cuBLAS` which needs to be +# loaded the first time it's called which takes some time. This is why +# it's important to do a warmup run before benchmarking, luckily for +# us, PyTorch's ``benchmark`` module takes care of that. +# +# The difference in the results between ``timeit`` and ``benchmark`` modules +# is because the `timeit` module is not synchronizing CUDA and is thus only +# timing the time to launch the kernel. PyTorch's ``benchmark`` module does +# the synchronization for us. + + +###################################################################### +# 4. Benchmarking with `Blocked Autorange` +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# While ``timeit.Timer.autorange`` takes a single continuous measurement +# of at least 0.2 seconds, `torch.utils.benchmark.blocked_autorange` +# takes many measurements whose times total at least 0.2 seconds (which +# can be changed by the `min_run_time` parameter) subject to the constraint +# that timing overhead is a small fraction of the overall measurement. +# This is accomplished by first running with an increasing number of runs +# per loop until the runtime is much larger than measurement overhead +# (which also serves as a warm up), and then taking measurements until +# the target time is reached. This has the useful properties that it wastes +# less data and allows us to compute statistics to estimate the reliability +# of the measurements. +# + +m0 = t0.blocked_autorange() +m1 = t1.blocked_autorange() + +print(m0) +print(m1) + +###################################################################### +# .. code-block:: none +# :caption: Output +# +# +# batched_dot_mul_sum(x, x) +# setup: from __main__ import batched_dot_mul_sum +# 231.79 us +# 1 measurement, 1000 runs , 1 thread +# +# batched_dot_bmm(x, x) +# setup: from __main__ import batched_dot_bmm +# Median: 162.08 us +# 2 measurements, 1000 runs per measurement, 1 thread +# + +###################################################################### +# We can also inspect the individual statistics from the returned +# measurements object. + +print(f"Mean: {m0.mean * 1e6:6.2f} us") +print(f"Median: {m0.median * 1e6:6.2f} us") + +###################################################################### +# .. 
code-block:: none +# :caption: Output +# +# Mean: 231.79 us +# Median: 231.79 us +# + +###################################################################### +# 5. Comparing benchmark results +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# So far we've been comparing our two versions of batched dot against a +# single input. In practice, we want to try a combination of inputs as +# well as different number of threads. The ``Compare`` class helps display +# the results of many measurements in a formatted table. It uses the +# annotations described above (`label`, `sub_label`, `num_threads`, etc.) as +# well as `description` to group and organize the table. Let's use +# ``Compare`` to see how our functions perform for different input sizes +# and number of threads. +# + +from itertools import product + +# Compare takes a list of measurements which we'll save in results. +results = [] + +sizes = [1, 64, 1024, 10000] +for b, n in product(sizes, sizes): + # label and sub_label are the rows + # description is the column + label = 'Batched dot' + sub_label = f'[{b}, {n}]' + x = torch.ones((b, n)) + for num_threads in [1, 4, 16, 32]: + results.append(benchmark.Timer( + stmt='batched_dot_mul_sum(x, x)', + setup='from __main__ import batched_dot_mul_sum', + globals={'x': x}, + num_threads=num_threads, + label=label, + sub_label=sub_label, + description='mul/sum', + ).blocked_autorange(min_run_time=1)) + results.append(benchmark.Timer( + stmt='batched_dot_bmm(x, x)', + setup='from __main__ import batched_dot_bmm', + globals={'x': x}, + num_threads=num_threads, + label=label, + sub_label=sub_label, + description='bmm', + ).blocked_autorange(min_run_time=1)) + +compare = benchmark.Compare(results) +compare.print() + +###################################################################### +# .. 
code-block:: none +# :caption: Output +# +# [--------------- Batched dot ----------------] +# | mul/sum | bmm +# 1 threads: ----------------------------------- +# [1, 1] | 5.9 | 11.2 +# [1, 64] | 6.4 | 11.4 +# [1, 1024] | 6.7 | 14.2 +# [1, 10000] | 10.2 | 23.7 +# [64, 1] | 6.3 | 11.5 +# [64, 64] | 8.6 | 15.4 +# [64, 1024] | 39.4 | 204.4 +# [64, 10000] | 274.9 | 748.5 +# [1024, 1] | 7.7 | 17.8 +# [1024, 64] | 40.3 | 76.4 +# [1024, 1024] | 432.4 | 2795.9 +# [1024, 10000] | 22657.3 | 11899.5 +# [10000, 1] | 16.9 | 74.8 +# [10000, 64] | 300.3 | 609.4 +# [10000, 1024] | 23098.6 | 27246.1 +# [10000, 10000] | 267073.7 | 118823.7 +# 4 threads: ----------------------------------- +# [1, 1] | 6.0 | 11.5 +# [1, 64] | 6.2 | 11.2 +# [1, 1024] | 6.8 | 14.3 +# [1, 10000] | 10.2 | 23.7 +# [64, 1] | 6.3 | 16.2 +# [64, 64] | 8.8 | 18.2 +# [64, 1024] | 41.5 | 189.1 +# [64, 10000] | 91.7 | 849.1 +# [1024, 1] | 7.6 | 17.4 +# [1024, 64] | 43.5 | 33.5 +# [1024, 1024] | 135.4 | 2782.3 +# [1024, 10000] | 7471.1 | 11874.0 +# [10000, 1] | 16.8 | 33.9 +# [10000, 64] | 118.7 | 173.2 +# [10000, 1024] | 7264.6 | 27824.7 +# [10000, 10000] | 100060.9 | 121499.0 +# 16 threads: ---------------------------------- +# [1, 1] | 6.0 | 11.3 +# [1, 64] | 6.2 | 11.2 +# [1, 1024] | 6.9 | 14.2 +# [1, 10000] | 10.3 | 23.8 +# [64, 1] | 6.4 | 24.1 +# [64, 64] | 9.0 | 23.8 +# [64, 1024] | 54.1 | 188.5 +# [64, 10000] | 49.9 | 748.0 +# [1024, 1] | 7.6 | 23.4 +# [1024, 64] | 55.5 | 28.2 +# [1024, 1024] | 66.9 | 2773.9 +# [1024, 10000] | 6111.5 | 12833.7 +# [10000, 1] | 16.9 | 27.5 +# [10000, 64] | 59.5 | 73.7 +# [10000, 1024] | 6295.9 | 27062.0 +# [10000, 10000] | 71804.5 | 120365.8 +# 32 threads: ---------------------------------- +# [1, 1] | 5.9 | 11.3 +# [1, 64] | 6.2 | 11.3 +# [1, 1024] | 6.7 | 14.2 +# [1, 10000] | 10.5 | 23.8 +# [64, 1] | 6.3 | 31.7 +# [64, 64] | 9.1 | 30.4 +# [64, 1024] | 72.0 | 190.4 +# [64, 10000] | 103.1 | 746.9 +# [1024, 1] | 7.6 | 28.4 +# [1024, 64] | 70.5 | 31.9 +# [1024, 1024] | 65.6 | 2804.6 +# [1024, 10000] | 6764.0 | 11871.4 +# [10000, 1] | 17.8 | 31.8 +# [10000, 64] | 110.3 | 56.0 +# [10000, 1024] | 6640.2 | 27592.2 +# [10000, 10000] | 73003.4 | 120083.2 +# +# Times are in microseconds (us). +# + +###################################################################### +# The results above indicate that the version which reduces to bmm +# is better for larger tensors running on multiple threads, while for +# smaller and/or single thread code, the other version is better. +# +# ``Compare`` also provides functions for changing the table format +# + +compare.trim_significant_figures() +compare.colorize() +compare.print() + + +###################################################################### +# 6. Saving/Loading benchmark results +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# `Measurements` (and `CallgrindStats` which are described in section 8) +# are pickleable. This makes A/B testing easy, as you can collect +# measurements from two separate environments, pickle them, and then +# load both in a single environment. Timer even takes an `env` +# constructor argument so that such A/B testing works seamlessly. +# +# Let's imagine that rather than two Python functions, the add/sum +# and bmm approaches were in two different builds of PyTorch. +# The example below demonstrates how one might A/B test them. For +# simplicity, we only use a subset of shapes, and simply round trip +# results through pickle rather than actually using multiple environments +# and writing results to disk. 
+# + +import pickle + +ab_test_results = [] +for env in ('environment A: mul/sum', 'environment B: bmm'): + for b, n in ((1, 1), (1024, 10000), (10000, 1)): + x = torch.ones((b, n)) + dot_fn = (batched_dot_mul_sum if env == 'environment A: mul/sum' else batched_dot_bmm) + m = benchmark.Timer( + stmt='batched_dot(x, x)', + globals={'x': x, 'batched_dot': dot_fn}, + num_threads=1, + label='Batched dot', + description=f'[{b}, {n}]', + env=env, + ).blocked_autorange(min_run_time=1) + ab_test_results.append(pickle.dumps(m)) + +ab_results = [pickle.loads(i) for i in ab_test_results] +compare = benchmark.Compare(ab_results) +compare.trim_significant_figures() +compare.colorize() +compare.print() + +###################################################################### +# .. code-block:: none +# :caption: Output +# +# [------------------------------------- Batched dot -------------------------------------] +# | [1, 1] | [1024, 10000] | [10000, 1] +# 1 threads: ------------------------------------------------------------------------------ +# (environment A: mul/sum) batched_dot(x, x) | 7 | 36000 | 21 +# (environment B: bmm) batched_dot(x, x) | 14 | 40000 | 85 +# +# Times are in microseconds (us). +# + +# And just to show that we can round trip all of the results from earlier: +round_tripped_results = pickle.loads(pickle.dumps(results)) +assert(str(benchmark.Compare(results)) == str(benchmark.Compare(round_tripped_results))) + + +###################################################################### +# 7. Generating inputs with `Fuzzed Parameters` +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# As we've seen in the previous section, there can be some stark +# performance differences depending on the input tensors. Hence, it +# is a good idea to run benchmarks on a number of different inputs. +# However, creating all these input tensors can be tedious which is +# where ``torch.utils.benchmark.Fuzzer`` and related classes come in. +# Let's take a look at how we can use the Fuzzer to create some test +# cases for the benchmark. +# + +from torch.utils.benchmark import Fuzzer, FuzzedParameter, FuzzedTensor, ParameterAlias + +# Generates random tensors with 128 to 10000000 elements and sizes k0 and k1 chosen from a +# loguniform distribution in [1, 10000], 40% of which will be discontiguous on average. 
+example_fuzzer = Fuzzer( + parameters = [ + FuzzedParameter('k0', minval=1, maxval=10000, distribution='loguniform'), + FuzzedParameter('k1', minval=1, maxval=10000, distribution='loguniform'), + ], + tensors = [ + FuzzedTensor('x', size=('k0', 'k1'), min_elements=128, max_elements=10000000, probability_contiguous=0.6) + ], + seed=0, +) + +results = [] +for tensors, tensor_params, params in example_fuzzer.take(10): + # description is the column label + sub_label=f"{params['k0']:<6} x {params['k1']:<4} {'' if tensor_params['x']['is_contiguous'] else '(discontiguous)'}" + results.append(benchmark.Timer( + stmt='batched_dot_mul_sum(x, x)', + setup='from __main__ import batched_dot_mul_sum', + globals=tensors, + label='Batched dot', + sub_label=sub_label, + description='mul/sum', + ).blocked_autorange(min_run_time=1)) + results.append(benchmark.Timer( + stmt='batched_dot_bmm(x, x)', + setup='from __main__ import batched_dot_bmm', + globals=tensors, + label='Batched dot', + sub_label=sub_label, + description='bmm', + ).blocked_autorange(min_run_time=1)) + +compare = benchmark.Compare(results) +compare.trim_significant_figures() +compare.print() + +###################################################################### +# .. code-block:: none +# :caption: Output +# +# [--------------------- Batched dot ---------------------] +# | mul/sum | bmm +# 1 threads: ---------------------------------------------- +# 725 x 257 | 87 | 180 +# 49 x 383 | 15 | 30 +# 34 x 1468 | 30 | 118 +# 187 x 5039 | 400 | 1200 +# 2140 x 1296 (discontiguous) | 2000 | 41000 +# 78 x 1598 | 74 | 310 +# 519 x 763 | 190 | 1500 +# 141 x 1082 | 87 | 500 +# 78 x 5 (discontiguous) | 9 | 20 +# 187 x 1 | 12 | 10 +# +# Times are in microseconds (us). +# + +###################################################################### +# There is a lot of flexibility for defining your own Fuzzers which +# is great for creating a powerful set of inputs to benchmark. But to +# make things even simpler, PyTorch benchmark module comes with some +# buitin Fuzzers for common benchmarking needs. Let's take a look at +# how we can use one of these builtin fuzzers. +# + +from torch.utils.benchmark.op_fuzzers import binary + +results = [] +for tensors, tensor_params, params in binary.BinaryOpFuzzer(seed=0).take(10): + sub_label=f"{params['k0']:<6} x {params['k1']:<4} {'' if tensor_params['x']['is_contiguous'] else '(discontiguous)'}" + results.append(benchmark.Timer( + stmt='batched_dot_mul_sum(x, x)', + setup='from __main__ import batched_dot_mul_sum', + globals=tensors, + label='Batched dot', + sub_label=sub_label, + description='mul/sum', + ).blocked_autorange(min_run_time=1)) + results.append(benchmark.Timer( + stmt='batched_dot_bmm(x, x)', + setup='from __main__ import batched_dot_bmm', + globals=tensors, + label='Batched dot', + sub_label=sub_label, + description='bmm', + ).blocked_autorange(min_run_time=1)) + +compare = benchmark.Compare(results) +compare.trim_significant_figures() +compare.colorize(rowwise=True) +compare.print() + +###################################################################### +# .. 
code-block:: none +# :caption: Output +# +# [----------------------- Batched dot ------------------------] +# | mul/sum | bmm +# 1 threads: --------------------------------------------------- +# 64 x 473 (discontiguous) | 10000 | 40000 +# 16384 x 12642115 (discontiguous) | 31 | 78 +# 8192 x 892 | 4800 | 20400 +# 512 x 64 (discontiguous) | 110000 | 400000 +# 493 x 27 (discontiguous) | 1100 | 2440 +# 118 x 32 (discontiguous) | 870 | 2030 +# 16 x 495 (discontiguous) | 23600 | 24000 +# 488 x 62374 | 90000 | 100000 +# 240372 x 69 | 40000 | 16000 +# 40156 x 32 (discontiguous) | 2670 | 5000 +# +# Times are in microseconds (us). +# + +###################################################################### +# 8. Collecting instruction counts with `Callgrind` +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# One of the challenges of optimizing code is the variation and opacity of +# wall time. There are many sources of non-determinism, from adaptive clock +# speeds to resource contention with other processes. Furthermore, end-to-end +# time gives no insight into where time is being spent, which is really what +# we're interested in when optimizing code. +# +# A complementary approach is to also collect instruction counts. These counts +# are a proxy metric and do not capture all aspects of performance +# (e.g. memory or I/O bound tasks), however they do have several useful +# properties. Instruction counts are reproducible, insensitive to environmental +# variation, and offer fine grained insight into where a program is spending +# cycles. +# +# To see the utility of instruction counts, let us look at how we might +# reduce the overhead of `batched_dot_mul_sum`. The obvious solution is to +# move it to C++, so we avoid going between Python and C++ multiple times. +# +# Fortunately, the source is nearly identical. One question that we have to ask +# in C++ is whether we should take arguments by value or reference. +# + +batched_dot_src = """\ +/* ---- Python ---- */ +// def batched_dot_mul_sum(a, b): +// return a.mul(b).sum(-1) + +torch::Tensor batched_dot_mul_sum_v0( + const torch::Tensor a, + const torch::Tensor b) { + return a.mul(b).sum(-1); +} + +torch::Tensor batched_dot_mul_sum_v1( + const torch::Tensor& a, + const torch::Tensor& b) { + return a.mul(b).sum(-1); +} +""" + + +# PyTorch makes it easy to test our C++ implementations by providing a utility +# to JIT compile C++ source into Python extensions: +import os +from torch.utils import cpp_extension +cpp_lib = cpp_extension.load_inline( + name='cpp_lib', + cpp_sources=batched_dot_src, + extra_cflags=['-O3'], + extra_include_paths=[ + # `load_inline` needs to know where to find Pybind11 headers. + os.path.join(os.getenv('CONDA_PREFIX'), 'include') + ], + functions=['batched_dot_mul_sum_v0', 'batched_dot_mul_sum_v1'] +) + +# `load_inline` will create a shared object that is loaded into Python. When we collect +# instruction counts Timer will create a subprocess, so we need to re-import it. The +# import process is slightly more complicated for C extensions, but that's all we're +# doing here. 
+module_import_str = f"""\ +# https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path +import importlib.util +spec = importlib.util.spec_from_file_location("cpp_lib", {repr(cpp_lib.__file__)}) +cpp_lib = importlib.util.module_from_spec(spec) +spec.loader.exec_module(cpp_lib)""" + +import textwrap +def pretty_print(result): + """Import machinery for cpp_lib.so can get repetitive to look at.""" + print(repr(result).replace(textwrap.indent(module_import_str, " "), " import cpp_lib")) + + +t_baseline = benchmark.Timer( + stmt='batched_dot_mul_sum(x, x)', + setup='''\ +from __main__ import batched_dot_mul_sum +x = torch.randn(2, 2)''') + +t0 = benchmark.Timer( + stmt='cpp_lib.batched_dot_mul_sum_v0(x, x)', + setup=f'''\ +{module_import_str} +x = torch.randn(2, 2)''') + +t1 = benchmark.Timer( + stmt='cpp_lib.batched_dot_mul_sum_v1(x, x)', + setup=f'''\ +{module_import_str} +x = torch.randn(2, 2)''') + +# Moving to C++ did indeed reduce overhead, but it's hard to tell which +# calling convention is more efficient. v1 (call with references) seems to +# be a bit faster, but it's within measurement error. +pretty_print(t_baseline.blocked_autorange()) +pretty_print(t0.blocked_autorange()) +pretty_print(t1.blocked_autorange()) + +###################################################################### +# .. code-block:: none +# :caption: Output +# +# +# batched_dot_mul_sum(x, x) +# setup: +# from __main__ import batched_dot_mul_sum +# x = torch.randn(2, 2) +# +# 6.92 us +# 1 measurement, 100000 runs , 1 thread +# +# cpp_lib.batched_dot_mul_sum_v0(x, x) +# setup: +# import cpp_lib +# x = torch.randn(2, 2) +# +# 5.29 us +# 1 measurement, 100000 runs , 1 thread +# +# cpp_lib.batched_dot_mul_sum_v1(x, x) +# setup: +# import cpp_lib +# x = torch.randn(2, 2) +# +# 5.22 us +# 1 measurement, 100000 runs , 1 thread +# + +# Let's use Callgrind to determine which is better. +stats_v0 = t0.collect_callgrind() +stats_v1 = t1.collect_callgrind() + +pretty_print(stats_v0) +pretty_print(stats_v1) + +# `.as_standardized` removes file names and some path prefixes, and makes +# it easier to read the function symbols. +stats_v0 = stats_v0.as_standardized() +stats_v1 = stats_v1.as_standardized() + +# `.delta` diffs the instruction counts, and `.denoise` removes several +# functions in the Python interpreter that are known to have significant +# jitter. +delta = stats_v1.delta(stats_v0).denoise() + +# `.transform` is a convenience API for transforming function names. It is +# useful for increasing cancelation when diff-ing instructions, as well as +# just generally improving readability. +replacements = ( + ("???:void pybind11", "pybind11"), + ("batched_dot_mul_sum_v0", "batched_dot_mul_sum_v1"), + ("at::Tensor, at::Tensor", "..."), + ("at::Tensor const&, at::Tensor const&", "..."), + ("auto torch::detail::wrap_pybind_function_impl_", "wrap_pybind_function_impl_"), +) +for before, after in replacements: + delta = delta.transform(lambda l: l.replace(before, after)) + +# We can use print options to control how much of the function to display. +torch.set_printoptions(linewidth=160) + +# Once parsed, the instruction counts make clear that passing `a` and `b` +# by reference is more efficient as it skips some c10::TensorImpl bookkeeping +# for the intermediate Tensors, and is also works better with PyBind11. This +# is consistent with our noisy wall time observations. +print(delta) + +###################################################################### +# .. 
code-block:: none +# :caption: Output +# +# +# cpp_lib.batched_dot_mul_sum_v0(x, x) +# setup: +# import cpp_lib +# x = torch.randn(2, 2) +# +# All Noisy symbols removed +# Instructions: 2392671 2392671 +# Baseline: 4367 4367 +# 100 runs per measurement, 1 thread +# Warning: PyTorch was not built with debug symbols. +# Source information may be limited. Rebuild with +# REL_WITH_DEB_INFO=1 for more detailed results. +# +# cpp_lib.batched_dot_mul_sum_v1(x, x) +# setup: +# import cpp_lib +# x = torch.randn(2, 2) +# +# All Noisy symbols removed +# Instructions: 2378978 2378978 +# Baseline: 4367 4367 +# 100 runs per measurement, 1 thread +# Warning: PyTorch was not built with debug symbols. +# Source information may be limited. Rebuild with +# REL_WITH_DEB_INFO=1 for more detailed results. +# +# 86 ???:0x000000000020d9e0 +# 56 ???:0x000000000020db10 +# -1100 pybind11::cpp_function::initialize)::{lambda(...) +# -1600 ???:wrap_pybind_function_impl_(at::Tensor (&)(...), std::integer_sequence)::{lambda(...) +# -5200 ???:c10::intrusive_ptr::reset_() +# -5935 ???:0x000000000022c0e0 +# +# Total: -13693 +# + + +###################################################################### +# Learn More +# ---------- +# +# Take a look at these other recipes to continue your learning: +# +# - `PyTorch Profiler `_ +# diff --git a/recipes_source/recipes/custom_dataset_transforms_loader.py b/recipes_source/recipes/custom_dataset_transforms_loader.py deleted file mode 100644 index e73039647..000000000 --- a/recipes_source/recipes/custom_dataset_transforms_loader.py +++ /dev/null @@ -1,459 +0,0 @@ -""" -사용자 정의 PyTorch Dataloader 작성하기 -=========================================================== - -머신러닝 알고리즘을 개발하기 위해서는 데이터 전처리에 많은 노력이 필요합니다. PyTorch는 데이터를 로드하는데 쉽고 가능하다면 -더 좋은 가독성을 가진 코드를 만들기위해 많은 도구들을 제공합니다. 이 레시피에서는 다음 세 가지를 배울 수 있습니다. - - 1. PyTorch 데이터셋 API들을 이용하여 사용자 정의 데이터셋 만들기. - 2. 구성가능하며 호출 될 수 있는 사용자 정의 transform 만들기. - 3. 이러한 컴포넌트들을 합쳐서 사용자 정의 dataloader 만들기. - -이 튜토리얼을 실행하기 위해서는 다음의 패키지들이 설치 되었는지 확인해 주세요. - - ``scikit-image``: 이미지 I/O와 이미지 변형에 필요합니다. - - ``pandas``: CSV를 더 쉽게 파싱하기 위해 필요합니다. - -작성되고 있는 이 시점에서, 이 레시피는 `Sasank Chilamkurthy `__ 의 오리지널 튜토리얼을 바탕으로 하며 -나중에는 `Joe Spisak `__ 에 의해 수정되었습니다. -한국어로 `Jae Joong Lee `__ 에 의해 번역되었습니다. -""" - - -###################################################################### -# 설정 -# ---------------------- -# -# 먼저 이 레시피에 필요한 모든 라이브러리들을 불러오도록 하겠습니다. -# -# - -from __future__ import print_function, division -import os -import torch -import pandas as pd -from skimage import io, transform -import numpy as np -import matplotlib.pyplot as plt -from torch.utils.data import Dataset, DataLoader -from torchvision import transforms, utils - -# 경고 메시지 무시하기 -import warnings -warnings.filterwarnings("ignore") - -plt.ion() # 반응형 모드 설정 - - -###################################################################### -# 첫 번째: 데이터셋 -# ---------------------- -# - -###################################################################### -# -# 우리가 다룰 데이터셋은 얼굴 포즈입니다. -# 전반적으로, 한 얼굴에는 68개의 랜드마크들이 표시되어 있습니다. -# -# 다음 단계로는, `여기 `_ 에서 -# 데이터셋을 다운 받아 이미지들이 ‘data/faces/’ 의 경로에 위치하게 해주세요. -# -# **알림:** 사실 이 데이터셋은 imagenet 데이터셋에서 ‘face’ 태그를 포함하고 있는 이미지에 -# `dlib` 의 포즈 예측 ``__ 을 적용하여 생성하였습니다. 
-# -# :: -# -# !wget https://download.pytorch.org/tutorial/faces.zip -# !mkdir data/faces/ -# import zipfile -# with zipfile.ZipFile("faces.zip","r") as zip_ref: -# zip_ref.extractall("/data/faces/") -# %cd /data/faces/ - - - -###################################################################### -# 이 데이터셋은 다음과 같은 설명이 달려있는 CSV파일이 포함되어 있습니다. -# -# :: -# -# image_name,part_0_x,part_0_y,part_1_x,part_1_y,part_2_x, ... ,part_67_x,part_67_y -# 0805personali01.jpg,27,83,27,98, ... 84,134 -# 1084239450_e76e00b7e7.jpg,70,236,71,257, ... ,128,312 -# -# 이제 CSV파일을 빠르게 읽고 파일 안에 있는 설명들은 (N, 2) 배열로 읽어봅시다. -# 여기서 N은 랜드마크의 갯수입니다. -# - - -landmarks_frame = pd.read_csv('faces/face_landmarks.csv') - -n = 65 -img_name = landmarks_frame.iloc[n, 0] -landmarks = landmarks_frame.iloc[n, 1:] -landmarks = np.asarray(landmarks) -landmarks = landmarks.astype('float').reshape(-1, 2) - -print('Image name: {}'.format(img_name)) -print('Landmarks shape: {}'.format(landmarks.shape)) -print('First 4 Landmarks: {}'.format(landmarks[:4])) - -###################################################################### -# 1.1 이미지를 표시하기 위해 간단한 도움 함수 작성하기 -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# 다음으로는 이미지를 보여주기 위해 간단한 도움 함수를 작성하여 이미지가 가지고 있는 랜드마크들과 -# 이미지 샘플을 보여주도록 하겠습니다. -# - -def show_landmarks(image, landmarks): - """ 랜드마크와 함께 이미지 보여주기 """ - plt.imshow(image) - plt.scatter(landmarks[:, 0], landmarks[:, 1], s=10, marker='.', c='r') - plt.pause(0.001) # 잠시 멈추어 도표가 업데이트 되게 합니다 - -plt.figure() -show_landmarks(io.imread(os.path.join('faces/', img_name)), - landmarks) -plt.show() - - -###################################################################### -# 1.2 데이터셋 클래스 만들기 -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# 이제 PyTorch 데이터셋 클래스에 대해 알아봅시다. -# -# - - -###################################################################### -# ``torch.utils.data.Dataset`` 은 추상 클래스로서 데이터셋을 맡고 있습니다 -# ``Dataset`` 을 상속받아야 하며 다음의 메소드들을 오버라이드 해야합니다. -# -# - ``__len__`` 에는 ``len(dataset)`` 데이터셋의 사이즈를 반환합니다. -# - ``__getitem__`` 는 이러한 인덱싱을 지원하고 ``dataset[i]`` -# :math:``i`` 번째 샘플을 얻기 위해 사용됩니다. -# -# 우리의 얼굴 랜드마크 데이터셋을 위한 데이터셋 클래스를 만들어 봅시다. -# 우리는 csv파일은 ``__init__`` 에서 읽고 이미지들은 ``__getitem__`` 에서 읽도록 남겨두겠습니다. -# 이러한 방법은 메모리를 효율적으로 사용하도록 하는데 그 이유는 모든 이미지를 한 번에 메모리에 저장하지 않고 -# 필요할 때마다 불러오기 때문입니다. -# -# 우리 데이터셋의 샘플은 dict 형태로 이렇게 ``{'image': image, 'landmarks': landmarks}`` 되어있습니다. -# 데이터셋은 선택적 매개변수인 ``transform`` 을 가지고 있어서 -# 필요한 프로세싱 어느것이나 샘플에 적용 될 수 있습니다. -# ``transform`` 이 얼마나 유용한지는 다른 레시피에서 확인 해 볼 수 있습니다. -# - -class FaceLandmarksDataset(Dataset): - """ 얼굴 랜드마크 데이터셋. """ - - def __init__(self, csv_file, root_dir, transform=None): - """ - 매개변수 : - csv_file (문자열): 설명이 포함된 csv 파일 경로. - root_dir (문자역): 모든 이미지가 있는 폴더 경로. - transform (호출가능한 함수, 선택적 매개변수): 샘플에 적용 될 수 있는 선택적 변환. 
- """ - self.landmarks_frame = pd.read_csv(csv_file) - self.root_dir = root_dir - self.transform = transform - - def __len__(self): - return len(self.landmarks_frame) - - def __getitem__(self, idx): - if torch.is_tensor(idx): - idx = idx.tolist() - - img_name = os.path.join(self.root_dir, - self.landmarks_frame.iloc[idx, 0]) - image = io.imread(img_name) - landmarks = self.landmarks_frame.iloc[idx, 1:] - landmarks = np.array([landmarks]) - landmarks = landmarks.astype('float').reshape(-1, 2) - sample = {'image': image, 'landmarks': landmarks} - - if self.transform: - sample = self.transform(sample) - - return sample - - -###################################################################### -# 1.3 반복문을 통한 데이터 샘플 사용 -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# - - -###################################################################### -# 다음으로는 이 클래스를 인스턴스화하고 데이터 샘플을 반복문을 이용하여 사용해봅시다. -# 우리는 첫 4개의 샘플들만 출력하고 그 4개 샘플들의 랜드마크를 보여주겠습니다. -# - -face_dataset = FaceLandmarksDataset(csv_file='faces/face_landmarks.csv', - root_dir='faces/') - -fig = plt.figure() - -for i in range(len(face_dataset)): - sample = face_dataset[i] - - print(i, sample['image'].shape, sample['landmarks'].shape) - - ax = plt.subplot(1, 4, i + 1) - plt.tight_layout() - ax.set_title('Sample #{}'.format(i)) - ax.axis('off') - show_landmarks(**sample) - - if i == 3: - plt.show() - break - - -###################################################################### -# 두 번째: 데이터 변형 -# --------------------------- -# - - -###################################################################### -# 우리는 지금까지 어느정도 사용자 정의 데이터셋을 만들어 보았는데 이제는 사용자 정의 변형을 만들 차례 입니다. -# 컴퓨터 비전에서는 사용자 정의 변형은 알고리즘을 일반화시키고 정확도를 올리는데 도움을 줍니다. -# 변형들은 훈련시에 사용이 되며 주로 데이터 증강으로 참조되며 최근의 모델 개발에선 흔히 사용됩니다. -# -# 데이터셋을 다룰때 자주 일어나는 문제중 하나는 모든 샘플들이 같은 크기를 가지고 있지 않을 경우입니다. -# 대부분의 신경망들은 미리 정해진 크기의 이미지들을 받아들입니다. -# 그렇기 때문에 우리는 전처리 코드를 작성해야할 필요가 있습니다. -# 이제 세개의 변형을 만들어 봅시다. -# -# - ``Rescale``: 이미지 크기를 변경할때 사용됩니다. -# - ``RandomCrop``: 무작위로 이미지를 잘라내며 데이터 증강에 쓰입니다. -# - ``ToTensor``: Numpy 이미지들을 파이토치 이미지로 변환할때 사용됩니다. (그러기 위해서는 이미지 차원의 순서를 바꿔야합니다.) -# -# 우리는 위의 세개의 변형들을 단순한 함수 대신에 호출가능한 클래스로 만들어서 매번 변형이 호출될때 항상 매개변수가 넘겨지지 않도록 할겁니다. -# 그러기 위해서는 우리는 단순히 ``__call__`` 메소드를 만들고 필요하다면 ``__init__`` 를 만들면 됩니다. -# 그러면 우리는 변형을 이런식으로 사용할 수 있습니다. -# -# :: -# -# tsfm = Transform(params) -# transformed_sample = tsfm(sample) -# -# 어떻게 이런 변형들이 이미지와 랜드마크에 적용이 되었는지 아래를 봐주시길 바랍니다. -# - - -###################################################################### -# 2.1 호출 가능한 클래스들 작성하기 -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# 각각의 변형에 맞는 호출 가능한 클래스 작성을 시작해 봅시다. -# -# - -class Rescale(object): - """ 주어진 크기로 샘플안에 있는 이미지를 재변환 합니다. - - Args: - output_size (tuple 또는 int): 원하는 결과값의 크기입니다. - tuple로 주어진다면 결과값은 output_size 와 동일해야하며 - int일때는 설정된 값보다 작은 이미지들의 가로와 세로는 output_size 에 적절한 비율로 변환됩니다. - """ - - def __init__(self, output_size): - assert isinstance(output_size, (int, tuple)) - self.output_size = output_size - - def __call__(self, sample): - image, landmarks = sample['image'], sample['landmarks'] - - h, w = image.shape[:2] - if isinstance(self.output_size, int): - if h > w: - new_h, new_w = self.output_size * h / w, self.output_size - else: - new_h, new_w = self.output_size, self.output_size * w / h - else: - new_h, new_w = self.output_size - - new_h, new_w = int(new_h), int(new_w) - - img = transform.resize(image, (new_h, new_w)) - - # h 와 w 는 이미지의 랜드마크들 때문에 서로 바뀝니다. - # x 와 y 축들은 각각 1과 0 값을 가집니다. 
- landmarks = landmarks * [new_w / w, new_h / h] - - return {'image': img, 'landmarks': landmarks} - - -class RandomCrop(object): - """ 샘플에 있는 이미지를 무작위로 자르기. - - Args: - output_size (tuple 또는 int): 원하는 결과값의 크기입니다. - int로 설정하시면 정사각형 형태로 자르게 됩니다. - """ - - def __init__(self, output_size): - assert isinstance(output_size, (int, tuple)) - if isinstance(output_size, int): - self.output_size = (output_size, output_size) - else: - assert len(output_size) == 2 - self.output_size = output_size - - def __call__(self, sample): - image, landmarks = sample['image'], sample['landmarks'] - - h, w = image.shape[:2] - new_h, new_w = self.output_size - - top = np.random.randint(0, h - new_h) - left = np.random.randint(0, w - new_w) - - image = image[top: top + new_h, - left: left + new_w] - - landmarks = landmarks - [left, top] - - return {'image': image, 'landmarks': landmarks} - - -class ToTensor(object): - """ 샘플 안에 있는 n차원 배열을 Tensor로 변홥힙니다. """ - - def __call__(self, sample): - image, landmarks = sample['image'], sample['landmarks'] - - # 색깔 축들을 바꿔치기해야하는데 그 이유는 numpy와 torch의 이미지 표현방식이 다르기 때문입니다. - # numpy 이미지: H x W x C - # torch 이미지: C X H X W - image = image.transpose((2, 0, 1)) - return {'image': torch.from_numpy(image), - 'landmarks': torch.from_numpy(landmarks)} - - -###################################################################### -# 2.2 변환들을 구성하고 샘플에 적용해보기. -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# 다음에는 작성해왔던 변환들을 구성하고 샘플에 적용해봅시다. -# -# -# 우리가 한 이미지의 가로나 세로중에서 더작은 쪽을 256으로 크기를 바꾸고싶고 -# 바뀐 이미지에서 무작위하게 가로 세로 전부 224로 자르고 싶다고 상황을 가정해봅시다. -# 예를들면, 우리는 ``Rescale`` 과 ``RandomCrop`` 변환을 구성해야 합니다. -# ``torchvision.transforms.Compose`` 는 간단한 호출가능한 클래스로 이러한것들을 우리에게 가능하게 해줍니다. -# - -scale = Rescale(256) -crop = RandomCrop(128) -composed = transforms.Compose([Rescale(256), - RandomCrop(224)]) - -# 위에 있는 변환들을 각각 샘플에 적용 시킵니다. -fig = plt.figure() -sample = face_dataset[65] -for i, tsfrm in enumerate([scale, crop, composed]): - transformed_sample = tsfrm(sample) - - ax = plt.subplot(1, 3, i + 1) - plt.tight_layout() - ax.set_title(type(tsfrm).__name__) - show_landmarks(**transformed_sample) - -plt.show() - - -###################################################################### -# 2.3 데이터셋을 반복문을 통해 사용하기 -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# 다음으로 우리는 데이터셋을 반복문을 통해 사용해보도록 하겠습니다. -# -# -# 이제 이 모든것을 다 꺼내어서 변환을 구성하고 데이터셋을 만들어봅시다. -# 요약하자면 항상 이 데이터셋을 다음과 같이 불러와집니다. -# -# - 이미지는 읽으려고 할때마다 불러옵니다. -# - 변형들은 읽은 이미지에 적용이 됩니다. -# - 변형들중 하나는 무작위를 이용하기 때문에, 데이터는 샘플링에 따라 증강됩니다. -# -# 저번에 해본것처럼 생성된 데이터셋을 ``for i in range`` 이라는 반복문을 통해 사용할 수 있습니다. -# - -transformed_dataset = FaceLandmarksDataset(csv_file='faces/face_landmarks.csv', - root_dir='faces/', - transform=transforms.Compose([ - Rescale(256), - RandomCrop(224), - ToTensor() - ])) - -for i in range(len(transformed_dataset)): - sample = transformed_dataset[i] - - print(i, sample['image'].size(), sample['landmarks'].size()) - - if i == 3: - break - - -###################################################################### -# 세번째: Dataloader -# ---------------------- -# - - -###################################################################### -# 직접적으로 데이터셋을 ``for`` 반복문으로 데이터를 이용하는건 많은 특성들을 놓칠 수 밖에 없습니다. -# 특히, 우리는 다음과 같은 특성들을 놓친다고 할 수 있습니다. -# -# - 데이터 배치 -# - 데이터 섞기 -# - ``multiprocessing`` 를 이용하여 병렬적으로 데이터 불러오기 -# -# ``torch.utils.data.DataLoader`` 는 반복자로서 위에 나와있는 모든 특성들을 제공합니다. -# 아래에 제시된 사용되는 매개변수들은 쉽게 이해가 될겁니다. 흥미로운 배개변수는 ``collate_fn`` 인데 -# 이것은 정확하게 ``collate_fn`` 을 통해 몇개의 샘플들이 배치가 되어야하는지 지정할 수 있습니다. 
-# 하지만 굳이 수정하지 않아도 대부분의 경우에는 잘 작동할겁니다. -# - -dataloader = DataLoader(transformed_dataset, batch_size=4, - shuffle=True, num_workers=4) - - -# 배치를 보여주기위한 도움 함수 -def show_landmarks_batch(sample_batched): - """ 샘플들의 배치에서 이미지와 함께 랜드마크를 보여줍니다. """ - images_batch, landmarks_batch = \ - sample_batched['image'], sample_batched['landmarks'] - batch_size = len(images_batch) - im_size = images_batch.size(2) - - grid = utils.make_grid(images_batch) - plt.imshow(grid.numpy().transpose((1, 2, 0))) - - for i in range(batch_size): - plt.scatter(landmarks_batch[i, :, 0].numpy() + i * im_size, - landmarks_batch[i, :, 1].numpy(), - s=10, marker='.', c='r') - - plt.title('Batch from dataloader') - -for i_batch, sample_batched in enumerate(dataloader): - print(i_batch, sample_batched['image'].size(), - sample_batched['landmarks'].size()) - - # 4번째 배치를 보여주고 반복문을 멈춥니다. - if i_batch == 3: - plt.figure() - show_landmarks_batch(sample_batched) - plt.axis('off') - plt.ioff() - plt.show() - break - - -###################################################################### -# 이제 PyTorch를 이용해서 어떻게 사용자 정의 dataloader를 만드는지 배웠습니다. -# 저희는 좀 더 관련된 문서들을 깊게 읽으셔서 더욱 맞춤화된 작업 흐림을 가지길 추천 드립니다. -# 더 배워보시려면 ``torch.utils.data`` 문서를 `여기 `__ 에서 읽어 보실 수 있습니다. diff --git a/recipes_source/recipes/defining_a_neural_network.py b/recipes_source/recipes/defining_a_neural_network.py index af64758ef..6828640fe 100644 --- a/recipes_source/recipes/defining_a_neural_network.py +++ b/recipes_source/recipes/defining_a_neural_network.py @@ -1,24 +1,24 @@ """ Pytorch를 사용해 신경망 정의하기 ==================================== -딥러닝은 인공신경망(models)을 사용하며 이것은 상호연결된 집단의 많은 계층으로 구성된 계산 시스템입니다. -데이터가 이 상호연결된 집단을 통과하면서, 신경망은 입력을 출력으로 바꾸기 위해 요구된 계산 방법에 어떻게 근접하는 지를 배울 수 있습니다. -PyTorch에서, 신경망은 ``torch.nn`` 패키지를 사용해 구성할 수 있습니다. +딥러닝은 인공신경망(models)을 사용하며 이것은 상호연결된 집단의 많은 계층으로 구성된 계산 시스템입니다. +데이터가 이 상호연결된 집단을 통과하면서, 신경망은 입력을 출력으로 바꾸기 위해 요구된 계산 방법에 어떻게 근접하는 지를 배울 수 있습니다. +PyTorch에서, 신경망은 ``torch.nn`` 패키지를 사용해 구성할 수 있습니다. 소개 ----- -PyTorch는 ``torch.nn`` 을 포함하여 신경망을 만들고 훈련시키는 것을 도울 수 있도록 섬세하게 만들어진 모듈과 클래스들을 제공합니다. -``nn.Moduel`` 은 계층, 그리고 ``output`` 을 반환하는 ``forward(input)`` 메소드를 포함하고 있습니다. +PyTorch는 ``torch.nn`` 을 포함하여 신경망을 만들고 훈련시키는 것을 도울 수 있도록 섬세하게 만들어진 모듈과 클래스들을 제공합니다. +``nn.Moduel`` 은 계층, 그리고 ``output`` 을 반환하는 ``forward(input)`` 메소드를 포함하고 있습니다. 이 레시피에서, `MNIST dataset `__ 을 사용하여 신경망을 정의하기 위해 ``torch.nn`` 을 사용할 예정입니다. 설치 ----- -시작하기 전에, 준비가 되어있지 않다면 ``torch`` 를 설치해야 합니다. +시작하기 전에, 준비가 되어있지 않다면 ``torch`` 를 설치해야 합니다. :: - pip install torchaudio + pip install torch """ @@ -27,17 +27,17 @@ ###################################################################### # 단계 # ----- -# +# # 1. 데이터를 가져오기 위해 필요한 라이브러리들 불러오기 # 2. 신경망을 정의하고 초기화하기 # 3. 데이터가 모델을 어떻게 지나갈 지 구체화하기 # 4. [선택사항] 데이터를 모델에 적용해 테스트하기 -# +# # 1. 데이터를 가져오기 위해 필요한 라이브러리들 불러오기 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# 이 레시피에서, ``torch`` 과 이것의 하위 모듈인 ``torch.nn`` , ``torch.nn.functional`` 을 사용합니다. -# +# +# 이 레시피에서, ``torch`` 과 이것의 하위 모듈인 ``torch.nn`` , ``torch.nn.functional`` 을 사용합니다. +# import torch import torch.nn as nn @@ -48,28 +48,28 @@ # 2. 신경망을 정의하고 초기화하기 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# 이미지를 인식하는 신경망을 만들겁니다. PyTorch에서 만들어진 컨볼루션(convolution)이라고 불리는 방법을 사용하겠습니다. -# 컨볼루션은 커널이나 작은 메트릭스를 통해 가중치를 부여한 이미지의 각 요소를 주변 값과 더합니다. -# 그리고 이것은 입력된 이미지의 특징(모서리 감지, 선명함, 흐릿함 등과 같은)을 추출하는 데 도움을 줍니다. +# 이미지를 인식하는 신경망을 만들겁니다. PyTorch에서 만들어진 합성곱(convolution)이라고 불리는 방법을 사용하겠습니다. +# 합성곱은 커널이나 작은 행렬(matrix)를 통해 가중치를 부여한 이미지의 각 요소를 주변 값과 더합니다. 
+# 그리고 이것은 입력된 이미지의 특징(모서리 감지, 선명함, 흐릿함 등과 같은)을 추출하는 데 도움을 줍니다. # -# 모델의 ``Net`` 클래스를 정의하기 위해 2가지가 필요합니다. -# 첫번째는 ``nn.Module`` 을 참고하는 ``__init__`` 함수를 작성하는 것입니다. -# 이 함수는 신경망에서 fully connected layers를 만드는 것에 사용됩니다. -# -# 컨볼루션을 사용해, 1개의 입력 이미지 채널을 가지고 +# 모델의 ``Net`` 클래스를 정의하기 위해 2가지가 필요합니다. +# 첫번째는 ``nn.Module`` 을 참고하는 ``__init__`` 함수를 작성하는 것입니다. +# 이 함수는 신경망에서 fully connected layers를 만드는 것에 사용됩니다. +# +# 합성곱을 사용해, 1개의 입력 이미지 채널을 가지고 # 목표인 0부터 9까지 숫자를 대표하는 10개의 라벨과 되응되 값을 출력하는 모델을 정의하겠습니다. -# 이 알고리즘은 만드는 사람에 달렸지만, 기본적인 MNIST 알고리즘을 따르도록 하겠습니다. -# +# 이 알고리즘은 만드는 사람에 달렸지만, 기본적인 MNIST 알고리즘을 따르도록 하겠습니다. +# class Net(nn.Module): def __init__(self): super(Net, self).__init__() - # 첫번째 2D 컨볼루션 계층 - # 1개의 입력 채널(이미지)을 받아들이고, 사각 커널 사이즈가 3인 32개의 컨볼루션 특징들을 출력합니다. + # 첫번째 2D 합성곱 계층 + # 1개의 입력 채널(이미지)을 받아들이고, 사각 커널 사이즈가 3인 32개의 합성곱 특징들을 출력합니다. self.conv1 = nn.Conv2d(1, 32, 3, 1) - # 두번째 2D 컨볼루션 계층 - # 32개의 입력 게층을 받아들이고, 사각 커널 사이즈가 3인 64개의 컨볼루션 특징을 출력합니다. + # 두번째 2D 합성곱 계층 + # 32개의 입력 게층을 받아들이고, 사각 커널 사이즈가 3인 64개의 합성곱 특징을 출력합니다. self.conv2 = nn.Conv2d(32, 64, 3, 1) # 인접한 픽셀들은 입력 확률에 따라 모두 0 값을 가지거나 혹은 모두 유효한 값이 되도록 만듭니다. @@ -86,16 +86,16 @@ def __init__(self): ###################################################################### -# 신경망을 정의하는 것을 마쳤습니다. 이제 어떻게 이것을 지나갈 지 정의해야 합니다. -# +# 신경망을 정의하는 것을 마쳤습니다. 이제 어떻게 이것을 지나갈 지 정의해야 합니다. +# # 3. 데이터가 모델을 어떻게 지나갈 지 구체화하기 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# PyTorch를 사용해 모델을 생성할 때, 계산 그래프(즉, 신경망)에 데이터를 지나가게 하는 ``forward`` 함수를 정의해야 합니다. +# PyTorch를 사용해 모델을 생성할 때, 계산 그래프(즉, 신경망)에 데이터를 지나가게 하는 ``forward`` 함수를 정의해야 합니다. # 이것은 feed-forward 알고리즘을 나타냅니다. -# -# ``forward`` 함수에서 어떠한 Tensor 연산자도 사용 가능합니다. -# +# +# ``forward`` 함수에서 어떠한 Tensor 연산자도 사용 가능합니다. +# class Net(nn.Module): def __init__(self): @@ -107,29 +107,29 @@ def __init__(self): self.fc1 = nn.Linear(9216, 128) self.fc2 = nn.Linear(128, 10) - # x는 데이터를 나타냅니다. + # x는 데이터를 나타냅니다. def forward(self, x): - # 데이터가 conv1을 지나갑니다. + # 데이터가 conv1을 지나갑니다. x = self.conv1(x) - # x를 ReLU 활성함수(rectified-linear activation function)에 대입합니다. + # x를 ReLU 활성함수(rectified-linear activation function)에 대입합니다. x = F.relu(x) x = self.conv2(x) x = F.relu(x) - # x에 대해서 max pooling을 실행합니다. + # x에 대해서 max pooling을 실행합니다. x = F.max_pool2d(x, 2) - # 데이터가 dropout1을 지나갑니다. + # 데이터가 dropout1을 지나갑니다. x = self.dropout1(x) - # start_dim=1으로 x를 압축합니다. + # start_dim=1으로 x를 압축합니다. x = torch.flatten(x, 1) - # 데이터가 fc1을 지나갑니다. + # 데이터가 fc1을 지나갑니다. x = self.fc1(x) x = F.relu(x) x = self.dropout2(x) x = self.fc2(x) - # x에 softmax를 적용합니다. + # x에 softmax를 적용합니다. output = F.log_softmax(x, dim=1) return output @@ -137,11 +137,11 @@ def forward(self, x): ###################################################################### # 4. [선택사항] 데이터를 모델에 적용해 테스트하기 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# 원하는 출력값을 받을 수 있는 지 확인하기 위해, 무작위의 데이터를 모델에 통과시켜 시험해봅시다. -# +# +# 원하는 출력값을 받을 수 있는 지 확인하기 위해, 무작위의 데이터를 모델에 통과시켜 시험해봅시다. +# -# 임의의 28x28 이미지로 맞춰줍니다. +# 임의의 28x28 이미지로 맞춰줍니다. random_data = torch.rand((1, 1, 28, 28)) my_nn = Net() @@ -150,14 +150,14 @@ def forward(self, x): ###################################################################### -# 결과 tensor의 각 숫자는 임의의 tenosr와 연관된 라벨이 예측한 값과 같다는 것을 나타냅니다. -# -# 축하합니다! PyTorch로 신경망 정의하기를 성공적으로 해냈습니다. -# +# 결과 tensor의 각 숫자는 임의의 tenosr와 연관된 라벨이 예측한 값과 같다는 것을 나타냅니다. +# +# 축하합니다! PyTorch로 신경망 정의하기를 성공적으로 해냈습니다. 
+# # 더 알아보기 # ----------- -# +# # 계속해서 학습하고 싶다면 다른 레시피를 살펴보십시오: -# +# # - `PyTorch에서 state_dict이 무엇인지 `__ # - `PyTorch로 추론을 위한 모델을 저장하고 가저오기 `__ diff --git a/recipes_source/recipes/loading_data_recipe.py b/recipes_source/recipes/loading_data_recipe.py index 9e5805dad..75aba0eba 100644 --- a/recipes_source/recipes/loading_data_recipe.py +++ b/recipes_source/recipes/loading_data_recipe.py @@ -45,14 +45,14 @@ ###################################################################### # 단계(Steps) # ----- -# +# # 1. 데이터를 불러오는데 필요한 라이브러리 import하기 # 2. 데이터 접근하기 # 3. 데이터 불러오기 # 4. 데이터 순회하기 # 5. [선택 사항] 데이터 시각화하기 # -# +# # 1. 데이터를 불러오는데 필요한 라이브러리 import하기 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # @@ -73,9 +73,9 @@ # (`더 알아보기 `__). # # ``torchaudio.datasets.YESNO`` 클래스를 사용하여 YesNo 데이터셋을 생성합니다. -# +# # :: -# +# # torchaudio.datasets.YESNO( # root, # url='http://www.openslr.org/resources/1/waves_yesno.tar.gz', @@ -99,7 +99,7 @@ # YesNo 안에 각각의 데이터 항목은 튜플 형태 (파형, 샘플 속도, 라벨)를 가지며, # 이때 labels는 0(no)과 1(yes)을 담은 리스트 형태로 되어 있습니다. -yesno_data_trainset = torchaudio.datasets.YESNO('./', download=True) +yesno_data = torchaudio.datasets.YESNO('./', download=True) # 실제 데이터에 접근해서 yesno_data의 형태를 확인합니다. 세 번째 항목을 예시로 살펴봅니다. n = 3 @@ -114,10 +114,10 @@ # # 3. 데이터 불러오기 # ~~~~~~~~~~~~~~~~~~~~~~~ -# +# # 데이터셋에 성공적으로 접근했으니, 이제 데이터셋을 ``torch.utils.data.DataLoader`` 로 넘겨줍니다. # ``DataLoader`` 는 데이터셋을 sampler와 조합시켜 데이터셋을 순회할 수 있는 iterable을 만들어줍니다. -# +# data_loader = torch.utils.data.DataLoader(yesno_data, batch_size=1, @@ -127,7 +127,7 @@ ###################################################################### # 4. 데이터 순회하기 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # 이제 ``data_loader`` 를 이용해서 데이터를 순회할 수 있습니다. 모델을 학습하려면 이처럼 # 데이터를 순회할 수 있어야 합니다. 아래 예시를 보시면 ``data_loader`` 안에 있는 각각의 # 데이터 항목이 파형, 샘플 속도, 라벨을 담은 텐서로 바뀌었음을 확인할 수 있습니다. @@ -142,9 +142,9 @@ ###################################################################### # 5. [선택 사항] 데이터 시각화하기 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # ``DataLoader`` 의 데이터를 시각화해서 더 자세히 확인해보실 수 있습니다. -# +# import matplotlib.pyplot as plt @@ -156,10 +156,10 @@ ###################################################################### # 축하드립니다! PyTorch에서 데이터를 불러오는데 성공하셨습니다. -# +# # 더 알아보기 # ---------- -# +# # 다른 레시피를 둘러보고 계속 배워보세요: # # - :doc:`/recipes/recipes/defining_a_neural_network` diff --git a/recipes_source/recipes/profiler.py b/recipes_source/recipes/profiler_recipe.py similarity index 98% rename from recipes_source/recipes/profiler.py rename to recipes_source/recipes/profiler_recipe.py index 14ae58b50..0bf753197 100644 --- a/recipes_source/recipes/profiler.py +++ b/recipes_source/recipes/profiler_recipe.py @@ -209,7 +209,8 @@ # 더 알아보기 # ------------- # -# 다음의 튜토리얼을 통해 Tensorboard를 사용하여 모델 시각화하는 방법을 살펴보세요: +# 다음 레시피와 튜토리얼을 읽으며 학습을 계속해보세요: # +# - :doc:`/recipes/recipes/benchmark` # - :doc:`/intermediate/tensorboard_tutorial` # diff --git a/recipes_source/recipes/timer_quick_start.py b/recipes_source/recipes/timer_quick_start.py new file mode 100644 index 000000000..158872639 --- /dev/null +++ b/recipes_source/recipes/timer_quick_start.py @@ -0,0 +1,394 @@ +""" +Timer quick start +================= + +In this tutorial, we're going to cover the primary APIs of +`torch.utils.benchmark.Timer`. The PyTorch Timer is based on the +`timeit.Timer `__ +API, with several PyTorch specific modifications. Familiarity with the +builtin `Timer` class is not required for this tutorial, however we assume +that the reader is familiar with the fundamentals of performance work. 
+ +A more comprehensive performace tuning tutorial is available at: + + https://pytorch.org/tutorials/recipes/recipes/benchmark.html + + +**Contents:** + 1. `Defining a Timer <#defining-a-timer>`__ + 2. `Wall time: \`Timer.blocked_autorange(...)\` <#wall-time-timer-blocked-autorange>`__ + 3. `C++ snippets <#c-snippets>`__ + 4. `Instruction counts: \`Timer.collect_callgrind(...)\` <#instruction-counts-timer-collect-callgrind>`__ + 5. `Instruction counts: Delving deeper <#instruction-counts-delving-deeper>`__ + 6. `A/B testing with Callgrind <#a-b-testing-with-callgrind>`__ + 7. `Wrapping up <#wrapping-up>`__ + 8. `Footnotes <#footnotes>`__ +""" + + +############################################################################### +# 1. Defining a Timer +# ~~~~~~~~~~~~~~~~~~~ +# +# A `Timer` serves as a task definition. +# + +from torch.utils.benchmark import Timer + +timer = Timer( + # The computation which will be run in a loop and timed. + stmt="x * y", + + # `setup` will be run before calling the measurement loop, and is used to + # populate any state which is needed by `stmt` + setup=""" + x = torch.ones((128,)) + y = torch.ones((128,)) + """, + + # Alternately, `globals` can be used to pass variables from the outer scope. + # ------------------------------------------------------------------------- + # globals={ + # "x": torch.ones((128,)), + # "y": torch.ones((128,)), + # }, + + # Control the number of threads that PyTorch uses. (Default: 1) + num_threads=1, +) + +############################################################################### +# 2. Wall time: `Timer.blocked_autorange(...)` +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# This method will handle details such as picking a suitable number if repeats, +# fixing the number of threads, and providing a convenient representation of +# the results. +# + +# Measurement objects store the results of multiple repeats, and provide +# various utility features. +from torch.utils.benchmark import Measurement + +m: Measurement = timer.blocked_autorange(min_run_time=1) +print(m) + +############################################################################### +# .. code-block:: none +# :caption: **Snippet wall time.** +# +# +# x * y +# setup: +# x = torch.ones((128,)) +# y = torch.ones((128,)) +# +# Median: 2.34 us +# IQR: 0.07 us (2.31 to 2.38) +# 424 measurements, 1000 runs per measurement, 1 thread +# + +############################################################################### +# 3. C++ snippets +# ~~~~~~~~~~~~~~~ +# + +from torch.utils.benchmark import Language + +cpp_timer = Timer( + "x * y;", + """ + auto x = torch::ones({128}); + auto y = torch::ones({128}); + """, + language=Language.CPP, +) + +print(cpp_timer.blocked_autorange(min_run_time=1)) + +############################################################################### +# .. code-block:: none +# :caption: **C++ snippet wall time.** +# +# +# x * y; +# setup: +# auto x = torch::ones({128}); +# auto y = torch::ones({128}); +# +# Median: 1.21 us +# IQR: 0.03 us (1.20 to 1.23) +# 83 measurements, 10000 runs per measurement, 1 thread +# + +############################################################################### +# Unsurprisingly, the C++ snippet is both faster and has lower variation. +# + +############################################################################### +# 4. 
Instruction counts: `Timer.collect_callgrind(...)` +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# For deep dive investigations, `Timer.collect_callgrind` wraps +# `Callgrind ` in order to +# collect instruction counts. These are useful as they offer fine grained and +# deterministic (or very low noise in the case of Python) insights into how a +# snippet is run. +# + +from torch.utils.benchmark import CallgrindStats, FunctionCounts + +stats: CallgrindStats = cpp_timer.collect_callgrind() +print(stats) + +############################################################################### +# .. code-block:: none +# :caption: **C++ Callgrind stats (summary)** +# +# +# x * y; +# setup: +# auto x = torch::ones({128}); +# auto y = torch::ones({128}); +# +# All Noisy symbols removed +# Instructions: 563600 563600 +# Baseline: 0 0 +# 100 runs per measurement, 1 thread +# + +############################################################################### +# 5. Instruction counts: Delving deeper +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# The string representation of CallgrindStats is similar to that of +# Measurement. `Noisy symbols` are a Python concept (removing calls in the +# CPython interpreter which are known to be noisy). +# +# For more detailed analysis, however, we will want to look at specific calls. +# `CallgrindStats.stats()` returns a FunctionCounts object to make this easier. +# Conceptually, FunctionCounts can be thought of as a tuple of pairs with some +# utility methods, where each pair is `(number of instructions, file path and +# function name)`. +# +# A note on paths: +# One generally doesn't care about absolute path. For instance, the full path +# and function name for a multiply call is something like: +# +# /the/prefix/to/your/pytorch/install/dir/pytorch/build/aten/src/ATen/core/TensorMethods.cpp:at::Tensor::mul(at::Tensor const&) const [/the/path/to/your/conda/install/miniconda3/envs/ab_ref/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so] +# +# when in reality, all of the information that we're interested in can be +# represented in: +# +# build/aten/src/ATen/core/TensorMethods.cpp:at::Tensor::mul(at::Tensor const&) const +# +# CallgrindStats.as_standardized() makes a best effort to strip low signal +# portions of the file path, as well as the shared object and is generally +# recommended. +# + +inclusive_stats = stats.as_standardized().stats(inclusive=False) +print(inclusive_stats[:10]) + +############################################################################### +# .. code-block:: none +# :caption: **C++ Callgrind stats (detailed)** +# +# torch.utils.benchmark.utils.valgrind_wrapper.timer_interface.FunctionCounts object at 0x7f192a6dfd90> +# 47264 ???:_int_free +# 25963 ???:_int_malloc +# 19900 build/../aten/src/ATen/TensorIter ... (at::TensorIteratorConfig const&) +# 18000 ???:__tls_get_addr +# 13500 ???:malloc +# 11300 build/../c10/util/SmallVector.h:a ... (at::TensorIteratorConfig const&) +# 10345 ???:_int_memalign +# 10000 build/../aten/src/ATen/TensorIter ... (at::TensorIteratorConfig const&) +# 9200 ???:free +# 8000 build/../c10/util/SmallVector.h:a ... IteratorBase::get_strides() const +# +# Total: 173472 +# + +############################################################################### +# That's still quite a lot to digest. Let's use the `FunctionCounts.transform` +# method to trim some of the function path, and discard the function called. +# When we do, the counts of any collisions (e.g. 
`foo.h:a()` and `foo.h:b()` +# will both map to `foo.h`) will be added together. +# + +import os +import re + +def group_by_file(fn_name: str): + if fn_name.startswith("???"): + fn_dir, fn_file = fn_name.split(":")[:2] + else: + fn_dir, fn_file = os.path.split(fn_name.split(":")[0]) + fn_dir = re.sub("^.*build/../", "", fn_dir) + fn_dir = re.sub("^.*torch/", "torch/", fn_dir) + + return f"{fn_dir:<15} {fn_file}" + +print(inclusive_stats.transform(group_by_file)[:10]) + +############################################################################### +# .. code-block:: none +# :caption: **Callgrind stats (condensed)** +# +# +# 118200 aten/src/ATen TensorIterator.cpp +# 65000 c10/util SmallVector.h +# 47264 ??? _int_free +# 25963 ??? _int_malloc +# 20900 c10/util intrusive_ptr.h +# 18000 ??? __tls_get_addr +# 15900 c10/core TensorImpl.h +# 15100 c10/core CPUAllocator.cpp +# 13500 ??? malloc +# 12500 c10/core TensorImpl.cpp +# +# Total: 352327 +# + +############################################################################### +# 6. A/B testing with Callgrind +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# One of the most useful features of instruction counts is they allow fine +# grained comparison of computation, which is critical when analyzing +# performance. +# +# To see this in action, lets compare our multiplication of two size 128 +# Tensors with a {128} x {1} multiplication, which will broadcast the second +# Tensor: +# result = {a0 * b0, a1 * b0, ..., a127 * b0} +# + +broadcasting_stats = Timer( + "x * y;", + """ + auto x = torch::ones({128}); + auto y = torch::ones({1}); + """, + language=Language.CPP, +).collect_callgrind().as_standardized().stats(inclusive=False) + +############################################################################### +# Often we want to A/B test two different environments. (e.g. testing a PR, or +# experimenting with compile flags.) This is quite simple, as CallgrindStats, +# FunctionCounts, and Measurement are all pickleable. Simply save measurements +# from each environment, and load them in a single process for analysis. +# + +import pickle + +# Let's round trip `broadcasting_stats` just to show that we can. +broadcasting_stats = pickle.loads(pickle.dumps(broadcasting_stats)) + + +# And now to diff the two tasks: +delta = broadcasting_stats - inclusive_stats + +def extract_fn_name(fn: str): + """Trim everything except the function name.""" + fn = ":".join(fn.split(":")[1:]) + return re.sub(r"\(.+\)", "(...)", fn) + +# We use `.transform` to make the diff readable: +print(delta.transform(extract_fn_name)) + + +############################################################################### +# .. code-block:: none +# :caption: **Instruction count delta** +# +# +# 17600 at::TensorIteratorBase::compute_strides(...) +# 12700 at::TensorIteratorBase::allocate_or_resize_outputs() +# 10200 c10::SmallVectorImpl::operator=(...) +# 7400 at::infer_size(...) +# 6200 at::TensorIteratorBase::invert_perm(...) const +# 6064 _int_free +# 5100 at::TensorIteratorBase::reorder_dimensions() +# 4300 malloc +# 4300 at::TensorIteratorBase::compatible_stride(...) const +# ... +# -28 _int_memalign +# -100 c10::impl::check_tensor_options_and_extract_memory_format(...) +# -300 __memcmp_avx2_movbe +# -400 at::detail::empty_cpu(...) +# -1100 at::TensorIteratorBase::numel() const +# -1300 void at::native::(...) +# -2400 c10::TensorImpl::is_contiguous(...) const +# -6100 at::TensorIteratorBase::compute_fast_setup_type(...) +# -22600 at::TensorIteratorBase::fast_set_up(...) 
+# +# Total: 58091 +# + +############################################################################### +# So the broadcasting version takes an extra 580 instructions per call (recall +# that we're collecting 100 runs per sample), or about 10%. There are quite a +# few TensorIterator calls, so lets drill down to those. FunctionCounts.filter +# makes this easy. +# + +print(delta.transform(extract_fn_name).filter(lambda fn: "TensorIterator" in fn)) + +############################################################################### +# .. code-block:: none +# :caption: **Instruction count delta (filter)** +# +# +# 17600 at::TensorIteratorBase::compute_strides(...) +# 12700 at::TensorIteratorBase::allocate_or_resize_outputs() +# 6200 at::TensorIteratorBase::invert_perm(...) const +# 5100 at::TensorIteratorBase::reorder_dimensions() +# 4300 at::TensorIteratorBase::compatible_stride(...) const +# 4000 at::TensorIteratorBase::compute_shape(...) +# 2300 at::TensorIteratorBase::coalesce_dimensions() +# 1600 at::TensorIteratorBase::build(...) +# -1100 at::TensorIteratorBase::numel() const +# -6100 at::TensorIteratorBase::compute_fast_setup_type(...) +# -22600 at::TensorIteratorBase::fast_set_up(...) +# +# Total: 24000 +# + +############################################################################### +# This makes plain what is going on: there is a fast path in TensorIterator +# setup, but in the {128} x {1} case we miss it and have to do a more general +# analysis which is more expensive. The most prominent call omitted by the +# filter is `c10::SmallVectorImpl::operator=(...)`, which is also part +# of the more general setup. +# + +############################################################################### +# 7. Wrapping up +# ~~~~~~~~~~~~~~ +# +# In summary, use `Timer.blocked_autorange` to collect wall times. If timing +# variation is too high, increase `min_run_time`, or move to C++ snippets if +# convenient. +# +# For fine grained analysis, use `Timer.collect_callgrind` to measure +# instruction counts and `FunctionCounts.(__add__ / __sub__ / transform / filter)` +# to slice-and-dice them. +# + +############################################################################### +# 8. Footnotes +# ~~~~~~~~~~~~ +# +# - Implied `import torch` +# If `globals` does not contain "torch", Timer will automatically +# populate it. This means that `Timer("torch.empty(())")` will work. +# (Though other imports should be placed in `setup`, +# e.g. `Timer("np.zeros(())", "import numpy as np")`) +# +# - REL_WITH_DEB_INFO +# In order to provide full information about the PyTorch internals which +# are executed, Callgrind needs access to C++ debug symbols. This is +# accomplished by setting REL_WITH_DEB_INFO=1 when building PyTorch. +# Otherwise function calls will be opaque. (The resultant CallgrindStats +# will warn if debug symbols are missing.) diff --git a/recipes_source/recipes/tuning_guide.py b/recipes_source/recipes/tuning_guide.py new file mode 100644 index 000000000..9d9726ae7 --- /dev/null +++ b/recipes_source/recipes/tuning_guide.py @@ -0,0 +1,370 @@ +""" +Performance Tuning Guide +************************* +**Author**: `Szymon Migacz `_ + +Performance Tuning Guide is a set of optimizations and best practices which can +accelerate training and inference of deep learning models in PyTorch. Presented +techniques often can be implemented by changing only a few lines of code and can +be applied to a wide range of deep learning models across all domains. 
+ +General optimizations +--------------------- +""" + +############################################################################### +# Enable async data loading and augmentation +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# `torch.utils.data.DataLoader `_ +# supports asynchronous data loading and data augmentation in separate worker +# subprocesses. The default setting for ``DataLoader`` is ``num_workers=0``, +# which means that the data loading is synchronous and done in the main process. +# As a result the main training process has to wait for the data to be available +# to continue the execution. +# +# Setting ``num_workers > 0`` enables asynchronous data loading and overlap +# between the training and data loading. ``num_workers`` should be tuned +# depending on the workload, CPU, GPU, and location of training data. +# +# ``DataLoader`` accepts ``pin_memory`` argument, which defaults to ``False``. +# When using a GPU it's better to set ``pin_memory=True``, this instructs +# ``DataLoader`` to use pinned memory and enables faster and asynchronous memory +# copy from the host to the GPU. + +############################################################################### +# Disable gradient calculation for validation or inference +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# PyTorch saves intermediate buffers from all operations which involve tensors +# that require gradients. Typically gradients aren't needed for validation or +# inference. +# `torch.no_grad() `_ +# context manager can be applied to disable gradient calculation within a +# specified block of code, this accelerates execution and reduces the amount of +# required memory. +# `torch.no_grad() `_ +# can also be used as a function decorator. + +############################################################################### +# Disable bias for convolutions directly followed by a batch norm +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# `torch.nn.Conv2d() `_ +# has ``bias`` parameter which defaults to ``True`` (the same is true for +# `Conv1d `_ +# and +# `Conv3d `_ +# ). +# +# If a ``nn.Conv2d`` layer is directly followed by a ``nn.BatchNorm2d`` layer, +# then the bias in the convolution is not needed, instead use +# ``nn.Conv2d(..., bias=False, ....)``. Bias is not needed because in the first +# step ``BatchNorm`` subtracts the mean, which effectively cancels out the +# effect of bias. +# +# This is also applicable to 1d and 3d convolutions as long as ``BatchNorm`` (or +# other normalization layer) normalizes on the same dimension as convolution's +# bias. +# +# Models available from `torchvision `_ +# already implement this optimization. + +############################################################################### +# Use parameter.grad = None instead of model.zero_grad() or optimizer.zero_grad() +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Instead of calling: +model.zero_grad() +# or +optimizer.zero_grad() + +############################################################################### +# to zero out gradients, use the following method instead: + +for param in model.parameters(): + param.grad = None + +############################################################################### +# The second code snippet does not zero the memory of each individual parameter, +# also the subsequent backward pass uses assignment instead of addition to store +# gradients, this reduces the number of memory operations. 
+# +# Setting gradient to ``None`` has a slightly different numerical behavior than +# setting it to zero, for more details refer to the +# `documentation `_. +# +# Alternatively, starting from PyTorch 1.7, call ``model`` or +# ``optimizer.zero_grad(set_to_none=True)``. + +############################################################################### +# Fuse pointwise operations +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# Pointwise operations (elementwise addition, multiplication, math functions - +# ``sin()``, ``cos()``, ``sigmoid()`` etc.) can be fused into a single kernel +# to amortize memory access time and kernel launch time. +# +# `PyTorch JIT `_ can fuse kernels +# automatically, although there could be additional fusion opportunities not yet +# implemented in the compiler, and not all device types are supported equally. +# +# Pointwise operations are memory-bound, for each operation PyTorch launches a +# separate kernel. Each kernel loads data from the memory, performs computation +# (this step is usually inexpensive) and stores results back into the memory. +# +# Fused operator launches only one kernel for multiple fused pointwise ops and +# loads/stores data only once to the memory. This makes JIT very useful for +# activation functions, optimizers, custom RNN cells etc. +# +# In the simplest case fusion can be enabled by applying +# `torch.jit.script `_ +# decorator to the function definition, for example: + +@torch.jit.script +def fused_gelu(x): + return x * 0.5 * (1.0 + torch.erf(x / 1.41421)) + +############################################################################### +# Refer to +# `TorchScript documentation `_ +# for more advanced use cases. + +############################################################################### +# Enable channels_last memory format for computer vision models +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# PyTorch 1.5 introduced support for ``channels_last`` memory format for +# convolutional networks. This format is meant to be used in conjunction with +# `AMP `_ to further accelerate +# convolutional neural networks with +# `Tensor Cores `_. +# +# Support for ``channels_last`` is experimental, but it's expected to work for +# standard computer vision models (e.g. ResNet-50, SSD). To convert models to +# ``channels_last`` format follow +# `Channels Last Memory Format Tutorial `_. +# The tutorial includes a section on +# `converting existing models `_. + +############################################################################### +# Checkpoint intermediate buffers +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Buffer checkpointing is a technique to mitigate the memory capacity burden of +# model training. Instead of storing inputs of all layers to compute upstream +# gradients in backward propagation, it stores the inputs of a few layers and +# the others are recomputed during backward pass. The reduced memory +# requirements enables increasing the batch size that can improve utilization. +# +# Checkpointing targets should be selected carefully. The best is not to store +# large layer outputs that have small re-computation cost. The example target +# layers are activation functions (e.g. ``ReLU``, ``Sigmoid``, ``Tanh``), +# up/down sampling and matrix-vector operations with small accumulation depth. +# +# PyTorch supports a native +# `torch.utils.checkpoint `_ +# API to automatically perform checkpointing and recomputation. 
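+#
+# A rough sketch of checkpointing one block with
+# ``torch.utils.checkpoint.checkpoint`` (the block and sizes below are made up
+# purely for illustration, and ``torch`` is assumed to be imported):
+
+from torch.utils.checkpoint import checkpoint
+
+block = torch.nn.Sequential(
+    torch.nn.Linear(1024, 1024),
+    torch.nn.ReLU(),
+    torch.nn.Linear(1024, 1024),
+)
+x = torch.randn(8, 1024, requires_grad=True)
+
+# only the inputs of ``block`` are stored; its intermediate activations are
+# recomputed during the backward pass instead of being kept in memory
+y = checkpoint(block, x)
+y.sum().backward()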
+ +############################################################################### +# Disable debugging APIs +# ~~~~~~~~~~~~~~~~~~~~~~ +# Many PyTorch APIs are intended for debugging and should be disabled for +# regular training runs: +# +# * anomaly detection: +# `torch.autograd.detect_anomaly `_ +# or +# `torch.autograd.set_detect_anomaly(True) `_ +# * profiler related: +# `torch.autograd.profiler.emit_nvtx `_, +# `torch.autograd.profiler.profile `_ +# * autograd gradcheck: +# `torch.autograd.gradcheck `_ +# or +# `torch.autograd.gradgradcheck `_ +# + +############################################################################### +# GPU specific optimizations +# -------------------------- + +############################################################################### +# Enable cuDNN auto-tuner +# ~~~~~~~~~~~~~~~~~~~~~~~ +# `NVIDIA cuDNN `_ supports many algorithms +# to compute a convolution. Autotuner runs a short benchmark and selects the +# kernel with the best performance on a given hardware for a given input size. +# +# For convolutional networks (other types currently not supported), enable cuDNN +# autotuner before launching the training loop by setting: + +torch.backends.cudnn.benchmark = True +############################################################################### +# +# * the auto-tuner decisions may be non-deterministic; different algorithm may +# be selected for different runs. For more details see +# `PyTorch: Reproducibility `_ +# * in some rare cases, such as with highly variable input sizes, it's better +# to run convolutional networks with autotuner disabled to avoid the overhead +# associated with algorithm selection for each input size. +# + +############################################################################### +# Avoid unnecessary CPU-GPU synchronization +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Avoid unnecessary synchronizations, to let the CPU run ahead of the +# accelerator as much as possible to make sure that the accelerator work queue +# contains many operations. +# +# When possible, avoid operations which require synchronizations, for example: +# +# * ``print(cuda_tensor)`` +# * ``cuda_tensor.item()`` +# * memory copies: ``tensor.cuda()``, ``cuda_tensor.cpu()`` and equivalent +# ``tensor.to(device)`` calls +# * ``cuda_tensor.nonzero()`` +# * python control flow which depends on results of operations performed on cuda +# tensors e.g. ``if (cuda_tensor != 0).all()`` +# + +############################################################################### +# Create tensors directly on the target device +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Instead of calling ``torch.rand(size).cuda()`` to generate a random tensor, +# produce the output directly on the target device: +# ``torch.rand(size, device=torch.device('cuda'))``. +# +# This is applicable to all functions which create new tensors and accept +# ``device`` argument: +# `torch.rand() `_, +# `torch.zeros() `_, +# `torch.full() `_ +# and similar. + +############################################################################### +# Use mixed precision and AMP +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Mixed precision leverages +# `Tensor Cores `_ +# and offers up to 3x overall speedup on Volta and newer GPU architectures. To +# use Tensor Cores AMP should be enabled and matrix/tensor dimensions should +# satisfy requirements for calling kernels that use Tensor Cores. 
+# +# To use Tensor Cores: +# +# * set sizes to multiples of 8 (to map onto dimensions of Tensor Cores) +# +# * see +# `Deep Learning Performance Documentation +# `_ +# for more details and guidelines specific to layer type +# * if layer size is derived from other parameters rather than fixed, it can +# still be explicitly padded e.g. vocabulary size in NLP models +# +# * enable AMP +# +# * Introduction to Mixed Precision Training and AMP: +# `video `_, +# `slides `_ +# * native PyTorch AMP is available starting from PyTorch 1.6: +# `documentation `_, +# `examples `_, +# `tutorial `_ +# +# + +############################################################################### +# Pre-allocate memory in case of variable input length +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Models for speech recognition or for NLP are often trained on input tensors +# with variable sequence length. Variable length can be problematic for PyTorch +# caching allocator and can lead to reduced performance or to unexpected +# out-of-memory errors. If a batch with a short sequence length is followed by +# an another batch with longer sequence length, then PyTorch is forced to +# release intermediate buffers from previous iteration and to re-allocate new +# buffers. This process is time consuming and causes fragmentation in the +# caching allocator which may result in out-of-memory errors. +# +# A typical solution is to implement pre-allocation. It consists of the +# following steps: +# +# #. generate a (usually random) batch of inputs with maximum sequence length +# (either corresponding to max length in the training dataset or to some +# predefined threshold) +# #. execute a forward and a backward pass with the generated batch, do not +# execute an optimizer or a learning rate scheduler, this step pre-allocates +# buffers of maximum size, which can be reused in subsequent +# training iterations +# #. zero out gradients +# #. proceed to regular training +# + +############################################################################### +# Distributed optimizations +# ------------------------- + +############################################################################### +# Use efficient data-parallel backend +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# PyTorch has two ways to implement data-parallel training: +# +# * `torch.nn.DataParallel `_ +# * `torch.nn.parallel.DistributedDataParallel `_ +# +# ``DistributedDataParallel`` offers much better performance and scaling to +# multiple-GPUs. For more information refer to the +# `relevant section of CUDA Best Practices `_ +# from PyTorch documentation. + +############################################################################### +# Skip unnecessary all-reduce if training with DistributedDataParallel and gradient accumulation +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# By default +# `torch.nn.parallel.DistributedDataParallel `_ +# executes gradient all-reduce after every backward pass to compute the average +# gradient over all workers participating in the training. If training uses +# gradient accumulation over N steps, then all-reduce is not necessary after +# every training step, it's only required to perform all-reduce after the last +# call to backward, just before the execution of the optimizer. +# +# ``DistributedDataParallel`` provides +# `no_sync() `_ +# context manager which disables gradient all-reduce for particular iteration. 
+# ``no_sync()`` should be applied to first ``N-1`` iterations of gradient +# accumulation, the last iteration should follow the default execution and +# perform the required gradient all-reduce. + +############################################################################### +# Match the order of layers in constructors and during the execution if using DistributedDataParallel(find_unused_parameters=True) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# `torch.nn.parallel.DistributedDataParallel `_ +# with ``find_unused_parameters=True`` uses the order of layers and parameters +# from model constructors to build buckets for ``DistributedDataParallel`` +# gradient all-reduce. ``DistributedDataParallel`` overlaps all-reduce with the +# backward pass. All-reduce for a particular bucket is asynchronously triggered +# only when all gradients for parameters in a given bucket are available. +# +# To maximize the amount of overlap, the order in model constructors should +# roughly match the order during the execution. If the order doesn't match, then +# all-reduce for the entire bucket waits for the gradient which is the last to +# arrive, this may reduce the overlap between backward pass and all-reduce, +# all-reduce may end up being exposed, which slows down the training. +# +# ``DistributedDataParallel`` with ``find_unused_parameters=False`` (which is +# the default setting) relies on automatic bucket formation based on order of +# operations encountered during the backward pass. With +# ``find_unused_parameters=False`` it's not necessary to reorder layers or +# parameters to achieve optimal performance. + +############################################################################### +# Load-balance workload in a distributed setting +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Load imbalance typically may happen for models processing sequential data +# (speech recognition, translation, language models etc.). If one device +# receives a batch of data with sequence length longer than sequence lengths for +# the remaining devices, then all devices wait for the worker which finishes +# last. Backward pass functions as an implicit synchronization point in a +# distributed setting with +# `DistributedDataParallel `_ +# backend. +# +# There are multiple ways to solve the load balancing problem. The core idea is +# to distribute workload over all workers as uniformly as possible within each +# global batch. For example Transformer solves imbalance by forming batches with +# approximately constant number of tokens (and variable number of sequences in a +# batch), other models solve imbalance by bucketing samples with similar +# sequence length or even by sorting dataset by sequence length. diff --git a/recipes_source/recipes/what_is_state_dict.py b/recipes_source/recipes/what_is_state_dict.py index 89e7de096..a8a135cd4 100644 --- a/recipes_source/recipes/what_is_state_dict.py +++ b/recipes_source/recipes/what_is_state_dict.py @@ -29,7 +29,7 @@ :: - pip install torchaudio + pip install torch """ diff --git a/recipes_source/recipes/zeroing_out_gradients.py b/recipes_source/recipes/zeroing_out_gradients.py index 86c32fdf2..6649a6695 100644 --- a/recipes_source/recipes/zeroing_out_gradients.py +++ b/recipes_source/recipes/zeroing_out_gradients.py @@ -1,32 +1,33 @@ """ PyTorch에서 변화도를 0으로 만들기 ================================ -신경망을 구축할 때는 변화도를 0으로 만들어 주는 것이 좋습니다. 기본적으로 +신경망을 구축할 때는 변화도를 0으로 만들어 주는 것이 좋습니다. 
기본적으로 ``.backward()`` 를 호출할 때마다 변화도가 버퍼에 쌓이기 때문입니다. (덮어쓰지 않는다는 의미입니다.) 개요 ------------ -신경망을 학습시킬 때, 경사 하강법을 거쳐 모델 정확도를 높일 수 있습니다. 경사 하강법은 간단히 -설명해 모델의 가중치와 편향을 약간씩 수정하면서 손실(또는 오류)를 최소화하는 과정입니다. +신경망을 학습시킬 때, 경사 하강법을 거쳐 모델 정확도를 높일 수 있습니다. 경사 하강법은 간단히 +설명해 모델의 가중치와 편향을 약간씩 수정하면서 손실(또는 오류)를 최소화하는 과정입니다. -``torch.Tensor`` 는 PyTorch 의 핵심이 되는 클래스 입니다. 텐서를 생성할 때 -``.requires_grad`` 속성을 ``True`` 로 설정하면, 텐서에 가해진 모든 연산을 추적합니다. +``torch.Tensor`` 는 PyTorch 의 핵심이 되는 클래스 입니다. 텐서를 생성할 때 +``.requires_grad`` 속성을 ``True`` 로 설정하면, 텐서에 가해진 모든 연산을 추적합니다. 뒤따르는 모든 역전파 단계에서도 마찬가지로, 이 텐서의 변화도는 ``.grad`` 속성에 누적될 것입니다. 모든 변화도의 축적 또는 합은 손실 텐서에서 ``.backward()`` 를 호출할 때 계산됩니다. -텐서의 변화도를 0으로 만들어 주어야 하는 경우도 있습니다. 예를 들어 학습 과정 반복문을 -시작할 때, 누적되는 변화도를 정확하게 추적하기 위해서는 변화도를 우선 0으로 만들어 주어야 합니다. +텐서의 변화도를 0으로 만들어 주어야 하는 경우도 있습니다. 예를 들어 학습 과정 반복문을 +시작할 때, 누적되는 변화도를 정확하게 추적하기 위해서는 변화도를 우선 0으로 만들어 주어야 합니다. 이 레시피에서는 PyTorch 라이브러리를 사용하여 변화도를 0으로 만드는 방법을 배워봅니다. PyTorch에 내장된 ``CIFAR10`` 데이터셋에 대하여 신경망을 훈련시키는 과정을 통해 알아봅시다. 설정 ----- 이 레시피에는 데이터를 학습시키는 내용이 포함되어 있기 때문에, 실행 가능한 노트북 파일이 있다면 -런타임을 GPU 또는 TPU로 전환하는 것이 좋습니다. 시작하기에 앞서, ``torch`` 와 +런타임을 GPU 또는 TPU로 전환하는 것이 좋습니다. 시작하기에 앞서, ``torch`` 와 ``torchvision`` 패키지가 없다면 설치합니다. :: + pip install torch pip install torchvision @@ -36,21 +37,21 @@ ###################################################################### # 단계(Steps) # ----- -# +# # 1단계부터 4단계까지는 학습을 위한 데이터와 신경망을 준비하며, 5단계에서 변화도를 0으로 -# 만들어 줍니다. 이미 준비한 데이터와 신경망이 있다면 5단계로 건너뛰어도 좋습니다. -# +# 만들어 줍니다. 이미 준비한 데이터와 신경망이 있다면 5단계로 건너뛰어도 좋습니다. +# # 1. 데이터를 불러오기 위해 필요한 모든 라이브러리 import 하기 # 2. 데이터셋 불러오고 정규화하기 # 3. 신경망 구축하기 # 4. 손실 함수 정의하기 # 5. 신경망을 학습시킬 때 변화도 0으로 만들기 -# +# # 1. 데이터를 불러오기 위해 필요한 모든 라이브러리 import 하기 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # 이 레시피에서는 데이터셋에 접근하기 위해 ``torch`` 와 ``torchvision`` 을 사용합니다. -# +# import torch @@ -66,10 +67,10 @@ ###################################################################### # 2. 데이터셋 불러오고 정규화하기 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# PyTorch는 다양한 내장 데이터셋을 제공합니다. PyTorch에서 데이터 불러오기 레시피를 참고해 +# +# PyTorch는 다양한 내장 데이터셋을 제공합니다. PyTorch에서 데이터 불러오기 레시피를 참고해 # 더 많은 정보를 얻을 수 있습니다. -# +# transform = transforms.Compose( [transforms.ToTensor(), @@ -92,10 +93,10 @@ ###################################################################### # 3. 신경망 구축하기 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # 컨볼루션 신경망을 정의하겠습니다. 자세한 내용은 신경망 정의하기 레시피를 # 참조해주세요. -# +# class Net(nn.Module): def __init__(self): @@ -120,9 +121,9 @@ def forward(self, x): ###################################################################### # 4. 손실 함수과 옵티마이저 정의하기 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # 분류를 위한 Cross-Entropy 손실 함수와 모멘텀을 설정한 SGD 옵티마이저를 사용합니다. -# +# net = Net() criterion = nn.CrossEntropyLoss() @@ -132,14 +133,14 @@ def forward(self, x): ###################################################################### # 5. 신경망을 학습시키는 동안 변화도를 0으로 만들기 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # 이제부터 흥미로운 부분을 살펴보려고 합니다. # 여기서 할 일은 데이터 이터레이터를 순회하면서, 신경망에 입력을 주고 # 최적화하는 것입니다. -# -# 데이터의 엔터티 각각의 변화도를 0으로 만들어주는 것에 유의하십시오. -# 신경망을 학습시킬 때 불필요한 정보를 추적하지 않도록 하기 위함입니다. -# +# +# 데이터의 엔터티 각각의 변화도를 0으로 만들어주는 것에 유의하십시오. +# 신경망을 학습시킬 때 불필요한 정보를 추적하지 않도록 하기 위함입니다. +# for epoch in range(2): # 전체 데이터셋을 여러번 반복하기 @@ -168,16 +169,16 @@ def forward(self, x): ###################################################################### -# ``model.zero_grad()`` 를 사용해도 변화도를 0으로 만들 수 있습니다. +# ``model.zero_grad()`` 를 사용해도 변화도를 0으로 만들 수 있습니다. # 이는 옵티마이저에 모든 모델 파라미터가 포함되는 한 ``optimizer.zero_grad()`` 를 -# 사용하는 것과 동일합니다. 
어떤 것을 사용할 것인지 최선의 선택을 하기 바랍니다. -# -# 축하합니다! 이제 PyTorch에서 변화도를 0으로 만들 수 있습니다. -# +# 사용하는 것과 동일합니다. 어떤 것을 사용할 것인지 최선의 선택을 하기 바랍니다. +# +# 축하합니다! 이제 PyTorch에서 변화도를 0으로 만들 수 있습니다. +# # 더 알아보기 # ---------- -# +# # 다른 레시피를 둘러보고 계속 배워보세요: -# +# # - :doc:`/recipes/recipes/loading_data_recipe` # - :doc:`/recipes/recipes/save_load_across_devices` diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index a85763f18..b5a174690 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -88,16 +88,30 @@ Recipes are bite-sized bite-sized, actionable examples of how to use specific Py .. customcarditem:: :header: PyTorch에서 변화도를 0으로 만들기 - :card_description: 변화도를 언제 0으로 만들어야 하며, 그렇게 하는 것이 모델의 정확도를 높이는 데에 어떻게 도움이 되는지 알아봅니다. + :card_description: 변화도를 언제 0으로 만들어야 하며, 그렇게 하는 것이 모델의 정확도를 높이는 데에 어떻게 도움이 되는지 알아봅니다. :image: ../_static/img/thumbnails/cropped/zeroing-out-gradients.PNG :link: ../recipes/recipes/zeroing_out_gradients.html :tags: Basics +.. customcarditem:: + :header: Pytorch 벤치마크 + :card_description: PyTorch의 벤치마크 모듈을 사용하여 코드의 성능을 측정하고 비교하는 방법을 알아봅니다. + :image: ../_static/img/thumbnails/cropped/profiler.png + :link: ../recipes/recipes/benchmark.html + :tags: Basics + +.. customcarditem:: + :header: PyTorch Benchmark (quick start) + :card_description: Learn how to measure snippet run times and collect instructions. + :image: ../_static/img/thumbnails/cropped/profiler.png + :link: ../recipes/recipes/timer_quick_start.html + :tags: Basics + .. customcarditem:: :header: Pytorch 프로파일러 - :card_description: PyTorch의 프로파일러를 사용하여 운영자 시간과 메모리 소비량을 측정하는 방법을 알아봅니다. + :card_description: PyTorch의 프로파일러를 사용하여 연산 시간과 메모리 소비량을 측정하는 방법을 알아봅니다. :image: ../_static/img/thumbnails/cropped/profiler.png - :link: ../recipes/recipes/profiler.html + :link: ../recipes/recipes/profiler_recipe.html :tags: Basics .. Customization @@ -165,6 +179,79 @@ Recipes are bite-sized bite-sized, actionable examples of how to use specific Py :link: ../recipes/android_native_app_with_custom_op.html :tags: Mobile +.. customcarditem:: + :header: Fuse Modules recipe + :card_description: Learn how to fuse a list of PyTorch modules into a single module to reduce the model size before quantization. + :image: ../_static/img/thumbnails/cropped/mobile.png + :link: ../recipes/fuse.html + :tags: Mobile + +.. customcarditem:: + :header: Quantization for Mobile Recipe + :card_description: Learn how to reduce the model size and make it run faster without losing much on accuracy. + :image: ../_static/img/thumbnails/cropped/mobile.png + :link: ../recipes/quantization.html + :tags: Mobile,Quantization + +.. customcarditem:: + :header: Script and Optimize for Mobile + :card_description: Learn how to convert the model to TorchScipt and (optional) optimize it for mobile apps. + :image: ../_static/img/thumbnails/cropped/mobile.png + :link: ../recipes/script_optimized.html + :tags: Mobile + +.. customcarditem:: + :header: Model Preparation for iOS Recipe + :card_description: Learn how to add the model in an iOS project and use PyTorch pod for iOS. + :image: ../_static/img/thumbnails/cropped/ios.png + :link: ../recipes/model_preparation_ios.html + :tags: Mobile + +.. customcarditem:: + :header: Model Preparation for Android Recipe + :card_description: Learn how to add the model in an Android project and use the PyTorch library for Android. + :image: ../_static/img/thumbnails/cropped/android.png + :link: ../recipes/model_preparation_android.html + :tags: Mobile + +.. 
customcarditem:: + :header: Profiling PyTorch RPC-Based Workloads + :card_description: How to use the PyTorch profiler to profile RPC-based workloads. + :image: ../_static/img/thumbnails/cropped/profile.png + :link: ../recipes/distributed_rpc_profiling.html + :tags: Production + +.. Automatic Mixed Precision +.. customcarditem:: + :header: Automatic Mixed Precision + :card_description: Use torch.cuda.amp to reduce runtime and save memory on NVIDIA GPUs. + :image: ../_static/img/thumbnails/cropped/amp.png + :link: ../recipes/recipes/amp_recipe.html + :tags: Model-Optimization + +.. Performance +.. customcarditem:: + :header: Performance Tuning Guide + :card_description: Tips for achieving optimal performance. + :image: ../_static/img/thumbnails/cropped/profiler.png + :link: ../recipes/recipes/tuning_guide.html + :tags: Model-Optimization + +.. Distributed Training +.. customcarditem:: + :header: Shard Optimizer States with ZeroRedundancyOptimizer + :card_description: How to use ZeroRedundancyOptimizer to reduce memory consumption. + :image: ../_static/img/thumbnails/cropped/profiler.png + :link: ../recipes/zero_redundancy_optimizer.html + :tags: Distributed-Training + +.. customcarditem:: + :header: Direct Device-to-Device Communication with TensorPipe RPC + :card_description: How to use RPC with direct GPU-to-GPU communication. + :image: ../_static/img/thumbnails/cropped/profiler.png + :link: ../recipes/cuda_rpc.html + :tags: Distributed-Training + .. End of tutorial card section .. raw:: html @@ -192,11 +279,14 @@ Recipes are bite-sized bite-sized, actionable examples of how to use specific Py /recipes/recipes/warmstarting_model_using_parameters_from_a_different_model /recipes/recipes/save_load_across_devices /recipes/recipes/zeroing_out_gradients - /recipes/recipes/profiler - /recipes/recipes/custom_dataset_transforms_loader + /recipes/recipes/profiler_recipe /recipes/recipes/Captum_Recipe /recipes/recipes/tensorboard_with_pytorch /recipes/recipes/dynamic_quantization + /recipes/recipes/amp_recipe + /recipes/recipes/tuning_guide /recipes/torchscript_inference /recipes/deployment_with_flask /recipes/distributed_rpc_profiling + /recipes/zero_redundancy_optimizer + /recipes/cuda_rpc \ No newline at end of file diff --git a/recipes_source/script_optimized.rst b/recipes_source/script_optimized.rst new file mode 100644 index 000000000..e1c0f19ad --- /dev/null +++ b/recipes_source/script_optimized.rst @@ -0,0 +1,211 @@ +Script and Optimize for Mobile Recipe +===================================== + +This recipe demonstrates how to convert a PyTorch model to TorchScript which can run in a high-performance C++ environment such as iOS and Android, and how to optimize the converted TorchScript model for mobile deployment. + +Introduction +------------ + +After a PyTorch model is trained and optionally but preferably quantized (see `Quantization Recipe `_ for more details), one essential step before the model can be used in iOS and Android apps is to convert the Python-dependent model to TorchScript, which can then further be optimized for mobile apps. Conversion to TorchScript can be as simple as a single call, or as complicated as changing the original model in many different places. + +Pre-requisites +-------------- + +PyTorch 1.6.0 or 1.7.0 + +Conversion to TorchScript +------------------------- + +There are two basic ways to convert a PyTorch model to TorchScript, using `trace` and `script`. Mixing `trace` and `script` may also be needed in some cases - see `here `_ for more information. 
+ +Use the `trace` Method +^^^^^^^^^^^^^^^^^^^^^^ + +To use the `trace` method on a model, an example or dummy input for the model needs to be specified, the actual input size needs to be the same as the example input size, and the model definition cannot have control flow such as `if` or `for`. The reason for these constraints is that running `trace` on a model with an example input simply calls the model's `forward` method with the input and all operations executed in the model layers are recorded, creating the trace of the model. + +:: + + import torch + + dummy_input = torch.rand(1, 3, 224, 224) + torchscript_model = torch.jit.trace(model_quantized, dummy_input) + + +Use the `script` Method +^^^^^^^^^^^^^^^^^^^^^^^ + +For the example above, calling `script` below makes no difference: + +:: + + torchscript_model = torch.jit.script(model_quantized) + +But if a model has some flow control, then `trace` won't correctly record all the possible traces. Take some code snippet of an example model definition from `here `_ for example: + +:: + + import torch + + class MyDecisionGate(torch.nn.Module): + def forward(self, x): + if x.sum() > 0: + return x + else: + return -x + + x = torch.rand(3, 4) + traced_cell = torch.jit.trace(MyDecisionGate(), x) + print(traced_cell.code) + +The code above will output: + +:: + + TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can''t record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + + if x.sum() > 0: + def forward(self, + x: Tensor) -> Tensor: + return x + + +Note that "the trace might not generalize to other inputs" warning above means that if the model has any kind of data-dependent control flow, `trace` is not the right answer. But if we replace the last two lines of the Python code snippet above (before the code output) with: + +:: + + scripted_cell = torch.jit.script(MyDecisionGate()) + print(scripted_cell.code) + +The scripted model as shown by the `print` result below will be covering all possible inputs, thus generalizing to other inputs: + +:: + + def forward(self, + x: Tensor) -> Tensor: + _0 = bool(torch.gt(torch.sum(x, dtype=None), 0)) + if _0: + _1 = x + else: + _1 = torch.neg(x) + return _1 + + +This is another example of using `trace` and `script` - it converts the model trained in the PyTorch tutorial `NLP FROM SCRATCH: TRANSLATION WITH A SEQUENCE TO SEQUENCE NETWORK AND ATTENTION `_: + +:: + + encoder = EncoderRNN(input_lang.n_words, hidden_size) + decoder = AttnDecoderRNN(hidden_size, output_lang.n_words) + + # method 1: using trace with example inputs + + encoder_input=torch.tensor([1]) + encoder_hidden=torch.zeros(1, 1, hidden_size) + + decoder_input1=torch.tensor([[0]]) + decoder_input2=torch.zeros(1, 1, hidden_size) + decoder_input3=torch.zeros(MAX_LENGTH, hidden_size) + + traced_encoder = torch.jit.trace(encoder, (encoder_input, encoder_hidden)) + traced_decoder = torch.jit.trace(decoder, (decoder_input1, decoder_input2, decoder_input3)) + + # method 2: using script + + scripted_encoder = torch.jit.script(encoder) + scripted_decoder = torch.jit.script(decoder) + +So is it true that one can simply always use the `script` call and the model is converted to TorchScript? The answer is no, because TorchScript is actually a subset of Python and to make `script` work, the PyTorch model definition must only use the language features of that TorchScript subset of Python. 
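As an illustration of such a restriction (a toy sketch, not taken from any tutorial): TorchScript requires a single, consistent return type, so a `forward` method that returns a `Tensor` on one branch and an `int` on another fails to compile under `script`.

::

    import torch

    class Inconsistent(torch.nn.Module):
        def forward(self, x):
            if x.sum() > 0:
                return x   # Tensor on this branch
            return 0       # int on this branch -- not allowed in TorchScript

    try:
        torch.jit.script(Inconsistent())
    except Exception as e:
        # the compiler rejects the mismatched return types of the two branches
        print(type(e).__name__)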
`TorchScript Language Reference `_ covers all the details of what is supported in TorchScript. Below we describe some of the common errors encountered when using the `script` method. + + +Fix Common Errors When Using the `script` Method +---------------------------------------------------- + +If you apply the `script` method to a non-trivial model, chances are you will encounter several types of errors. Check out `this tutorial `_ for a complete example of converting a chatbot model to TorchScript. Follow the steps below to fix the common errors when you run the `script` method: + +1. RuntimeError `attribute lookup is not defined on python value of type` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For this error, pass the value of the model as a parameter in the constructor. This is because when calling `script` on a model that accepts another model as a parameter, the model passed is actually of type `TracedModule` or `ScriptModule`, not of type `Module`, so the model attribute is not defined when scripting. + +For example, the `LuongAttnDecoderRNN` module in the tutorial above has an attribute `n_layers`, and the `GreedySearchDecoder` module refers to the `n_layers` attribute of a `decoder` instance of the `LuongAttnDecoderRNN` module, so in order to make `script` work, the `GreedySearchDecoder` module's constructor needs to be changed from: + +:: + + def __init__(self, encoder, decoder): + +to: + +:: + + def __init__(self, encoder, decoder, decoder_n_layers): + ... + self._decoder_n_layers = decoder_n_layers + + +and the `GreedySearchDecoder`'s `forward` method needs to refer to `self._decoder_n_layers` instead of `decoder.n_layers`. + +2. RuntimeError `python value of type '...' cannot be used as a value.` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The complete error message for this one continues with `Perhaps it is a closed over global variable? If so, please consider passing it in as an argument or use a local variable instead.` To fix this error, store global variables' values as attributes in the model constructor (there's no need to add them to a special list called `__constants__`). The reason is that global values can be used conveniently in normal model training and inference, but they are not accessible during scripting. + +For example, `device` and `SOS_token` are global variables, and to make `script` work, they need to be added to the `GreedySearchDecoder`'s constructor: + +:: + + self._device = device + self._SOS_token = SOS_token + +and referred to as `self._device` and `self._SOS_token` instead of `device` and `SOS_token` in the `GreedySearchDecoder`'s `forward` method. + +3. RuntimeError `all inputs of range must be '...', found Tensor (inferred) in argument` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The error message continues with `add type definitions for each of the module's forward method arguments`. Because all parameters to a TorchScript function are assumed to be of the `torch.Tensor` type by default, you need to explicitly declare the type of each parameter that is not of type `Tensor`. For a complete list of TorchScript-supported types, see `here `_.
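As a generic sketch of the fix (a hypothetical toy module, not the tutorial's `GreedySearchDecoder`): leave `Tensor` arguments un-annotated and add explicit annotations to every non-`Tensor` argument.

::

    import torch
    from typing import List

    class Toy(torch.nn.Module):
        def forward(self, x, scale: float, offsets: List[int]):
            # x is un-annotated, so TorchScript infers it as a Tensor (the default);
            # scale and offsets are not Tensors, so they carry explicit annotations
            out = x * scale
            for off in offsets:
                out = out + off
            return out

    scripted = torch.jit.script(Toy())
    print(scripted.code)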
+ +For example, the `GreedySearchDecoder`'s `forward` method signature needs to be changed from: + +:: + + def forward(self, input_seq, input_length, max_length): + +to: + +:: + + def forward(self, input_seq, input_length, max_length : int): + +After using the `trace` or `script` method above, and fixing possible errors, you should have a TorchScript model ready to be optimized for mobile. + + +Optimize a TorchScript Model +-------------------------------------- + +Simply run the following code snippet to optimize a TorchScript model generated with the `trace` and/or `script` method: + +:: + + from torch.utils.mobile_optimizer import optimize_for_mobile + optimized_torchscript_model = optimize_for_mobile(torchscript_model) + +The optimized model can then be saved and deployed in mobile apps: + +:: + + optimized_torchscript_model.save("optimized_torchscript_model.pth") + +By default, `optimize_for_mobile` will perform the following types of optimizations: + +* Conv2D and BatchNorm fusion which folds Conv2d-BatchNorm2d into Conv2d; + +* Insert and fold prepacked ops which rewrites the model graph to replace 2D convolutions and linear ops with their prepacked counterparts. + +* ReLU and hardtanh fusion which rewrites graph by finding ReLU/hardtanh ops and fuses them together. + +* Dropout removal which removes dropout nodes from this module when training is false. + + +Learn More +----------------- +1. The official `TorchScript Language Reference `_. +2. The `torch.utils.mobile_optimizer` `API documentation `_. diff --git a/recipes_source/zero_redundancy_optimizer.rst b/recipes_source/zero_redundancy_optimizer.rst new file mode 100644 index 000000000..bec480344 --- /dev/null +++ b/recipes_source/zero_redundancy_optimizer.rst @@ -0,0 +1,157 @@ +Shard Optimizer States with ZeroRedundancyOptimizer +=================================================== + +.. note:: `ZeroRedundancyOptimizer` is introduced in PyTorch 1.8 as a prototype + feature. This API is subject to change. + +In this recipe, you will learn: + +- The high-level idea of `ZeroRedundancyOptimizer `__. +- How to use `ZeroRedundancyOptimizer `__ + in distributed training and its impact. + + +Requirements +------------ + +- PyTorch 1.8+ +- `Getting Started With Distributed Data Parallel `_ + + +What is ``ZeroRedundancyOptimizer``? +------------------------------------ + +The idea of `ZeroRedundancyOptimizer `__ +comes from `DeepSpeed/ZeRO project `_ and +`Marian `_ that shard +optimizer states across distributed data-parallel processes to +reduce per-process memory footprint. In the +`Getting Started With Distributed Data Parallel `_ +tutorial, we have shown how to use +`DistributedDataParallel `_ +(DDP) to train models. In that tutorial, each process keeps a dedicated replica +of the optimizer. Since DDP has already synchronized gradients in the +backward pass, all optimizer replicas will operate on the same parameter and +gradient values in every iteration, and this is how DDP keeps model replicas in +the same state. Oftentimes, optimizers also maintain local states. For example, +the ``Adam`` optimizer uses per-parameter ``exp_avg`` and ``exp_avg_sq`` states. As a +result, the ``Adam`` optimizer's memory consumption is at least twice the model +size. Given this observation, we can reduce the optimizer memory footprint by +sharding optimizer states across DDP processes. 
More specifically, instead of +creating per-param states for all parameters, each optimizer instance in +different DDP processes only keeps optimizer states for a shard of all model +parameters. The optimizer ``step()`` function only updates the parameters in its +shard and then broadcasts its updated parameters to all other peer DDP +processes, so that all model replicas still land in the same state. + +How to use ``ZeroRedundancyOptimizer``? +--------------------------------------- + +The code below demonstrates how to use +`ZeroRedundancyOptimizer `__. +The majority of the code is similar to the simple DDP example presented in +`Distributed Data Parallel notes `_. +The main difference is the ``if-else`` clause in the ``example`` function which +wraps optimizer constructions, toggling between +`ZeroRedundancyOptimizer `__ +and ``Adam`` optimizer. + + +:: + + import os + import torch + import torch.distributed as dist + import torch.multiprocessing as mp + import torch.nn as nn + import torch.optim as optim + from torch.distributed.optim import ZeroRedundancyOptimizer + from torch.nn.parallel import DistributedDataParallel as DDP + + def print_peak_memory(prefix, device): + if device == 0: + print(f"{prefix}: {torch.cuda.max_memory_allocated(device) // 1e6}MB ") + + def example(rank, world_size, use_zero): + torch.manual_seed(0) + torch.cuda.manual_seed(0) + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '29500' + # create default process group + dist.init_process_group("gloo", rank=rank, world_size=world_size) + + # create local model + model = nn.Sequential(*[nn.Linear(2000, 2000).to(rank) for _ in range(20)]) + print_peak_memory("Max memory allocated after creating local model", rank) + + # construct DDP model + ddp_model = DDP(model, device_ids=[rank]) + print_peak_memory("Max memory allocated after creating DDP", rank) + + # define loss function and optimizer + loss_fn = nn.MSELoss() + if use_zero: + optimizer = ZeroRedundancyOptimizer( + ddp_model.parameters(), + optimizer_class=torch.optim.Adam, + lr=0.01 + ) + else: + optimizer = torch.optim.Adam(ddp_model.parameters(), lr=0.01) + + # forward pass + outputs = ddp_model(torch.randn(20, 2000).to(rank)) + labels = torch.randn(20, 2000).to(rank) + # backward pass + loss_fn(outputs, labels).backward() + + # update parameters + print_peak_memory("Max memory allocated before optimizer step()", rank) + optimizer.step() + print_peak_memory("Max memory allocated after optimizer step()", rank) + + print(f"params sum is: {sum(model.parameters()).sum()}") + + + + def main(): + world_size = 2 + print("=== Using ZeroRedundancyOptimizer ===") + mp.spawn(example, + args=(world_size, True), + nprocs=world_size, + join=True) + + print("=== Not Using ZeroRedundancyOptimizer ===") + mp.spawn(example, + args=(world_size, False), + nprocs=world_size, + join=True) + + if __name__=="__main__": + main() + +The output is shown below. When enabling ``ZeroRedundancyOptimizer`` with ``Adam``, +the optimizer ``step()`` peak memory consumption is half of vanilla ``Adam``'s +memory consumption. This agrees with our expectation, as we are sharding +``Adam`` optimizer states across two processes. The output also shows that, with +``ZeroRedundancyOptimizer``, the model parameters still end up with the same +values after one iterations (the parameters sum is the same with and without +``ZeroRedundancyOptimizer``). 
+ +:: + + === Using ZeroRedundancyOptimizer === + Max memory allocated after creating local model: 335.0MB + Max memory allocated after creating DDP: 656.0MB + Max memory allocated before optimizer step(): 992.0MB + Max memory allocated after optimizer step(): 1361.0MB + params sum is: -3453.6123046875 + params sum is: -3453.6123046875 + === Not Using ZeroRedundancyOptimizer === + Max memory allocated after creating local model: 335.0MB + Max memory allocated after creating DDP: 656.0MB + Max memory allocated before optimizer step(): 992.0MB + Max memory allocated after optimizer step(): 1697.0MB + params sum is: -3453.6123046875 + params sum is: -3453.6123046875 diff --git a/requirements.txt b/requirements.txt index b5114e0bc..b5cc8b462 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,9 @@ -# Refer to ./jenkins/build.sh for tutorial build instructions +# Refer to ./jenkins/build.sh for tutorial build instructions -sphinx +sphinx==1.8.2 sphinx-gallery==0.3.1 +docutils==0.16 +sphinx-copybutton sphinx-sitemap tqdm numpy @@ -14,7 +16,9 @@ PyHamcrest bs4 awscli==1.16.35 flask -spacy +spacy==2.3.2 +ray[tune] + # PyTorch Theme -e git+git://github.com/9bow/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme @@ -22,9 +26,13 @@ spacy ipython # to run examples +boto3 pandas +requests scikit-image -# pillow >= 4.2 will throw error when trying to write mode RGBA as JPEG, -# this is a workaround to the issue. -pillow==4.1.1 +scipy +pillow==8.1.1 wget +gym +gym-super-mario-bros==7.3.0 +timm \ No newline at end of file