diff --git a/README.md b/README.md
index 3be19e741..7cca099ed 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ PyTorch에서 제공하는 튜토리얼의 한국어 번역을 위한 저장소
 
 ## 원문
 
-현재 PyTorch v1.9 튜토리얼 번역이 진행 중입니다. ([pytorch/tutorials@2571e95](https://github.com/pytorch/tutorials/commit/2571e95df42b8ed46d11ac9827c637fa4e826dfe) 기준)
+현재 PyTorch v1.10.1 튜토리얼([pytorch/tutorials@444fbd1](https://github.com/pytorch/tutorials/commit/444fbd16f2ddf9967baf8b06e83867a141b071c2) 기준) 번역이 진행 중입니다.
 
 최신 버전의 튜토리얼(공식, 영어)은 [PyTorch tutorials 사이트](https://pytorch.org/tutorials) 및 [PyTorch tutorials 저장소](https://github.com/pytorch/tutorials)를 참고해주세요.
 
@@ -46,5 +46,5 @@ v1.0 이후 번역은 별도 저장소로 관리하지 않습니다. [이 저장
 | 0.3.1 | [PyTorch-tutorials-kr-0.3.1](https://9bow.github.io/PyTorch-tutorials-kr-0.3.1) | [GitHub 저장소](https://github.com/9bow/PyTorch-tutorials-kr-0.3.1) |
 
 ---
-This is a project to translate [pytorch/tutorials@2571e95](https://github.com/pytorch/tutorials/commit/2571e95df42b8ed46d11ac9827c637fa4e826dfe) into Korean.
+This is a project to translate [pytorch/tutorials@444fbd1](https://github.com/pytorch/tutorials/commit/444fbd16f2ddf9967baf8b06e83867a141b071c2) into Korean.
 For the latest version, please visit to the [official PyTorch tutorials repo](https://github.com/pytorch/tutorials).
diff --git a/advanced_source/torch_script_custom_ops.rst b/advanced_source/torch_script_custom_ops.rst
index 0264f1e7f..1b938b79f 100644
--- a/advanced_source/torch_script_custom_ops.rst
+++ b/advanced_source/torch_script_custom_ops.rst
@@ -579,13 +579,13 @@ custom operator, that loads and executes a serialized TorchScript model:
   }
 
   // Deserialize the ScriptModule from a file using torch::jit::load().
-  std::shared_ptr<torch::jit::script::Module> module = torch::jit::load(argv[1]);
+  torch::jit::script::Module module = torch::jit::load(argv[1]);
 
   std::vector<torch::jit::IValue> inputs;
   inputs.push_back(torch::randn({4, 8}));
   inputs.push_back(torch::randn({8, 5}));
 
-  torch::Tensor output = module->forward(std::move(inputs)).toTensor();
+  torch::Tensor output = module.forward(std::move(inputs)).toTensor();
 
   std::cout << output << std::endl;
 }
@@ -1029,5 +1029,5 @@ visible to TorchScript:
 
 >>> import torch
 >>> torch.ops.load_library("warp_perspective.so")
->>> print(torch.ops.custom.warp_perspective)
+>>> print(torch.ops.my_ops.warp_perspective)
diff --git a/beginner_source/basics/optimization_tutorial.py b/beginner_source/basics/optimization_tutorial.py
index ab9a0d72b..0fd4d6af5 100644
--- a/beginner_source/basics/optimization_tutorial.py
+++ b/beginner_source/basics/optimization_tutorial.py
@@ -135,7 +135,7 @@ def forward(self, x):
 #####################################
 # 학습 단계(loop)에서 최적화는 세단계로 이뤄집니다:
 # * ``optimizer.zero_grad()``\ 를 호출하여 모델 매개변수의 변화도를 재설정합니다. 기본적으로 변화도는 더해지기(add up) 때문에 중복 계산을 막기 위해 반복할 때마다 명시적으로 0으로 설정합니다.
-# * ``loss.backward()``\ 를 호출하여 예측 손실(prediction loss)을 역전파합니다. PyTorch는 각 매개변수에 대한 손실의 변화도를 저장합니다.
+# * ``loss.backward()``\ 를 호출하여 예측 손실(prediction loss)을 역전파합니다. PyTorch는 각 매개변수에 대한 손실의 변화도를 저장합니다.
 # * 변화도를 계산한 뒤에는 ``optimizer.step()``\ 을 호출하여 역전파 단계에서 수집된 변화도로 매개변수를 조정합니다.
 
diff --git a/beginner_source/blitz/autograd_tutorial.py b/beginner_source/blitz/autograd_tutorial.py
index 472229932..d9d11a9fa 100644
--- a/beginner_source/blitz/autograd_tutorial.py
+++ b/beginner_source/blitz/autograd_tutorial.py
@@ -32,7 +32,11 @@
 학습 단계를 하나만 살펴보겠습니다. 여기에서는 ``torchvision`` 에서 미리 학습된 resnet18 모델을 불러옵니다.
 3채널짜리 높이와 넓이가 64인 이미지 하나를 표현하는 무작위의 데이터 텐서를 생성하고, 이에 상응하는 ``label(정답)`` 을
-무작위 값으로 초기화합니다.
+무작위 값으로 초기화합니다. 미리 학습된 모델의 정답(label)은 (1, 1000)의 모양(shape)을 갖습니다.
+
+.. 
note:: + 이 튜토리얼은 (텐서를 CUDA로 이동하더라도) GPU에서는 동작하지 않으며 CPU에서만 동작합니다. + """ import torch, torchvision model = torchvision.models.resnet18(pretrained=True) @@ -59,8 +63,8 @@ ############################################################ # 다음으로, 옵티마이저(optimizer)를 불러옵니다. -# 이 예제에서는 학습율(learning rate) 0.1과 모멘텀(momentum) 0.9를 갖는 SGD입니다. -# 옵티마이저(optimizer)에 모델의 모든 매개변수를 등록합니다. +# 이 예제에서는 학습율(learning rate) 0.1과 `모멘텀(momentum) `__ +# 0.9를 갖는 SGD입니다. 옵티마이저(optimizer)에 모델의 모든 매개변수를 등록합니다. # optim = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9) @@ -234,7 +238,7 @@ # .. note:: # **PyTorch에서 DAG들은 동적(dynamic)입니다.** # 주목해야 할 중요한 점은 그래프가 처음부터(from scratch) 다시 생성된다는 것입니다; 매번 ``.backward()`` 가 -# 호출되고 나면, autograd는 새로운 그래프를 채우기(populate) 시작합니다. 이러한 점 덕분에 모델에서 +# 호출되고 나면, autograd는 새로운 그래프를 채우기(populate) 시작합니다. 이러한 점 덕분에 모델에서 # 흐름 제어(control flow) 구문들을 사용할 수 있게 되는 것입니다; 매번 반복(iteration)할 때마다 필요하면 # 모양(shape)이나 크기(size), 연산(operation)을 바꿀 수 있습니다. # @@ -286,14 +290,14 @@ model.fc = nn.Linear(512, 10) ###################################################################### -# 이제 ``model.fc`` 를 제외한 모델의 모든 매개변수들이 고정되었습니다. +# 이제 ``model.fc`` 를 제외한 모델의 모든 매개변수들이 고정되었습니다. # 변화도를 계산하는 유일한 매개변수는 ``model.fc`` 의 가중치(weight)와 편향(bias)뿐입니다. # 분류기만 최적화합니다. optimizer = optim.SGD(model.parameters(), lr=1e-2, momentum=0.9) ########################################################################## -# 옵티마이저(optimizer)에 모든 매개변수를 등록하더라도, +# 옵티마이저(optimizer)에 모든 매개변수를 등록하더라도, # 변화도를 계산(하고 경사하강법으로 갱신)할 수 있는 매개변수들은 분류기의 가중치와 편향뿐입니다. # # 컨텍스트 매니저(context manager)에 `torch.no_grad() `__ diff --git a/beginner_source/blitz/cifar10_tutorial.py b/beginner_source/blitz/cifar10_tutorial.py index 013a0fc5c..da90b566a 100644 --- a/beginner_source/blitz/cifar10_tutorial.py +++ b/beginner_source/blitz/cifar10_tutorial.py @@ -109,7 +109,7 @@ def imshow(img): # 이미지 보여주기 imshow(torchvision.utils.make_grid(images)) # 정답(label) 출력 -print(' '.join('%5s' % classes[labels[j]] for j in range(batch_size))) +print(' '.join(f'{classes[labels[j]]:5s}' for j in range(batch_size))) ######################################################################## @@ -181,8 +181,7 @@ def forward(self, x): # 통계를 출력합니다. running_loss += loss.item() if i % 2000 == 1999: # print every 2000 mini-batches - print('[%d, %5d] loss: %.3f' % - (epoch + 1, i + 1, running_loss / 2000)) + print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}') running_loss = 0.0 print('Finished Training') @@ -213,7 +212,7 @@ def forward(self, x): # 이미지를 출력합니다. 
imshow(torchvision.utils.make_grid(images))
 
-print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(4)))
+print('GroundTruth: ', ' '.join(f'{classes[labels[j]]:5s}' for j in range(4)))
 
 ########################################################################
 # 이제, 저장했던 모델을 불러오도록 하겠습니다 (주: 모델을 저장하고 다시 불러오는
@@ -233,7 +232,7 @@ def forward(self, x):
 # 따라서, 가장 높은 값을 갖는 인덱스(index)를 뽑아보겠습니다:
 
 _, predicted = torch.max(outputs, 1)
 
-print('Predicted: ', ' '.join('%5s' % classes[predicted[j]]
+print('Predicted: ', ' '.join(f'{classes[predicted[j]]:5s}'
                               for j in range(4)))
 
 ########################################################################
@@ -254,8 +253,7 @@ def forward(self, x):
         total += labels.size(0)
         correct += (predicted == labels).sum().item()
 
-print('Accuracy of the network on the 10000 test images: %d %%' % (
-    100 * correct / total))
+print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')
 
 ########################################################################
 # (10가지 분류 중에 하나를 무작위로) 찍었을 때의 정확도인 10% 보다는 나아보입니다.
@@ -283,8 +281,7 @@ def forward(self, x):
 # 각 분류별 정확도(accuracy)를 출력합니다
 for classname, correct_count in correct_pred.items():
     accuracy = 100 * float(correct_count) / total_pred[classname]
-    print("Accuracy for class {:5s} is: {:.1f} %".format(classname,
-                                                         accuracy))
+    print(f'Accuracy for class: {classname:5s} is {accuracy:.1f} %')
 
 ########################################################################
 # 자, 이제 다음으로 무엇을 해볼까요?
@@ -297,7 +294,7 @@ def forward(self, x):
 #
 # 먼저 (CUDA를 사용할 수 있다면) 첫번째 CUDA 장치를 사용하도록 설정합니다:
 
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
 
 # CUDA 기기가 존재한다면, 아래 코드가 CUDA 장치를 출력합니다:
 
diff --git a/beginner_source/data_loading_tutorial.py b/beginner_source/data_loading_tutorial.py
index ce9ead07a..2f3c7e198 100644
--- a/beginner_source/data_loading_tutorial.py
+++ b/beginner_source/data_loading_tutorial.py
@@ -62,8 +62,9 @@
 #     0805personali01.jpg,27,83,27,98, ... 84,134
 #     1084239450_e76e00b7e7.jpg,70,236,71,257, ... ,128,312
 #
-# 이제 CSV 파일을 불러와서 (N, 2) 배열안에 있는 랜드마크들을 잡아보겠습니다.
-# N은 랜드마크(landmarks)의 개수입니다.
+# 이제 CSV에서 이미지 이름과 그에 해당하는 데이터(annotation)를 가져와 보겠습니다. 예시로 person-7.jpg가 있는
+# 65번째 줄(row index number)을 가져오겠습니다. 이미지 이름을 읽어 ``img_name`` 에 저장하고, 데이터는 (L, 2)
+# 배열인 ``landmarks`` 에 저장합니다. 이 때 L은 해당 행의 랜드마크의 개수입니다.
 
 landmarks_frame = pd.read_csv('data/faces/face_landmarks.csv')
 
@@ -396,6 +397,10 @@ def show_landmarks_batch(sample_batched):
         plt.title('Batch from dataloader')
 
+# Windows를 사용 중이라면, 다음 줄의 주석을 제거하고 for 반복문을 들여쓰기합니다.
+# "num_workers"를 0으로 변경해야 할 수도 있습니다.
+
+# if __name__ == '__main__':
 for i_batch, sample_batched in enumerate(dataloader):
     print(i_batch, sample_batched['image'].size(),
           sample_batched['landmarks'].size())
diff --git a/beginner_source/examples_nn/dynamic_net.py b/beginner_source/examples_nn/dynamic_net.py
index 91f6e89d3..185a9d3d4 100755
--- a/beginner_source/examples_nn/dynamic_net.py
+++ b/beginner_source/examples_nn/dynamic_net.py
@@ -4,7 +4,7 @@
 ---------------------------------------------------------------
 
 PyTorch 동적 그래프의 강력함을 보여주기 위해, 매우 이상한 모델을 구현해보겠습니다:
-각 순전파 단계에서 3 ~ 5 사이의 임의의 숫자를 선택하여 다차항들에서 사용하고, 동일한 가중치를 여러번
+각 순전파 단계에서 4 ~ 5 사이의 임의의 숫자를 선택하여 다차항들에서 사용하고, 동일한 가중치를 여러번
 재사용하여 4차항과 5차항을 계산하는 3-5차 다항식입니다.
""" import random diff --git a/beginner_source/introyt.rst b/beginner_source/introyt.rst new file mode 100644 index 000000000..6a4751766 --- /dev/null +++ b/beginner_source/introyt.rst @@ -0,0 +1,29 @@ +`Introduction `_ || +`Tensors `_ || +`Autograd `_ || +`Building Models `_ || +`TensorBoard Support `_ || +`Training Models `_ || +`Model Understanding `_ + +Introduction to PyTorch - YouTube Series +======================================== + +Authors: +`Brad Heintz `_ + +This tutorial follows along with the `PyTorch Beginner Series `_ on youtube. + +`This tutorial assumes a basic familiarity with Python and Deep Learning concepts.` + +Running the Tutorial Code +------------------------- +You can run this tutorial in a couple of ways: + +- **In the cloud**: This is the easiest way to get started! Each section has a Colab link at the top, which opens a notebook with the code in a fully-hosted environment. Pro tip: Use Colab with a GPU runtime to speed up operations *Runtime > Change runtime type > GPU* +- **Locally**: This option requires you to setup PyTorch and TorchVision first on your local machine (`installation instructions `_). Download the notebook or copy the code into your favorite IDE. + +.. include:: /beginner_source/introyt/tocyt.txt + +.. toctree:: + :hidden: diff --git a/beginner_source/introyt/README.txt b/beginner_source/introyt/README.txt new file mode 100644 index 000000000..e9505c42a --- /dev/null +++ b/beginner_source/introyt/README.txt @@ -0,0 +1,34 @@ +Introduction to PyTorch on YouTube +---------------------------------- + +1. introyt.rst + Introduction to PyTorch - Youtube Series + https://tutorials.pytorch.kr/beginner/introyt/introyt.html + +2. introyt1_tutorial.py + Introduction to PyTorch + https://tutorials.pytorch.kr/beginner/introyt/introyt1_tutorial.html + +3. tensors_deeper_tutorial.py + PyTorch Tensors + https://tutorials.pytorch.kr/beginner/introyt/tensors_deeper_tutorial.html + +4. autogradyt_tutorial.py + The Fundamentals of Autograd + https://tutorials.pytorch.kr/beginner/introyt/autogradyt_tutorial.html + +5. modelsyt_tutorial.py + Building Models with PyTorch + https://tutorials.pytorch.kr/beginner/introyt/modelsyt_tutorial.html + +6. tensorboardyt_tutorial.py + PyTorch TensorBoard Support + https://tutorials.pytorch.kr/beginner/introyt/tensorboardyt_tutorial.html + +7. trainingyt_tutorial.py + Training with PyTorch + https://tutorials.pytorch.kr/beginner/introyt/trainingyt_tutorial.html + +8. captumyt_tutorial.py + Model Understanding with Captum + https://tutorials.pytorch.kr/beginner/introyt/captumyt_tutorial.html diff --git a/beginner_source/introyt/autogradyt_tutorial.py b/beginner_source/introyt/autogradyt_tutorial.py new file mode 100644 index 000000000..e5c47b25f --- /dev/null +++ b/beginner_source/introyt/autogradyt_tutorial.py @@ -0,0 +1,655 @@ +""" +`Introduction `_ || +`Tensors `_ || +**Autograd** || +`Building Models `_ || +`TensorBoard Support `_ || +`Training Models `_ || +`Model Understanding `_ + +The Fundamentals of Autograd +============================ + +Follow along with the video below or on `youtube `__. + +.. raw:: html + +
+ +
+
+
+PyTorch’s *Autograd* feature is part of what makes PyTorch flexible and
+fast for building machine learning projects. It allows for the rapid and
+easy computation of multiple partial derivatives (also referred to as
+*gradients*) over a complex computation. This operation is central to
+backpropagation-based neural network learning.
+
+The power of autograd comes from the fact that it traces your
+computation dynamically *at runtime*, meaning that if your model has
+decision branches, or loops whose lengths are not known until runtime,
+the computation will still be traced correctly, and you’ll get correct
+gradients to drive learning. This, combined with the fact that your
+models are built in Python, offers far more flexibility than frameworks
+that rely on static analysis of a more rigidly-structured model for
+computing gradients.
+
+What Do We Need Autograd For?
+-----------------------------
+
+"""
+
+###########################################################################
+# A machine learning model is a *function*, with inputs and outputs. For
+# this discussion, we’ll treat the input as an *i*-dimensional vector
+# :math:`\vec{x}`, with elements :math:`x_{i}`. We can then express the
+# model, *M*, as a vector-valued function of the input: :math:`\vec{y} =
+# \vec{M}(\vec{x})`. (We treat the value of M’s output as
+# a vector because in general, a model may have any number of outputs.)
+#
+# Since we’ll mostly be discussing autograd in the context of training,
+# our output of interest will be the model’s loss. The *loss function*
+# L(:math:`\vec{y}`) = L(:math:`\vec{M}`\ (:math:`\vec{x}`)) is a
+# single-valued scalar function of the model’s output. This function
+# expresses how far off our model’s prediction was from a particular
+# input’s *ideal* output. *Note: After this point, we will often omit the
+# vector sign where it should be contextually clear - e.g.,* :math:`y`
+# instead of :math:`\vec y`.
+#
+# In training a model, we want to minimize the loss. In the idealized case
+# of a perfect model, that means adjusting its learning weights - that is,
+# the adjustable parameters of the function - such that loss is zero for
+# all inputs. In the real world, it means an iterative process of nudging
+# the learning weights until we see that we get a tolerable loss for a
+# wide variety of inputs.
+#
+# How do we decide how far and in which direction to nudge the weights? We
+# want to *minimize* the loss, which means making its first derivative
+# with respect to the input equal to 0:
+# :math:`\frac{\partial L}{\partial x} = 0`.
+#
+# Recall, though, that the loss is not *directly* derived from the input,
+# but a function of the model’s output (which is a function of the input
+# directly), :math:`\frac{\partial L}{\partial x}` =
+# :math:`\frac{\partial {L({\vec y})}}{\partial x}`. By the chain rule of
+# differential calculus, we have
+# :math:`\frac{\partial {L({\vec y})}}{\partial x}` =
+# :math:`\frac{\partial L}{\partial y}\frac{\partial y}{\partial x}` =
+# :math:`\frac{\partial L}{\partial y}\frac{\partial M(x)}{\partial x}`.
+#
+# :math:`\frac{\partial M(x)}{\partial x}` is where things get complex.
+# The partial derivatives of the model’s outputs with respect to its
+# inputs, if we were to expand the expression using the chain rule again,
+# would involve many local partial derivatives over every multiplied
+# learning weight, every activation function, and every other mathematical
+# transformation in the model. 
The full expression for each such partial
+# derivative is the sum of the products of the local gradient of *every
+# possible path* through the computation graph that ends with the variable
+# whose gradient we are trying to measure.
+#
+# In particular, the gradients over the learning weights are of interest
+# to us - they tell us *what direction to change each weight* to get the
+# loss function closer to zero.
+#
+# The number of such local derivatives (each corresponding to a separate
+# path through the model’s computation graph) tends to go up exponentially
+# with the depth of a neural network, and so does the complexity of
+# computing them. This is where autograd comes in: It tracks the
+# history of every computation. Every computed tensor in your PyTorch
+# model carries a history of its input tensors and the function used to
+# create it. Combined with the fact that PyTorch functions meant to act on
+# tensors each have a built-in implementation for computing their own
+# derivatives, this greatly speeds the computation of the local
+# derivatives needed for learning.
+#
+# A Simple Example
+# ----------------
+#
+# That was a lot of theory - but what does it look like to use autograd in
+# practice?
+#
+# Let’s start with a straightforward example. First, we’ll do some imports
+# to let us graph our results:
+#
+
+# %matplotlib inline
+
+import torch
+
+import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
+import math
+
+
+#########################################################################
+# Next, we’ll create an input tensor full of evenly spaced values on the
+# interval :math:`[0, 2{\pi}]`, and specify ``requires_grad=True``. (Like
+# most functions that create tensors, ``torch.linspace()`` accepts an
+# optional ``requires_grad`` option.) Setting this flag means that in
+# every computation that follows, autograd will be accumulating the
+# history of the computation in the output tensors of that computation.
+#
+
+a = torch.linspace(0., 2. * math.pi, steps=25, requires_grad=True)
+print(a)
+
+
+########################################################################
+# Next, we’ll perform a computation, and plot its output in terms of its
+# inputs:
+#
+
+b = torch.sin(a)
+plt.plot(a.detach(), b.detach())
+
+
+########################################################################
+# Let’s have a closer look at the tensor ``b``. When we print it, we see
+# an indicator that it is tracking its computation history:
+#
+
+print(b)
+
+
+#######################################################################
+# This ``grad_fn`` gives us a hint that when we execute the
+# backpropagation step and compute gradients, we’ll need to compute the
+# derivative of :math:`sin(x)` for all this tensor’s inputs.
+#
+# Let’s perform some more computations:
+#
+
+c = 2 * b
+print(c)
+
+d = c + 1
+print(d)
+
+
+##########################################################################
+# Finally, let’s compute a single-element output. When you call
+# ``.backward()`` on a tensor with no arguments, it expects the calling
+# tensor to contain only a single element, as is the case when computing a
+# loss function.
+#
+
+out = d.sum()
+print(out)
+
+
+##########################################################################
+# Each ``grad_fn`` stored with our tensors allows you to walk the
+# computation all the way back to its inputs with its ``next_functions``
+# property. 
We can see below that drilling down on this property on ``d``
+# shows us the gradient functions for all the prior tensors. Note that
+# ``a.grad_fn`` is reported as ``None``, indicating that this was an input
+# to the function with no history of its own.
+#
+
+print('d:')
+print(d.grad_fn)
+print(d.grad_fn.next_functions)
+print(d.grad_fn.next_functions[0][0].next_functions)
+print(d.grad_fn.next_functions[0][0].next_functions[0][0].next_functions)
+print(d.grad_fn.next_functions[0][0].next_functions[0][0].next_functions[0][0].next_functions)
+print('\nc:')
+print(c.grad_fn)
+print('\nb:')
+print(b.grad_fn)
+print('\na:')
+print(a.grad_fn)
+
+
+######################################################################
+# With all this machinery in place, how do we get derivatives out? You
+# call the ``backward()`` method on the output, and check the input’s
+# ``grad`` property to inspect the gradients:
+#
+
+out.backward()
+print(a.grad)
+plt.plot(a.detach(), a.grad.detach())
+
+
+#########################################################################
+# Recall the computation steps we took to get here:
+#
+# ::
+#
+#    a = torch.linspace(0., 2. * math.pi, steps=25, requires_grad=True)
+#    b = torch.sin(a)
+#    c = 2 * b
+#    d = c + 1
+#    out = d.sum()
+#
+# Adding a constant, as we did to compute ``d``, does not change the
+# derivative. That leaves :math:`c = 2 * b = 2 * sin(a)`, the derivative
+# of which should be :math:`2 * cos(a)`. Looking at the graph above,
+# that’s just what we see.
+#
+# Be aware that only *leaf nodes* of the computation have their gradients
+# computed. If you tried, for example, ``print(c.grad)`` you’d get back
+# ``None``. In this simple example, only the input is a leaf node, so only
+# it has gradients computed.
+#
+# Autograd in Training
+# --------------------
+#
+# We’ve had a brief look at how autograd works, but how does it look when
+# it’s used for its intended purpose? Let’s define a small model and
+# examine how it changes after a single training batch. First, define a
+# few constants, our model, and some stand-ins for inputs and outputs:
+#
+
+BATCH_SIZE = 16
+DIM_IN = 1000
+HIDDEN_SIZE = 100
+DIM_OUT = 10
+
+class TinyModel(torch.nn.Module):
+
+    def __init__(self):
+        super(TinyModel, self).__init__()
+
+        self.layer1 = torch.nn.Linear(1000, 100)
+        self.relu = torch.nn.ReLU()
+        self.layer2 = torch.nn.Linear(100, 10)
+
+    def forward(self, x):
+        x = self.layer1(x)
+        x = self.relu(x)
+        x = self.layer2(x)
+        return x
+
+some_input = torch.randn(BATCH_SIZE, DIM_IN, requires_grad=False)
+ideal_output = torch.randn(BATCH_SIZE, DIM_OUT, requires_grad=False)
+
+model = TinyModel()
+
+
+##########################################################################
+# One thing you might notice is that we never specify
+# ``requires_grad=True`` for the model’s layers. Within a subclass of
+# ``torch.nn.Module``, it’s assumed that we want to track gradients on the
+# layers’ weights for learning.
+#
+# If we look at the layers of the model, we can examine the values of the
+# weights, and verify that no gradients have been computed yet:
+#
+
+print(model.layer2.weight[0][0:10]) # just a small slice
+print(model.layer2.weight.grad)
+
+
+##########################################################################
+# Let’s see how this changes when we run through one training batch. 
For a +# loss function, we’ll just use the square of the Euclidean distance +# between our ``prediction`` and the ``ideal_output``, and we’ll use a +# basic stochastic gradient descent optimizer. +# + +optimizer = torch.optim.SGD(model.parameters(), lr=0.001) + +prediction = model(some_input) + +loss = (ideal_output - prediction).pow(2).sum() +print(loss) + + +###################################################################### +# Now, let’s call ``loss.backward()`` and see what happens: +# + +loss.backward() +print(model.layer2.weight[0][0:10]) +print(model.layer2.weight.grad[0][0:10]) + + +######################################################################## +# We can see that the gradients have been computed for each learning +# weight, but the weights remain unchanged, because we haven’t run the +# optimizer yet. The optimizer is responsible for updating model weights +# based on the computed gradients. +# + +optimizer.step() +print(model.layer2.weight[0][0:10]) +print(model.layer2.weight.grad[0][0:10]) + + +###################################################################### +# You should see that ``layer2``\ ’s weights have changed. +# +# One important thing about the process: After calling +# ``optimizer.step()``, you need to call ``optimizer.zero_grad()``, or +# else every time you run ``loss.backward()``, the gradients on the +# learning weights will accumulate: +# + +print(model.layer2.weight.grad[0][0:10]) + +for i in range(0, 5): + prediction = model(some_input) + loss = (ideal_output - prediction).pow(2).sum() + loss.backward() + +print(model.layer2.weight.grad[0][0:10]) + +optimizer.zero_grad() + +print(model.layer2.weight.grad[0][0:10]) + + +######################################################################### +# After running the cell above, you should see that after running +# ``loss.backward()`` multiple times, the magnitudes of most of the +# gradients will be much larger. Failing to zero the gradients before +# running your next training batch will cause the gradients to blow up in +# this manner, causing incorrect and unpredictable learning results. +# +# Turning Autograd Off and On +# --------------------------- +# +# There are situations where you will need fine-grained control over +# whether autograd is enabled. There are multiple ways to do this, +# depending on the situation. +# +# The simplest is to change the ``requires_grad`` flag on a tensor +# directly: +# + +a = torch.ones(2, 3, requires_grad=True) +print(a) + +b1 = 2 * a +print(b1) + +a.requires_grad = False +b2 = 2 * a +print(b2) + + +########################################################################## +# In the cell above, we see that ``b1`` has a ``grad_fn`` (i.e., a traced +# computation history), which is what we expect, since it was derived from +# a tensor, ``a``, that had autograd turned on. When we turn off autograd +# explicitly with ``a.requires_grad = False``, computation history is no +# longer tracked, as we see when we compute ``b2``. 
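
+
+##########################################################################
+# (A related convenience, shown here as a minimal sketch - the tensor ``t``
+# below is purely illustrative and isn’t used elsewhere in this tutorial:
+# the in-place ``Tensor.requires_grad_()`` method flips the same flag and
+# returns the tensor itself.)
+#
+
+t = torch.ones(2, 3)      # requires_grad is False by default
+t.requires_grad_(True)    # equivalent to t.requires_grad = True
+print((2 * t).grad_fn)    # computation is tracked: prints a MulBackward0 object
+
+t.requires_grad_(False)   # turn tracking back off
+print((2 * t).grad_fn)    # no history is recorded: prints None
+
+
+##########################################################################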
+#
+# If you only need autograd turned off temporarily, a better way is to use
+# ``torch.no_grad()``:
+#
+
+a = torch.ones(2, 3, requires_grad=True) * 2
+b = torch.ones(2, 3, requires_grad=True) * 3
+
+c1 = a + b
+print(c1)
+
+with torch.no_grad():
+    c2 = a + b
+
+print(c2)
+
+c3 = a * b
+print(c3)
+
+
+##########################################################################
+# ``torch.no_grad()`` can also be used as a function or method decorator:
+#
+
+def add_tensors1(x, y):
+    return x + y
+
+@torch.no_grad()
+def add_tensors2(x, y):
+    return x + y
+
+
+a = torch.ones(2, 3, requires_grad=True) * 2
+b = torch.ones(2, 3, requires_grad=True) * 3
+
+c1 = add_tensors1(a, b)
+print(c1)
+
+c2 = add_tensors2(a, b)
+print(c2)
+
+
+##########################################################################
+# There’s a corresponding context manager, ``torch.enable_grad()``, for
+# turning autograd on when it isn’t already. It may also be used as a
+# decorator.
+#
+# Finally, you may have a tensor that requires gradient tracking, but you
+# want a copy that does not. For this we have the ``Tensor`` object’s
+# ``detach()`` method - it creates a copy of the tensor that is *detached*
+# from the computation history:
+#
+
+x = torch.rand(5, requires_grad=True)
+y = x.detach()
+
+print(x)
+print(y)
+
+
+#########################################################################
+# We did this above when we wanted to graph some of our tensors. This is
+# because ``matplotlib`` expects a NumPy array as input, and the implicit
+# conversion from a PyTorch tensor to a NumPy array is not enabled for
+# tensors with ``requires_grad=True``. Making a detached copy lets us move
+# forward.
+#
+# Autograd and In-place Operations
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# In every example in this notebook so far, we’ve used variables to
+# capture the intermediate values of a computation. Autograd needs these
+# intermediate values to perform gradient computations. *For this reason,
+# you must be careful about using in-place operations when using
+# autograd.* Doing so can destroy information you need to compute
+# derivatives in the ``backward()`` call. PyTorch will even stop you if
+# you attempt an in-place operation on a leaf variable that requires
+# autograd, as shown below.
+#
+# .. note::
+#    The following code cell throws a runtime error. This is expected.
+#
+#    ::
+#
+#       a = torch.linspace(0., 2. * math.pi, steps=25, requires_grad=True)
+#       torch.sin_(a)
+#
+
+#########################################################################
+# Autograd Profiler
+# -----------------
+#
+# Autograd tracks every step of your computation in detail. Such a
+# computation history, combined with timing information, would make a
+# handy profiler - and autograd has that feature baked in. Here’s a quick
+# example usage:
+#
+
+device = torch.device('cpu')
+run_on_gpu = False
+if torch.cuda.is_available():
+    device = torch.device('cuda')
+    run_on_gpu = True
+
+x = torch.randn(2, 3, requires_grad=True)
+y = torch.rand(2, 3, requires_grad=True)
+z = torch.ones(2, 3, requires_grad=True)
+
+with torch.autograd.profiler.profile(use_cuda=run_on_gpu) as prf:
+    for _ in range(1000):
+        z = (z / x) * y
+
+print(prf.key_averages().table(sort_by='self_cpu_time_total'))
+
+
+##########################################################################
+# The profiler can also label individual sub-blocks of code, break out the
+# data by input tensor shape, and export data as a Chrome tracing tools
+# file. 
For full details of the API, see the +# `documentation `__. +# +# Advanced Topic: More Autograd Detail and the High-Level API +# ----------------------------------------------------------- +# +# If you have a function with an n-dimensional input and m-dimensional +# output, :math:`\vec{y}=f(\vec{x})`, the complete gradient is a matrix of +# the derivative of every output with respect to every input, called the +# *Jacobian:* +# +# .. math:: +# +# J +# = +# \left(\begin{array}{ccc} +# \frac{\partial y_{1}}{\partial x_{1}} & \cdots & \frac{\partial y_{1}}{\partial x_{n}}\\ +# \vdots & \ddots & \vdots\\ +# \frac{\partial y_{m}}{\partial x_{1}} & \cdots & \frac{\partial y_{m}}{\partial x_{n}} +# \end{array}\right) +# +# If you have a second function, :math:`l=g\left(\vec{y}\right)` that +# takes m-dimensional input (that is, the same dimensionality as the +# output above), and returns a scalar output, you can express its +# gradients with respect to :math:`\vec{y}` as a column vector, +# :math:`v=\left(\begin{array}{ccc}\frac{\partial l}{\partial y_{1}} & \cdots & \frac{\partial l}{\partial y_{m}}\end{array}\right)^{T}` +# - which is really just a one-column Jacobian. +# +# More concretely, imagine the first function as your PyTorch model (with +# potentially many inputs and many outputs) and the second function as a +# loss function (with the model’s output as input, and the loss value as +# the scalar output). +# +# If we multiply the first function’s Jacobian by the gradient of the +# second function, and apply the chain rule, we get: +# +# .. math:: +# +# J^{T}\cdot v=\left(\begin{array}{ccc} +# \frac{\partial y_{1}}{\partial x_{1}} & \cdots & \frac{\partial y_{m}}{\partial x_{1}}\\ +# \vdots & \ddots & \vdots\\ +# \frac{\partial y_{1}}{\partial x_{n}} & \cdots & \frac{\partial y_{m}}{\partial x_{n}} +# \end{array}\right)\left(\begin{array}{c} +# \frac{\partial l}{\partial y_{1}}\\ +# \vdots\\ +# \frac{\partial l}{\partial y_{m}} +# \end{array}\right)=\left(\begin{array}{c} +# \frac{\partial l}{\partial x_{1}}\\ +# \vdots\\ +# \frac{\partial l}{\partial x_{n}} +# \end{array}\right) +# +# Note: You could also use the equivalent operation :math:`v^{T}\cdot J`, +# and get back a row vector. +# +# The resulting column vector is the *gradient of the second function with +# respect to the inputs of the first* - or in the case of our model and +# loss function, the gradient of the loss with respect to the model +# inputs. +# +# **``torch.autograd`` is an engine for computing these products.** This +# is how we accumulate the gradients over the learning weights during the +# backward pass. +# +# For this reason, the ``backward()`` call can *also* take an optional +# vector input. This vector represents a set of gradients over the tensor, +# which are multiplied by the Jacobian of the autograd-traced tensor that +# precedes it. Let’s try a specific example with a small vector: +# + +x = torch.randn(3, requires_grad=True) + +y = x * 2 +while y.data.norm() < 1000: + y = y * 2 + +print(y) + + +########################################################################## +# If we tried to call ``y.backward()`` now, we’d get a runtime error and a +# message that gradients can only be *implicitly* computed for scalar +# outputs. 
For a multi-dimensional output, autograd expects us to provide
+# gradients for those three outputs that it can multiply into the
+# Jacobian:
+#
+
+v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float) # stand-in for gradients
+y.backward(v)
+
+print(x.grad)
+
+
+##########################################################################
+# (Note that the output gradients are all related to powers of two - which
+# we’d expect from a repeated doubling operation.)
+#
+# The High-Level API
+# ~~~~~~~~~~~~~~~~~~
+#
+# There is an API on autograd that gives you direct access to important
+# differential matrix and vector operations. In particular, it allows you
+# to calculate the Jacobian and the *Hessian* matrices of a particular
+# function for particular inputs. (The Hessian is like the Jacobian, but
+# expresses all partial *second* derivatives.) It also provides methods
+# for taking vector products with these matrices.
+#
+# Let’s take the Jacobian of a simple function, evaluated for two
+# single-element inputs:
+#
+
+def exp_adder(x, y):
+    return 2 * x.exp() + 3 * y
+
+inputs = (torch.rand(1), torch.rand(1)) # arguments for the function
+print(inputs)
+torch.autograd.functional.jacobian(exp_adder, inputs)
+
+
+########################################################################
+# If you look closely, the first output should equal :math:`2e^x` (since
+# the derivative of :math:`e^x` is :math:`e^x`), and the second value
+# should be 3.
+#
+# You can, of course, do this with higher-order tensors:
+#
+
+inputs = (torch.rand(3), torch.rand(3)) # arguments for the function
+print(inputs)
+torch.autograd.functional.jacobian(exp_adder, inputs)
+
+
+#########################################################################
+# The ``torch.autograd.functional.hessian()`` method works identically
+# (assuming your function is twice differentiable), but returns a matrix
+# of all second derivatives.
+#
+# There is also a function to directly compute the vector-Jacobian
+# product, if you provide the vector:
+#
+
+def do_some_doubling(x):
+    y = x * 2
+    while y.data.norm() < 1000:
+        y = y * 2
+    return y
+
+inputs = torch.randn(3)
+my_gradients = torch.tensor([0.1, 1.0, 0.0001])
+torch.autograd.functional.vjp(do_some_doubling, inputs, v=my_gradients)
+
+
+##############################################################################
+# The ``torch.autograd.functional.jvp()`` method performs the same matrix
+# multiplication as ``vjp()`` with the operands reversed. The ``vhp()``
+# and ``hvp()`` methods do the same for a vector-Hessian product.
+#
+# For more information, including performance notes, see the `docs for the
+# functional
+# API `__.
+#
diff --git a/beginner_source/introyt/captumyt.py b/beginner_source/introyt/captumyt.py
new file mode 100644
index 000000000..0a6f2ad33
--- /dev/null
+++ b/beginner_source/introyt/captumyt.py
@@ -0,0 +1,492 @@
+"""
+`Introduction `_ ||
+`Tensors `_ ||
+`Autograd `_ ||
+`Building Models `_ ||
+`TensorBoard Support `_ ||
+`Training Models `_ ||
+**Model Understanding**
+
+Model Understanding with Captum
+===============================
+
+Follow along with the video below or on `youtube `__. Download the notebook and corresponding files
+`here `__.
+
+.. raw:: html
+
+ +
+
+
+`Captum `__ (“comprehension” in Latin) is an open
+source, extensible library for model interpretability built on PyTorch.
+
+With the increase in model complexity and the resulting lack of
+transparency, model interpretability methods have become increasingly
+important. Model understanding is both an active area of research as
+well as an area of focus for practical applications across industries
+using machine learning. Captum provides state-of-the-art algorithms,
+including Integrated Gradients, to provide researchers and developers
+with an easy way to understand which features are contributing to a
+model’s output.
+
+Full documentation, an API reference, and a suite of tutorials on
+specific topics are available at the `captum.ai `__
+website.
+
+Introduction
+------------
+
+Captum’s approach to model interpretability is in terms of
+*attributions.* There are three kinds of attributions available in
+Captum:
+
+- **Feature Attribution** seeks to explain a particular output in terms
+  of features of the input that generated it. Explaining whether a
+  movie review was positive or negative in terms of certain words in
+  the review is an example of feature attribution.
+- **Layer Attribution** examines the activity of a model’s hidden layer
+  subsequent to a particular input. Examining the spatially-mapped
+  output of a convolutional layer in response to an input image is an
+  example of layer attribution.
+- **Neuron Attribution** is analogous to layer attribution, but focuses
+  on the activity of a single neuron.
+
+In this interactive notebook, we’ll look at Feature Attribution and
+Layer Attribution.
+
+Each of the three attribution types has multiple **attribution
+algorithms** associated with it. Many attribution algorithms fall into
+two broad categories:
+
+- **Gradient-based algorithms** calculate the backward gradients of a
+  model output, layer output, or neuron activation with respect to the
+  input. **Integrated Gradients** (for features), **Layer Gradient \*
+  Activation**, and **Neuron Conductance** are all gradient-based
+  algorithms.
+- **Perturbation-based algorithms** examine the changes in the output
+  of a model, layer, or neuron in response to changes in the input. The
+  input perturbations may be directed or random. **Occlusion,**
+  **Feature Ablation,** and **Feature Permutation** are all
+  perturbation-based algorithms.
+
+We’ll be examining algorithms of both types below.
+
+Especially where large models are involved, it can be valuable to
+visualize attribution data in ways that relate it easily to the input
+features being examined. While it is certainly possible to create your
+own visualizations with Matplotlib, Plotly, or similar tools, Captum
+offers enhanced tools specific to its attributions:
+
+- The ``captum.attr.visualization`` module (imported below as ``viz``)
+  provides helpful functions for visualizing attributions related to
+  images.
+- **Captum Insights** is an easy-to-use API on top of Captum that
+  provides a visualization widget with ready-made visualizations for
+  image, text, and arbitrary model types.
+
+Both of these visualization toolsets will be demonstrated in this
+notebook. The first few examples will focus on computer vision use
+cases, but the Captum Insights section at the end will demonstrate
+visualization of attributions in a multi-model, visual
+question-and-answer model.
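+
+For orientation, each attribution family above maps onto classes in
+``captum.attr`` that share one pattern: wrap the model (plus, for layer and
+neuron attribution, a specific layer), then call ``attribute()``. A minimal
+sketch - here ``model`` stands in for the pretrained ResNet loaded later in
+this notebook, and ``model.layer4`` is just an illustrative layer choice::
+
+    from captum.attr import IntegratedGradients, Occlusion, LayerGradCam, NeuronConductance
+
+    integrated_gradients = IntegratedGradients(model)     # feature attribution (gradient-based)
+    occlusion = Occlusion(model)                          # feature attribution (perturbation-based)
+    layer_gradcam = LayerGradCam(model, model.layer4)     # layer attribution (gradient-based)
+    neuron_cond = NeuronConductance(model, model.layer4)  # neuron attribution (gradient-based)
+
+Several of these are constructed and used on real inputs below.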
+ +Installation +------------ + +Before you get started, you need to have a Python environment with: + +- Python version 3.6 or higher +- For the Captum Insights example, Flask 1.1 or higher +- PyTorch version 1.2 or higher (the latest version is recommended) +- TorchVision version 0.6 or higher (the latest version is recommended) +- Captum (the latest version is recommended) + +To install Captum in an Anaconda or pip virtual environment, use the +appropriate command for your environment below: + +With ``conda``: + +``conda install pytorch torchvision captum -c pytorch`` + +With ``pip``: + +``pip install torch torchvision captum`` + +Restart this notebook in the environment you set up, and you’re ready to +go! + + +A First Example +--------------- + +To start, let’s take a simple, visual example. We’ll start with a ResNet +model pretrained on the ImageNet dataset. We’ll get a test input, and +use different **Feature Attribution** algorithms to examine how the +input images affect the output, and see a helpful visualization of this +input attribution map for some test images. + +First, some imports: + +""" + +import torch +import torch.nn.functional as F +import torchvision.transforms as transforms +import torchvision.models as models + +import captum +from captum.attr import IntegratedGradients, Occlusion, LayerGradCam, LayerAttribution +from captum.attr import visualization as viz + +import os, sys +import json + +import numpy as np +from PIL import Image +import matplotlib.pyplot as plt +from matplotlib.colors import LinearSegmentedColormap + + +######################################################################### +# Now we’ll use the TorchVision model library to download a pretrained +# ResNet. Since we’re not training, we’ll place it in evaluation mode for +# now. +# + +model = models.resnet101(pretrained=True) +model = model.eval() + + +####################################################################### +# The place where you got this interactive notebook should also have an +# ``img`` folder with a file ``cat.jpg`` in it. +# + +test_img = Image.open('img/cat.jpg') +test_img_data = np.asarray(test_img) +plt.imshow(test_img_data) +plt.show() + + +########################################################################## +# Our ResNet model was trained on the ImageNet dataset, and expects images +# to be of a certain size, with the channel data normalized to a specific +# range of values. We’ll also pull in the list of human-readable labels +# for the categories our model recognizes - that should be in the ``img`` +# folder as well. +# + +# model expects 224x224 3-color image +transform = transforms.Compose([ + transforms.Resize(224), + transforms.CenterCrop(224), + transforms.ToTensor() +]) + +# standard ImageNet normalization +transform_normalize = transforms.Normalize( + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225] + ) + +transformed_img = transform(test_img) +input_img = transform_normalize(transformed_img) +input_img = input_img.unsqueeze(0) # the model requires a dummy batch dimension + +labels_path = 'img/imagenet_class_index.json' +with open(labels_path) as json_data: + idx_to_labels = json.load(json_data) + + +###################################################################### +# Now, we can ask the question: What does our model think this image +# represents? 
+# + +output = model(input_img) +output = F.softmax(output, dim=1) +prediction_score, pred_label_idx = torch.topk(output, 1) +pred_label_idx.squeeze_() +predicted_label = idx_to_labels[str(pred_label_idx.item())][1] +print('Predicted:', predicted_label, '(', prediction_score.squeeze().item(), ')') + + +###################################################################### +# We’ve confirmed that ResNet thinks our image of a cat is, in fact, a +# cat. But *why* does the model think this is an image of a cat? +# +# For the answer to that, we turn to Captum. +# + + +########################################################################## +# Feature Attribution with Integrated Gradients +# --------------------------------------------- +# +# **Feature attribution** attributes a particular output to features of +# the input. It uses a specific input - here, our test image - to generate +# a map of the relative importance of each input feature to a particular +# output feature. +# +# `Integrated +# Gradients `__ is one of +# the feature attribution algorithms available in Captum. Integrated +# Gradients assigns an importance score to each input feature by +# approximating the integral of the gradients of the model’s output with +# respect to the inputs. +# +# In our case, we’re going to be taking a specific element of the output +# vector - that is, the one indicating the model’s confidence in its +# chosen category - and use Integrated Gradients to understand what parts +# of the input image contributed to this output. +# +# Once we have the importance map from Integrated Gradients, we’ll use the +# visualization tools in Captum to give a helpful representation of the +# importance map. Captum’s ``visualize_image_attr()`` function provides a +# variety of options for customizing display of your attribution data. +# Here, we pass in a custom Matplotlib color map. +# +# Running the cell with the ``integrated_gradients.attribute()`` call will +# usually take a minute or two. +# + +# Initialize the attribution algorithm with the model +integrated_gradients = IntegratedGradients(model) + +# Ask the algorithm to attribute our output target to +attributions_ig = integrated_gradients.attribute(input_img, target=pred_label_idx, n_steps=200) + +# Show the original image for comparison +_ = viz.visualize_image_attr(None, np.transpose(transformed_img.squeeze().cpu().detach().numpy(), (1,2,0)), + method="original_image", title="Original Image") + +default_cmap = LinearSegmentedColormap.from_list('custom blue', + [(0, '#ffffff'), + (0.25, '#0000ff'), + (1, '#0000ff')], N=256) + +_ = viz.visualize_image_attr(np.transpose(attributions_ig.squeeze().cpu().detach().numpy(), (1,2,0)), + np.transpose(transformed_img.squeeze().cpu().detach().numpy(), (1,2,0)), + method='heat_map', + cmap=default_cmap, + show_colorbar=True, + sign='positive', + title='Integrated Gradients') + + +####################################################################### +# In the image above, you should see that Integrated Gradients gives us +# the strongest signal around the cat’s location in the image. +# + + +########################################################################## +# Feature Attribution with Occlusion +# ---------------------------------- +# +# Gradient-based attribution methods help to understand the model in terms +# of directly computing out the output changes with respect to the input. 
+# *Perturbation-based attribution* methods approach this more directly, by
+# introducing changes to the input to measure the effect on the output.
+# `Occlusion `__ is one such method.
+# It involves replacing sections of the input image, and examining the
+# effect on the output signal.
+#
+# Below, we set up Occlusion attribution. Similarly to configuring a
+# convolutional neural network, you can specify the size of the target
+# region, and a stride length to determine the spacing of individual
+# measurements. We’ll visualize the output of our Occlusion attribution
+# with ``visualize_image_attr_multiple()``, showing heat maps of both
+# positive and negative attribution by region, and by masking the original
+# image with the positive attribution regions. The masking gives a very
+# instructive view of what regions of our cat photo the model found to be
+# most “cat-like”.
+#
+
+occlusion = Occlusion(model)
+
+attributions_occ = occlusion.attribute(input_img,
+                                       target=pred_label_idx,
+                                       strides=(3, 8, 8),
+                                       sliding_window_shapes=(3, 15, 15),
+                                       baselines=0)
+
+
+_ = viz.visualize_image_attr_multiple(np.transpose(attributions_occ.squeeze().cpu().detach().numpy(), (1,2,0)),
+                                      np.transpose(transformed_img.squeeze().cpu().detach().numpy(), (1,2,0)),
+                                      ["original_image", "heat_map", "heat_map", "masked_image"],
+                                      ["all", "positive", "negative", "positive"],
+                                      show_colorbar=True,
+                                      titles=["Original", "Positive Attribution", "Negative Attribution", "Masked"],
+                                      fig_size=(18, 6)
+                                     )
+
+
+######################################################################
+# Again, we see greater significance placed on the region of the image
+# that contains the cat.
+#
+
+
+#########################################################################
+# Layer Attribution with Layer GradCAM
+# ------------------------------------
+#
+# **Layer Attribution** allows you to attribute the activity of hidden
+# layers within your model to features of your input. Below, we’ll use a
+# layer attribution algorithm to examine the activity of one of the
+# convolutional layers within our model.
+#
+# GradCAM computes the gradients of the target output with respect to the
+# given layer, averages for each output channel (dimension 2 of output),
+# and multiplies the average gradient for each channel by the layer
+# activations. The results are summed over all channels. GradCAM is
+# designed for convnets; since the activity of convolutional layers often
+# maps spatially to the input, GradCAM attributions are often upsampled
+# and used to mask the input.
+#
+# Layer attribution is set up similarly to input attribution, except that
+# in addition to the model, you must specify a hidden layer within the
+# model that you wish to examine. As above, when we call ``attribute()``,
+# we specify the target class of interest.
+#
+
+layer_gradcam = LayerGradCam(model, model.layer3[1].conv2)
+attributions_lgc = layer_gradcam.attribute(input_img, target=pred_label_idx)
+
+_ = viz.visualize_image_attr(attributions_lgc[0].cpu().permute(1,2,0).detach().numpy(),
+                             sign="all",
+                             title="Layer 3 Block 1 Conv 2")
+
+
+##########################################################################
+# We’ll use the convenience method ``interpolate()`` in the
+# `LayerAttribution `__
+# base class to upsample this attribution data for comparison to the input
+# image.
+#
+
+upsamp_attr_lgc = LayerAttribution.interpolate(attributions_lgc, input_img.shape[2:])
+
+print(attributions_lgc.shape)
+print(upsamp_attr_lgc.shape)
+print(input_img.shape)
+
+_ = viz.visualize_image_attr_multiple(upsamp_attr_lgc[0].cpu().permute(1,2,0).detach().numpy(),
+                                      transformed_img.permute(1,2,0).numpy(),
+                                      ["original_image","blended_heat_map","masked_image"],
+                                      ["all","positive","positive"],
+                                      show_colorbar=True,
+                                      titles=["Original", "Positive Attribution", "Masked"],
+                                      fig_size=(18, 6))
+
+
+#######################################################################
+# Visualizations such as this can give you novel insights into how your
+# hidden layers respond to your input.
+#
+
+
+##########################################################################
+# Visualization with Captum Insights
+# ----------------------------------
+#
+# Captum Insights is an interpretability visualization widget built on top
+# of Captum to facilitate model understanding. Captum Insights works
+# across images, text, and other features to help users understand feature
+# attribution. It allows you to visualize attribution for multiple
+# input/output pairs, and provides visualization tools for image, text,
+# and arbitrary data.
+#
+# In this section of the notebook, we’ll visualize multiple image
+# classification inferences with Captum Insights.
+#
+# First, let’s gather some images and see what the model thinks of them.
+# For variety, we’ll take our cat, a teapot, and a trilobite fossil:
+#
+
+imgs = ['img/cat.jpg', 'img/teapot.jpg', 'img/trilobite.jpg']
+
+for img in imgs:
+    img = Image.open(img)
+    transformed_img = transform(img)
+    input_img = transform_normalize(transformed_img)
+    input_img = input_img.unsqueeze(0) # the model requires a dummy batch dimension
+
+    output = model(input_img)
+    output = F.softmax(output, dim=1)
+    prediction_score, pred_label_idx = torch.topk(output, 1)
+    pred_label_idx.squeeze_()
+    predicted_label = idx_to_labels[str(pred_label_idx.item())][1]
+    print('Predicted:', predicted_label, '/', pred_label_idx.item(), ' (', prediction_score.squeeze().item(), ')')
+
+
+##########################################################################
+# …and it looks like our model is identifying them all correctly - but of
+# course, we want to dig deeper. For that we’ll use the Captum Insights
+# widget, which we configure with an ``AttributionVisualizer`` object,
+# imported below. The ``AttributionVisualizer`` expects batches of data,
+# so we’ll bring in Captum’s ``Batch`` helper class. And we’ll be looking
+# at images specifically, so we’ll also import ``ImageFeature``.
+#
+# We configure the ``AttributionVisualizer`` with the following arguments:
+#
+# - An array of models to be examined (in our case, just the one)
+# - A scoring function, which allows Captum Insights to pull out the
+#   top-k predictions from a model
+# - An ordered, human-readable list of classes our model is trained on
+# - A list of features to look for - in our case, an ``ImageFeature``
+# - A dataset, which is an iterable object returning batches of inputs
+#   and labels - just like you’d use for training
+#
+
+from captum.insights import AttributionVisualizer, Batch
+from captum.insights.attr_vis.features import ImageFeature
+
+# Baseline is all-zeros input - this may differ depending on your data
+def baseline_func(input):
+    return input * 0
+
+# merging our image transforms from above
+def full_img_transform(input):
+    i = Image.open(input)
+    i = transform(i)
+    i = transform_normalize(i)
+    i = i.unsqueeze(0)
+    return i
+
+
+input_imgs = torch.cat(list(map(lambda i: full_img_transform(i), imgs)), 0)
+
+visualizer = AttributionVisualizer(
+    models=[model],
+    score_func=lambda o: torch.nn.functional.softmax(o, 1),
+    classes=list(map(lambda k: idx_to_labels[k][1], idx_to_labels.keys())),
+    features=[
+        ImageFeature(
+            "Photo",
+            baseline_transforms=[baseline_func],
+            input_transforms=[],
+        )
+    ],
+    dataset=[Batch(input_imgs, labels=[282,849,69])]
+)
+
+
+#########################################################################
+# Note that running the cell above didn’t take much time at all, unlike
+# our attributions above. That’s because Captum Insights lets you
+# configure different attribution algorithms in a visual widget, after
+# which it will compute and display the attributions. *That* process will
+# take a few minutes.
+#
+# Running the cell below will render the Captum Insights widget. You can
+# then choose attribution methods and their arguments, filter model
+# responses based on predicted class or prediction correctness, see the
+# model’s predictions with associated probabilities, and view heatmaps of
+# the attribution compared with the original image.
+#
+
+visualizer.render()
diff --git a/beginner_source/introyt/introyt1_tutorial.py b/beginner_source/introyt/introyt1_tutorial.py
new file mode 100644
index 000000000..219186e9a
--- /dev/null
+++ b/beginner_source/introyt/introyt1_tutorial.py
@@ -0,0 +1,613 @@
+"""
+**Introduction** ||
+`Tensors `_ ||
+`Autograd `_ ||
+`Building Models `_ ||
+`TensorBoard Support `_ ||
+`Training Models `_ ||
+`Model Understanding `_
+
+Introduction to PyTorch
+=======================
+
+Follow along with the video below or on `youtube `__.
+
+.. raw:: html
+
+ +
+
+
+PyTorch Tensors
+---------------
+
+Follow along with the video beginning at `03:50 `__.
+
+First, we’ll import PyTorch.
+
+"""
+
+import torch
+
+######################################################################
+# Let’s see a few basic tensor manipulations. First, just a few of the
+# ways to create tensors:
+#
+
+z = torch.zeros(5, 3)
+print(z)
+print(z.dtype)
+
+
+#########################################################################
+# Above, we create a 5x3 matrix filled with zeros, and query its datatype
+# to find out that the zeros are 32-bit floating point numbers, which is
+# the default in PyTorch.
+#
+# What if you wanted integers instead? You can always override the
+# default:
+#
+
+i = torch.ones((5, 3), dtype=torch.int16)
+print(i)
+
+
+######################################################################
+# You can see that when we do change the default, the tensor helpfully
+# reports this when printed.
+#
+# It’s common to initialize learning weights randomly, often with a
+# specific seed for the PRNG for reproducibility of results:
+#
+
+torch.manual_seed(1729)
+r1 = torch.rand(2, 2)
+print('A random tensor:')
+print(r1)
+
+r2 = torch.rand(2, 2)
+print('\nA different random tensor:')
+print(r2) # new values
+
+torch.manual_seed(1729)
+r3 = torch.rand(2, 2)
+print('\nShould match r1:')
+print(r3) # repeats values of r1 because of re-seed
+
+
+#######################################################################
+# PyTorch tensors perform arithmetic operations intuitively. Tensors of
+# similar shapes may be added, multiplied, etc. Operations with scalars
+# are distributed over the tensor:
+#
+
+ones = torch.ones(2, 3)
+print(ones)
+
+twos = torch.ones(2, 3) * 2 # every element is multiplied by 2
+print(twos)
+
+threes = ones + twos       # addition allowed because shapes are similar
+print(threes)              # tensors are added element-wise
+print(threes.shape)        # this has the same dimensions as input tensors
+
+r1 = torch.rand(2, 3)
+r2 = torch.rand(3, 2)
+# uncomment this line to get a runtime error
+# r3 = r1 + r2
+
+
+######################################################################
+# Here’s a small sample of the mathematical operations available:
+#
+
+r = (torch.rand(2, 2) - 0.5) * 2 # values between -1 and 1
+print('A random matrix, r:')
+print(r)
+
+# Common mathematical operations are supported:
+print('\nAbsolute value of r:')
+print(torch.abs(r))
+
+# ...as are trigonometric functions:
+print('\nInverse sine of r:')
+print(torch.asin(r))
+
+# ...and linear algebra operations like determinant and singular value decomposition
+print('\nDeterminant of r:')
+print(torch.det(r))
+print('\nSingular value decomposition of r:')
+print(torch.svd(r))
+
+# ...and statistical and aggregate operations:
+print('\nAverage and standard deviation of r:')
+print(torch.std_mean(r))
+print('\nMaximum value of r:')
+print(torch.max(r))
+
+
+##########################################################################
+# There’s a good deal more to know about the power of PyTorch tensors,
+# including how to set them up for parallel computations on GPU - we’ll be
+# going into more depth in another video.
+#
+# PyTorch Models
+# --------------
+#
+# Follow along with the video beginning at `10:00 `__.
+# +# Let’s talk about how we can express models in PyTorch +# + +import torch # for all things PyTorch +import torch.nn as nn # for torch.nn.Module, the parent object for PyTorch models +import torch.nn.functional as F # for the activation function + + +######################################################################### +# .. figure:: /_static/img/mnist.png +# :alt: le-net-5 diagram +# +# *Figure: LeNet-5* +# +# Above is a diagram of LeNet-5, one of the earliest convolutional neural +# nets, and one of the drivers of the explosion in Deep Learning. It was +# built to read small images of handwritten numbers (the MNIST dataset), +# and correctly classify which digit was represented in the image. +# +# Here’s the abridged version of how it works: +# +# - Layer C1 is a convolutional layer, meaning that it scans the input +# image for features it learned during training. It outputs a map of +# where it saw each of its learned features in the image. This +# “activation map” is downsampled in layer S2. +# - Layer C3 is another convolutional layer, this time scanning C1’s +# activation map for *combinations* of features. It also puts out an +# activation map describing the spatial locations of these feature +# combinations, which is downsampled in layer S4. +# - Finally, the fully-connected layers at the end, F5, F6, and OUTPUT, +# are a *classifier* that takes the final activation map, and +# classifies it into one of ten bins representing the 10 digits. +# +# How do we express this simple neural network in code? +# + +class LeNet(nn.Module): + + def __init__(self): + super(LeNet, self).__init__() + # 1 input image channel (black & white), 6 output channels, 3x3 square convolution + # kernel + self.conv1 = nn.Conv2d(1, 6, 3) + self.conv2 = nn.Conv2d(6, 16, 3) + # an affine operation: y = Wx + b + self.fc1 = nn.Linear(16 * 6 * 6, 120) # 6*6 from image dimension + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + # Max pooling over a (2, 2) window + x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) + # If the size is a square you can only specify a single number + x = F.max_pool2d(F.relu(self.conv2(x)), 2) + x = x.view(-1, self.num_flat_features(x)) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + def num_flat_features(self, x): + size = x.size()[1:] # all dimensions except the batch dimension + num_features = 1 + for s in size: + num_features *= s + return num_features + + +############################################################################ +# Looking over this code, you should be able to spot some structural +# similarities with the diagram above. +# +# This demonstrates the structure of a typical PyTorch model: +# +# - It inherits from ``torch.nn.Module`` - modules may be nested - in fact, +# even the ``Conv2d`` and ``Linear`` layer classes inherit from +# ``torch.nn.Module``. +# - A model will have an ``__init__()`` function, where it instantiates +# its layers, and loads any data artifacts it might +# need (e.g., an NLP model might load a vocabulary). +# - A model will have a ``forward()`` function. This is where the actual +# computation happens: An input is passed through the network layers +# and various functions to generate an output. +# - Other than that, you can build out your model class like any other +# Python class, adding whatever properties and methods you need to +# support your model’s computation. +# +# Let’s instantiate this object and run a sample input through it. 
+#
+
+net = LeNet()
+print(net)                         # what does the object tell us about itself?
+
+input = torch.rand(1, 1, 32, 32)   # stand-in for a 32x32 black & white image
+print('\nImage batch shape:')
+print(input.shape)
+
+output = net(input)                # we don't call forward() directly
+print('\nRaw output:')
+print(output)
+print(output.shape)
+
+
+##########################################################################
+# There are a few important things happening above:
+#
+# First, we instantiate the ``LeNet`` class, and we print the ``net``
+# object. A subclass of ``torch.nn.Module`` will report the layers it has
+# created and their shapes and parameters. This can provide a handy
+# overview of a model if you want to get the gist of its processing.
+#
+# Below that, we create a dummy input representing a 32x32 image with 1
+# color channel. Normally, you would load an image tile and convert it to
+# a tensor of this shape.
+#
+# You may have noticed an extra dimension to our tensor - the *batch
+# dimension.* PyTorch models assume they are working on *batches* of data
+# - for example, a batch of 16 of our image tiles would have the shape
+# ``(16, 1, 32, 32)``. Since we’re only using one image, we create a batch
+# of 1 with shape ``(1, 1, 32, 32)``.
+#
+# We ask the model for an inference by calling it like a function:
+# ``net(input)``. The output of this call represents the model’s
+# confidence that the input represents a particular digit. (Since this
+# instance of the model hasn’t learned anything yet, we shouldn’t expect
+# to see any signal in the output.) Looking at the shape of ``output``, we
+# can see that it also has a batch dimension, the size of which should
+# always match the input batch dimension. If we had passed in an input
+# batch of 16 instances, ``output`` would have a shape of ``(16, 10)``.
+#
+# Datasets and Dataloaders
+# ------------------------
+#
+# Follow along with the video beginning at `14:00 `__.
+#
+# Below, we’re going to demonstrate using one of the ready-to-download,
+# open-access datasets from TorchVision, how to transform the images for
+# consumption by your model, and how to use the DataLoader to feed batches
+# of data to your model.
+#
+# The first thing we need to do is transform our incoming images into a
+# PyTorch tensor.
+#
+
+#%matplotlib inline
+
+import torch
+import torchvision
+import torchvision.transforms as transforms
+
+transform = transforms.Compose(
+    [transforms.ToTensor(),
+     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
+
+
+##########################################################################
+# Here, we specify two transformations for our input:
+#
+# -  ``transforms.ToTensor()`` converts images loaded by Pillow into
+#    PyTorch tensors.
+# -  ``transforms.Normalize()`` subtracts the given mean (0.5 per channel)
+#    and divides by the given standard deviation (0.5), mapping pixel
+#    values from the range [0, 1] to [-1, 1]. Most activation functions
+#    have their strongest gradients around x = 0, so centering our data
+#    there can speed learning.
+#
+# There are many more transforms available, including cropping, centering,
+# rotation, and reflection.
+#
+# Next, we’ll create an instance of the CIFAR10 dataset. This is a set of
+# 32x32 color image tiles representing 10 classes of objects: 6 of animals
+# (bird, cat, deer, dog, frog, horse) and 4 of vehicles (airplane,
+# automobile, ship, truck):
+#
+
+trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
+                                        download=True, transform=transform)
+
+
+##########################################################################
+# .. note::
+#      When you run the cell above, it may take a little time for the
+#      dataset to download.
+#
+# This is an example of creating a dataset object in PyTorch. Downloadable
+# datasets (like CIFAR-10 above) are subclasses of
+# ``torch.utils.data.Dataset``. ``Dataset`` classes in PyTorch include the
+# downloadable datasets in TorchVision, Torchtext, and TorchAudio, as well
+# as utility dataset classes such as ``torchvision.datasets.ImageFolder``,
+# which will read a folder of labeled images. You can also create your own
+# subclasses of ``Dataset``.
+#
+# When we instantiate our dataset, we need to tell it a few things:
+#
+# -  The filesystem path to where we want the data to go.
+# -  Whether or not we are using this set for training; most datasets
+#    will be split into training and test subsets.
+# -  Whether we would like to download the dataset if we haven’t already.
+# -  The transformations we want to apply to the data.
+#
+# Once your dataset is ready, you can give it to the ``DataLoader``:
+#
+
+trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
+                                          shuffle=True, num_workers=2)
+
+
+##########################################################################
+# A ``Dataset`` subclass wraps access to the data, and is specialized to
+# the type of data it’s serving. The ``DataLoader`` knows *nothing* about
+# the data, but organizes the input tensors served by the ``Dataset`` into
+# batches with the parameters you specify.
+#
+# In the example above, we’ve asked a ``DataLoader`` to give us batches of
+# 4 images from ``trainset``, randomizing their order (``shuffle=True``),
+# and we told it to spin up two workers to load data from disk.
+#
+# It’s good practice to visualize the batches your ``DataLoader`` serves:
+#
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+classes = ('plane', 'car', 'bird', 'cat',
+           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
+
+def imshow(img):
+    img = img / 2 + 0.5     # unnormalize
+    npimg = img.numpy()
+    plt.imshow(np.transpose(npimg, (1, 2, 0)))
+
+
+# get some random training images
+dataiter = iter(trainloader)
+images, labels = dataiter.next()
+
+# show images
+imshow(torchvision.utils.make_grid(images))
+# print labels
+print(' '.join('%5s' % classes[labels[j]] for j in range(4)))
+
+
+########################################################################
+# Running the above cell should show you a strip of four images, and the
+# correct label for each.
+#
+# Training Your PyTorch Model
+# ---------------------------
+#
+# Follow along with the video beginning at `17:10 `__.
+#
+# Let’s put all the pieces together, and train a model:
+#
+
+#%matplotlib inline
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+
+import torchvision
+import torchvision.transforms as transforms
+
+import matplotlib
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+#########################################################################
+# First, we’ll need training and test datasets. If you haven’t already,
+# run the cell below to make sure the dataset is downloaded. (It may take
+# a minute.)
+# + +transform = transforms.Compose( + [transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) + +trainset = torchvision.datasets.CIFAR10(root='./data', train=True, + download=True, transform=transform) +trainloader = torch.utils.data.DataLoader(trainset, batch_size=4, + shuffle=True, num_workers=2) + +testset = torchvision.datasets.CIFAR10(root='./data', train=False, + download=True, transform=transform) +testloader = torch.utils.data.DataLoader(testset, batch_size=4, + shuffle=False, num_workers=2) + +classes = ('plane', 'car', 'bird', 'cat', + 'deer', 'dog', 'frog', 'horse', 'ship', 'truck') + + +###################################################################### +# We’ll run our check on the output from ``DataLoader``: +# + +import matplotlib.pyplot as plt +import numpy as np + +# functions to show an image + + +def imshow(img): + img = img / 2 + 0.5 # unnormalize + npimg = img.numpy() + plt.imshow(np.transpose(npimg, (1, 2, 0))) + + +# get some random training images +dataiter = iter(trainloader) +images, labels = dataiter.next() + +# show images +imshow(torchvision.utils.make_grid(images)) +# print labels +print(' '.join('%5s' % classes[labels[j]] for j in range(4))) + + +########################################################################## +# This is the model we’ll train. If it looks familiar, that’s because it’s +# a variant of LeNet - discussed earlier in this video - adapted for +# 3-color images. +# + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +net = Net() + + +###################################################################### +# The last ingredients we need are a loss function and an optimizer: +# + +criterion = nn.CrossEntropyLoss() +optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) + + +########################################################################## +# The loss function, as discussed earlier in this video, is a measure of +# how far from our ideal output the model’s prediction was. Cross-entropy +# loss is a typical loss function for classification models like ours. +# +# The **optimizer** is what drives the learning. Here we have created an +# optimizer that implements *stochastic gradient descent,* one of the more +# straightforward optimization algorithms. Besides parameters of the +# algorithm, like the learning rate (``lr``) and momentum, we also pass in +# ``net.parameters()``, which is a collection of all the learning weights +# in the model - which is what the optimizer adjusts. +# +# Finally, all of this is assembled into the training loop. 
Go ahead and +# run this cell, as it will likely take a few minutes to execute: +# + +for epoch in range(2): # loop over the dataset multiple times + + running_loss = 0.0 + for i, data in enumerate(trainloader, 0): + # get the inputs + inputs, labels = data + + # zero the parameter gradients + optimizer.zero_grad() + + # forward + backward + optimize + outputs = net(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + + # print statistics + running_loss += loss.item() + if i % 2000 == 1999: # print every 2000 mini-batches + print('[%d, %5d] loss: %.3f' % + (epoch + 1, i + 1, running_loss / 2000)) + running_loss = 0.0 + +print('Finished Training') + + +######################################################################## +# Here, we are doing only **2 training epochs** (line 1) - that is, two +# passes over the training dataset. Each pass has an inner loop that +# **iterates over the training data** (line 4), serving batches of +# transformed input images and their correct labels. +# +# **Zeroing the gradients** (line 9) is an important step. Gradients are +# accumulated over a batch; if we do not reset them for every batch, they +# will keep accumulating, which will provide incorrect gradient values, +# making learning impossible. +# +# In line 12, we **ask the model for its predictions** on this batch. In +# the following line (13), we compute the loss - the difference between +# ``outputs`` (the model prediction) and ``labels`` (the correct output). +# +# In line 14, we do the ``backward()`` pass, and calculate the gradients +# that will direct the learning. +# +# In line 15, the optimizer performs one learning step - it uses the +# gradients from the ``backward()`` call to nudge the learning weights in +# the direction it thinks will reduce the loss. +# +# The remainder of the loop does some light reporting on the epoch number, +# how many training instances have been completed, and what the collected +# loss is over the training loop. +# +# **When you run the cell above,** you should see something like this: +# +# :: +# +# [1, 2000] loss: 2.235 +# [1, 4000] loss: 1.940 +# [1, 6000] loss: 1.713 +# [1, 8000] loss: 1.573 +# [1, 10000] loss: 1.507 +# [1, 12000] loss: 1.442 +# [2, 2000] loss: 1.378 +# [2, 4000] loss: 1.364 +# [2, 6000] loss: 1.349 +# [2, 8000] loss: 1.319 +# [2, 10000] loss: 1.284 +# [2, 12000] loss: 1.267 +# Finished Training +# +# Note that the loss is monotonically descending, indicating that our +# model is continuing to improve its performance on the training dataset. +# +# As a final step, we should check that the model is actually doing +# *general* learning, and not simply “memorizing” the dataset. This is +# called **overfitting,** and usually indicates that the dataset is too +# small (not enough examples for general learning), or that the model has +# more learning parameters than it needs to correctly model the dataset. 
+# +# This is the reason datasets are split into training and test subsets - +# to test the generality of the model, we ask it to make predictions on +# data it hasn’t trained on: +# + +correct = 0 +total = 0 +with torch.no_grad(): + for data in testloader: + images, labels = data + outputs = net(images) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + +print('Accuracy of the network on the 10000 test images: %d %%' % ( + 100 * correct / total)) + + +######################################################################### +# If you followed along, you should see that the model is roughly 50% +# accurate at this point. That’s not exactly state-of-the-art, but it’s +# far better than the 10% accuracy we’d expect from a random output. This +# demonstrates that some general learning did happen in the model. +# diff --git a/beginner_source/introyt/modelsyt_tutorial.py b/beginner_source/introyt/modelsyt_tutorial.py new file mode 100644 index 000000000..58abe51af --- /dev/null +++ b/beginner_source/introyt/modelsyt_tutorial.py @@ -0,0 +1,422 @@ +""" +`Introduction `_ || +`Tensors `_ || +`Autograd `_ || +**Building Models** || +`TensorBoard Support `_ || +`Training Models `_ || +`Model Understanding `_ + +Building Models with PyTorch +============================ + +Follow along with the video below or on `youtube `__. + +.. raw:: html + +
+
+``torch.nn.Module`` and ``torch.nn.Parameter``
+----------------------------------------------
+
+In this video, we’ll be discussing some of the tools PyTorch makes
+available for building deep learning networks.
+
+Except for ``Parameter``, the classes we discuss in this video are all
+subclasses of ``torch.nn.Module``. This is the PyTorch base class meant
+to encapsulate behaviors specific to PyTorch Models and their
+components.
+
+One important behavior of ``torch.nn.Module`` is registering parameters.
+If a particular ``Module`` subclass has learning weights, these weights
+are expressed as instances of ``torch.nn.Parameter``. The ``Parameter``
+class is a subclass of ``torch.Tensor``, with the special behavior that
+when they are assigned as attributes of a ``Module``, they are added to
+the list of that module’s parameters. These parameters may be accessed
+through the ``parameters()`` method on the ``Module`` class.
+
+As a simple example, here’s a model with two linear layers and an
+activation function. We’ll create an instance of it and ask it to
+report on its parameters:
+
+"""
+
+import torch
+
+class TinyModel(torch.nn.Module):
+
+    def __init__(self):
+        super(TinyModel, self).__init__()
+
+        self.linear1 = torch.nn.Linear(100, 200)
+        self.activation = torch.nn.ReLU()
+        self.linear2 = torch.nn.Linear(200, 10)
+        self.softmax = torch.nn.Softmax()
+
+    def forward(self, x):
+        x = self.linear1(x)
+        x = self.activation(x)
+        x = self.linear2(x)
+        x = self.softmax(x)
+        return x
+
+tinymodel = TinyModel()
+
+print('The model:')
+print(tinymodel)
+
+print('\n\nJust one layer:')
+print(tinymodel.linear2)
+
+print('\n\nModel params:')
+for param in tinymodel.parameters():
+    print(param)
+
+print('\n\nLayer params:')
+for param in tinymodel.linear2.parameters():
+    print(param)
+
+
+#########################################################################
+# This shows the fundamental structure of a PyTorch model: there is an
+# ``__init__()`` method that defines the layers and other components of a
+# model, and a ``forward()`` method where the computation gets done. Note
+# that we can print the model, or any of its submodules, to learn about
+# its structure.
+#
+# Common Layer Types
+# ------------------
+#
+# Linear Layers
+# ~~~~~~~~~~~~~
+#
+# The most basic type of neural network layer is a *linear* or *fully
+# connected* layer. This is a layer where every input influences every
+# output of the layer to a degree specified by the layer’s weights. If a
+# model has *m* inputs and *n* outputs, the weights will be an *n* x *m*
+# matrix. For example:
+#
+
+lin = torch.nn.Linear(3, 2)
+x = torch.rand(1, 3)
+print('Input:')
+print(x)
+
+print('\n\nWeight and Bias parameters:')
+for param in lin.parameters():
+    print(param)
+
+y = lin(x)
+print('\n\nOutput:')
+print(y)
+
+
+#########################################################################
+# If you do the matrix multiplication of ``x`` by the linear layer’s
+# weights, and add the biases, you’ll find that you get the output vector
+# ``y``.
+#
+# One other important feature to note: When we printed the parameters of
+# our layer above, the weight reported itself as a ``Parameter`` (which
+# is a subclass of ``Tensor``), and let us know that it’s tracking
+# gradients with autograd. This is a default behavior for ``Parameter``
+# that differs from ``Tensor``.
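+#
+# As a quick check of the weight-and-bias claim above, we can redo the
+# layer’s affine computation by hand (a small illustrative sketch, reusing
+# ``lin``, ``x``, and ``y`` from the cell above):
+#
+
+manual_y = torch.matmul(x, lin.weight.t()) + lin.bias
+print(manual_y)
+print(torch.allclose(manual_y, y))   # True - the layer is just this affine map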
+
+
+##########################################################################
+# Linear layers are used widely in deep learning models. One of the most
+# common places you’ll see them is in classifier models, which will
+# usually have one or more linear layers at the end, where the last layer
+# will have *n* outputs, where *n* is the number of classes the classifier
+# addresses.
+#
+# Convolutional Layers
+# ~~~~~~~~~~~~~~~~~~~~
+#
+# *Convolutional* layers are built to handle data with a high degree of
+# spatial correlation. They are very commonly used in computer vision,
+# where they detect close groupings of features which they compose into
+# higher-level features. They pop up in other contexts too - for example,
+# in NLP applications, where a word’s immediate context (that is, the
+# other words nearby in the sequence) can affect the meaning of a
+# sentence.
+#
+# We saw convolutional layers in action in LeNet5 in an earlier video:
+#
+
+import torch.nn.functional as F
+
+
+class LeNet(torch.nn.Module):
+
+    def __init__(self):
+        super(LeNet, self).__init__()
+        # 1 input image channel (black & white), 6 output channels, 5x5 square convolution
+        # kernel
+        self.conv1 = torch.nn.Conv2d(1, 6, 5)
+        self.conv2 = torch.nn.Conv2d(6, 16, 3)
+        # an affine operation: y = Wx + b
+        self.fc1 = torch.nn.Linear(16 * 6 * 6, 120)  # 6*6 from image dimension
+        self.fc2 = torch.nn.Linear(120, 84)
+        self.fc3 = torch.nn.Linear(84, 10)
+
+    def forward(self, x):
+        # Max pooling over a (2, 2) window
+        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
+        # If the size is a square you can only specify a single number
+        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
+        x = x.view(-1, self.num_flat_features(x))
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+    def num_flat_features(self, x):
+        size = x.size()[1:]  # all dimensions except the batch dimension
+        num_features = 1
+        for s in size:
+            num_features *= s
+        return num_features
+
+
+##########################################################################
+# Let’s break down what’s happening in the convolutional layers of this
+# model. Starting with ``conv1``:
+#
+# -  LeNet5 is meant to take in a 1x32x32 black & white image. **The first
+#    argument to a convolutional layer’s constructor is the number of
+#    input channels.** Here, it is 1. If we were building this model to
+#    look at 3-color channels, it would be 3.
+# -  A convolutional layer is like a window that scans over the image,
+#    looking for a pattern it recognizes. These patterns are called
+#    *features,* and one of the parameters of a convolutional layer is the
+#    number of features we would like it to learn. **The second argument
+#    to the constructor is the number of output features.** Here,
+#    we’re asking our layer to learn 6 features.
+# -  Just above, I likened the convolutional layer to a window - but how
+#    big is the window? **The third argument is the window or kernel
+#    size.** Here, the “5” means we’ve chosen a 5x5 kernel. (If you want a
+#    kernel with height different from width, you can specify a tuple for
+#    this argument - e.g., ``(3, 5)`` to get a 3x5 convolution kernel.)
+#
+# The output of a convolutional layer is an *activation map* - a spatial
+# representation of the presence of features in the input tensor.
+# ``conv1`` will give us an output tensor of 6x28x28; 6 is the number of
+# features, and 28 is the height and width of our map. (The 28 comes from
+# the fact that when scanning a 5-pixel window over a 32-pixel row, there
+# are only 28 valid positions.)
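+#
+# We can verify that shape arithmetic directly. The following is a quick
+# sketch (not part of the original video) that pushes a dummy 1x32x32
+# image through the layers defined above:
+#
+
+lenet = LeNet()
+dummy_input = torch.rand(1, 1, 32, 32)    # a batch containing one fake image
+c1_out = F.relu(lenet.conv1(dummy_input))
+print(c1_out.shape)                       # torch.Size([1, 6, 28, 28])
+print(F.max_pool2d(c1_out, 2).shape)      # torch.Size([1, 6, 14, 14]) after pooling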
+
+
+##########################################################################
+# We then pass the output of the convolution through a ReLU activation
+# function (more on activation functions later), then through a max
+# pooling layer. The max pooling layer takes features near each other in
+# the activation map and groups them together. It does this by reducing
+# the tensor, merging every 2x2 group of cells in the output into a single
+# cell, and assigning that cell the maximum value of the 4 cells that went
+# into it. This gives us a lower-resolution version of the activation map,
+# with dimensions 6x14x14.
+#
+# Our next convolutional layer, ``conv2``, expects 6 input channels
+# (corresponding to the 6 features sought by the first layer), has 16
+# output channels, and a 3x3 kernel. It puts out a 16x12x12 activation
+# map, which is again reduced by a max pooling layer to 16x6x6. Prior to
+# passing this output to the linear layers, it is reshaped to a 16 \* 6 \*
+# 6 = 576-element vector for consumption by the next layer.
+#
+# There are convolutional layers for addressing 1D, 2D, and 3D tensors.
+# There are also many more optional arguments for a conv layer
+# constructor, including stride length (e.g., only scanning every second
+# or every third position in the input), padding (so you can scan out to
+# the edges of the input), and more. See the
+# `documentation `__
+# for more information.
+#
+# Recurrent Layers
+# ~~~~~~~~~~~~~~~~
+#
+# *Recurrent neural networks* (or *RNNs*) are used for sequential data -
+# anything from time-series measurements from a scientific instrument to
+# natural language sentences to DNA nucleotides. An RNN handles this by
+# maintaining a *hidden state* that acts as a sort of memory for what it
+# has seen in the sequence so far.
+#
+# The internal structure of an RNN layer - or its variants, the LSTM (long
+# short-term memory) and GRU (gated recurrent unit) - is moderately
+# complex and beyond the scope of this video, but we’ll show you what one
+# looks like in action with an LSTM-based part-of-speech tagger (a type of
+# classifier that tells you if a word is a noun, verb, etc.):
+#
+
+class LSTMTagger(torch.nn.Module):
+
+    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
+        super(LSTMTagger, self).__init__()
+        self.hidden_dim = hidden_dim
+
+        self.word_embeddings = torch.nn.Embedding(vocab_size, embedding_dim)
+
+        # The LSTM takes word embeddings as inputs, and outputs hidden states
+        # with dimensionality hidden_dim.
+        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim)
+
+        # The linear layer that maps from hidden state space to tag space
+        self.hidden2tag = torch.nn.Linear(hidden_dim, tagset_size)
+
+    def forward(self, sentence):
+        embeds = self.word_embeddings(sentence)
+        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
+        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
+        tag_scores = F.log_softmax(tag_space, dim=1)
+        return tag_scores
+
+
+########################################################################
+# The constructor has four arguments:
+#
+# -  ``vocab_size`` is the number of words in the input vocabulary. Each
+#    word is a one-hot vector (or unit vector) in a
+#    ``vocab_size``-dimensional space.
+# -  ``tagset_size`` is the number of tags in the output set.
+# -  ``embedding_dim`` is the size of the *embedding* space for the
+#    vocabulary. An embedding maps a vocabulary onto a low-dimensional
+#    space, where words with similar meanings are close together in the
+#    space.
+# -  ``hidden_dim`` is the size of the LSTM’s memory.
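+#
+# Here’s a quick smoke test of the tagger. The sizes below are made up
+# purely for illustration (they are not from the video); the point is the
+# shape of what comes out - one row of tag scores per word:
+#
+
+tagger = LSTMTagger(embedding_dim=32, hidden_dim=64, vocab_size=5000, tagset_size=3)
+fake_sentence = torch.randint(0, 5000, (6,))   # six made-up word indices
+print(tagger(fake_sentence).shape)             # torch.Size([6, 3])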
+
+
+########################################################################
+# The input will be a sentence with the words represented as indices of
+# one-hot vectors. The embedding layer will then map these down to an
+# ``embedding_dim``-dimensional space. The LSTM takes this sequence of
+# embeddings and iterates over it, fielding an output vector of length
+# ``hidden_dim``. The final linear layer acts as a classifier; applying
+# ``log_softmax()`` to the output of the final layer converts the output
+# into a normalized set of estimated probabilities that a given word maps
+# to a given tag.
+#
+# If you’d like to see this network in action, check out the `Sequence
+# Models and LSTM
+# Networks `__
+# tutorial on pytorch.org.
+#
+# Transformers
+# ~~~~~~~~~~~~
+#
+# *Transformers* are multi-purpose networks that have taken over the state
+# of the art in NLP with models like BERT. A discussion of transformer
+# architecture is beyond the scope of this video, but PyTorch has a
+# ``Transformer`` class that allows you to define the overall parameters
+# of a transformer model - the number of attention heads, the number of
+# encoder & decoder layers, dropout and activation functions, etc. (You
+# can even build the BERT model from this single class, with the right
+# parameters!) The ``torch.nn.Transformer`` class also has classes to
+# encapsulate the individual components (``TransformerEncoder``,
+# ``TransformerDecoder``) and subcomponents (``TransformerEncoderLayer``,
+# ``TransformerDecoderLayer``). For details, check out the
+# `documentation `__
+# on transformer classes, and the relevant
+# `tutorial `__
+# on pytorch.org.
+#
+# Other Layers and Functions
+# --------------------------
+#
+# Data Manipulation Layers
+# ~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# There are other layer types that perform important functions in models,
+# but don’t participate in the learning process themselves.
+#
+# **Max pooling** (and its twin, min pooling) reduces a tensor by combining
+# cells, and assigning the maximum value of the input cells to the output
+# cell (we saw this). For example:
+#
+
+my_tensor = torch.rand(1, 6, 6)
+print(my_tensor)
+
+maxpool_layer = torch.nn.MaxPool2d(3)
+print(maxpool_layer(my_tensor))
+
+
+#########################################################################
+# If you look closely at the values above, you’ll see that each of the
+# values in the maxpooled output is the maximum value of each quadrant of
+# the 6x6 input.
+#
+# **Normalization layers** re-center and normalize the output of one layer
+# before feeding it to another. Centering and scaling the intermediate
+# tensors has a number of beneficial effects, such as letting you use
+# higher learning rates without exploding/vanishing gradients.
+#
+
+my_tensor = torch.rand(1, 4, 4) * 20 + 5
+print(my_tensor)
+
+print(my_tensor.mean())
+
+norm_layer = torch.nn.BatchNorm1d(4)
+normed_tensor = norm_layer(my_tensor)
+print(normed_tensor)
+
+print(normed_tensor.mean())
+
+
+
+##########################################################################
+# Running the cell above, we’ve added a large scaling factor and offset to
+# an input tensor; you should see the input tensor’s ``mean()`` somewhere
+# in the neighborhood of 15. After running it through the normalization
+# layer, you can see that the values are smaller, and grouped around zero
+# - in fact, the mean should be very small (on the order of 1e-8).
+
+# This is beneficial because many activation functions (discussed below)
+# have their strongest gradients near 0, but sometimes suffer from
+# vanishing or exploding gradients for inputs that drive them far away
+# from zero. Keeping the data centered around the area of steepest
+# gradient will tend to mean faster, better learning and higher feasible
+# learning rates.
+#
+# **Dropout layers** are a tool for encouraging *sparse representations*
+# in your model - that is, pushing it to do inference with less data.
+#
+# Dropout layers work by randomly setting parts of the input tensor to
+# zero *during training* - dropout layers are always turned off for
+# inference. This forces the model to learn against this masked or
+# reduced dataset. For example:
+#
+
+my_tensor = torch.rand(1, 4, 4)
+
+dropout = torch.nn.Dropout(p=0.4)
+print(dropout(my_tensor))
+print(dropout(my_tensor))
+
+
+##########################################################################
+# Above, you can see the effect of dropout on a sample tensor. You can use
+# the optional ``p`` argument to set the probability of an individual
+# element dropping out; if you don’t, it defaults to 0.5.
+#
+# Activation Functions
+# ~~~~~~~~~~~~~~~~~~~~
+#
+# Activation functions make deep learning possible. A neural network is
+# really a program - with many parameters - that *simulates a mathematical
+# function*. If all we did was multiply tensors by layer weights
+# repeatedly, we could only simulate *linear functions;* further, there
+# would be no point to having many layers, as the whole network could be
+# reduced to a single matrix multiplication. Inserting *non-linear*
+# activation functions between layers is what allows a deep learning
+# model to simulate any function, rather than just linear ones.
+#
+# The ``torch.nn`` module has objects encapsulating all of the major
+# activation functions including ReLU and its many variants, Tanh,
+# Hardtanh, sigmoid, and more. It also includes other functions, such as
+# Softmax, that are most useful at the output stage of a model.
+#
+# Loss Functions
+# ~~~~~~~~~~~~~~
+#
+# Loss functions tell us how far a model’s prediction is from the correct
+# answer. PyTorch contains a variety of loss functions, including common
+# MSE (mean squared error, the squared L2 norm), Cross Entropy Loss and
+# Negative Log Likelihood Loss (useful for classifiers), and others.
+#
diff --git a/beginner_source/introyt/tensorboardyt_tutorial.py b/beginner_source/introyt/tensorboardyt_tutorial.py
new file mode 100644
index 000000000..934700f1c
--- /dev/null
+++ b/beginner_source/introyt/tensorboardyt_tutorial.py
@@ -0,0 +1,315 @@
+"""
+`Introduction `_ ||
+`Tensors `_ ||
+`Autograd `_ ||
+`Building Models `_ ||
+**TensorBoard Support** ||
+`Training Models `_ ||
+`Model Understanding `_
+
+PyTorch TensorBoard Support
+===========================
+
+Follow along with the video below or on `youtube `__.
+
+.. raw:: html
+
+
+Before You Start
+----------------
+
+To run this tutorial, you’ll need to install PyTorch, TorchVision,
+Matplotlib, and TensorBoard.
+
+With ``conda``:
+
+``conda install pytorch torchvision -c pytorch``
+``conda install matplotlib tensorboard``
+
+With ``pip``:
+
+``pip install torch torchvision matplotlib tensorboard``
+
+Once the dependencies are installed, restart this notebook in the Python
+environment where you installed them.
+
+
+Introduction
+------------
+
+In this notebook, we’ll be training a variant of LeNet-5 against the
+Fashion-MNIST dataset. Fashion-MNIST is a set of image tiles depicting
+various garments, with ten class labels indicating the type of garment
+depicted.
+
+"""
+
+# PyTorch model and training necessities
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+
+# Image datasets and image manipulation
+import torchvision
+import torchvision.transforms as transforms
+
+# Image display
+import matplotlib.pyplot as plt
+import numpy as np
+
+# PyTorch TensorBoard support
+from torch.utils.tensorboard import SummaryWriter
+
+
+######################################################################
+# Showing Images in TensorBoard
+# -----------------------------
+#
+# Let’s start by adding sample images from our dataset to TensorBoard:
+#
+
+# Gather datasets and prepare them for consumption
+transform = transforms.Compose(
+    [transforms.ToTensor(),
+     transforms.Normalize((0.5,), (0.5,))])
+
+# Store separate training and validation splits in ./data
+training_set = torchvision.datasets.FashionMNIST('./data',
+    download=True,
+    train=True,
+    transform=transform)
+validation_set = torchvision.datasets.FashionMNIST('./data',
+    download=True,
+    train=False,
+    transform=transform)
+
+training_loader = torch.utils.data.DataLoader(training_set,
+                                              batch_size=4,
+                                              shuffle=True,
+                                              num_workers=2)
+
+
+validation_loader = torch.utils.data.DataLoader(validation_set,
+                                                batch_size=4,
+                                                shuffle=False,
+                                                num_workers=2)
+
+# Class labels
+classes = ('T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
+           'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle Boot')
+
+# Helper function for inline image display
+def matplotlib_imshow(img, one_channel=False):
+    if one_channel:
+        img = img.mean(dim=0)
+    img = img / 2 + 0.5     # unnormalize
+    npimg = img.numpy()
+    if one_channel:
+        plt.imshow(npimg, cmap="Greys")
+    else:
+        plt.imshow(np.transpose(npimg, (1, 2, 0)))
+
+# Extract a batch of 4 images
+dataiter = iter(training_loader)
+images, labels = dataiter.next()
+
+# Create a grid from the images and show them
+img_grid = torchvision.utils.make_grid(images)
+matplotlib_imshow(img_grid, one_channel=True)
+
+
+########################################################################
+# Above, we used TorchVision and Matplotlib to create a visual grid of a
+# minibatch of our input data. Below, we use the ``add_image()`` call on
+# ``SummaryWriter`` to log the image for consumption by TensorBoard, and
+# we also call ``flush()`` to make sure it’s written to disk right away.
+
+
+# Default log_dir argument is "runs" - but it's good to be specific
+# torch.utils.tensorboard.SummaryWriter is imported above
+writer = SummaryWriter('runs/fashion_mnist_experiment_1')
+
+# Write image data to TensorBoard log dir
+writer.add_image('Four Fashion-MNIST Images', img_grid)
+writer.flush()
+
+# To view, start TensorBoard on the command line with:
+#   tensorboard --logdir=runs
+# ...and open a browser tab to http://localhost:6006/
+
+
+##########################################################################
+# If you start TensorBoard at the command line and open it in a new
+# browser tab (usually at `localhost:6006 `__), you should
+# see the image grid under the IMAGES tab.
+#
+# Graphing Scalars to Visualize Training
+# --------------------------------------
+#
+# TensorBoard is useful for tracking the progress and efficacy of your
+# training. Below, we’ll run a training loop, track some metrics, and save
+# the data for TensorBoard’s consumption.
+#
+# Let’s define a model to categorize our image tiles, and an optimizer and
+# loss function for training:
+#
+
+class Net(nn.Module):
+    def __init__(self):
+        super(Net, self).__init__()
+        self.conv1 = nn.Conv2d(1, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 4 * 4, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 4 * 4)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+
+net = Net()
+criterion = nn.CrossEntropyLoss()
+optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
+
+
+##########################################################################
+# Now let’s train a single epoch, and evaluate the training vs. validation
+# set losses every 1000 batches:
+#
+
+print(len(validation_loader))   # how many batches the validation set holds
+for epoch in range(1):  # loop over the dataset multiple times
+    running_loss = 0.0
+
+    for i, data in enumerate(training_loader, 0):
+        # basic training loop
+        inputs, labels = data
+        optimizer.zero_grad()
+        outputs = net(inputs)
+        loss = criterion(outputs, labels)
+        loss.backward()
+        optimizer.step()
+
+        running_loss += loss.item()
+        if i % 1000 == 999:    # Every 1000 mini-batches...
+            print('Batch {}'.format(i + 1))
+            # Check against the validation set
+            running_vloss = 0.0
+
+            net.train(False) # Put the model in evaluation mode for validation
+            for j, vdata in enumerate(validation_loader, 0):
+                vinputs, vlabels = vdata
+                voutputs = net(vinputs)
+                vloss = criterion(voutputs, vlabels)
+                running_vloss += vloss.item()
+            net.train(True) # Put the model back in training mode
+
+            avg_loss = running_loss / 1000
+            avg_vloss = running_vloss / len(validation_loader)
+
+            # Log the running loss averaged per batch
+            writer.add_scalars('Training vs. Validation Loss',
+                               { 'Training' : avg_loss, 'Validation' : avg_vloss },
+                               epoch * len(training_loader) + i)
+
+            running_loss = 0.0
+print('Finished Training')
+
+writer.flush()
+
+
+#########################################################################
+# Switch to your open TensorBoard and have a look at the SCALARS tab.
+#
+# Visualizing Your Model
+# ----------------------
+#
+# TensorBoard can also be used to examine the data flow within your model.
+# To do this, call the ``add_graph()`` method with a model and sample
+# input:
+
+
+# Again, grab a single mini-batch of images
+dataiter = iter(training_loader)
+images, labels = dataiter.next()
+
+# add_graph() will trace the sample input through your model,
+# and render it as a graph.
+writer.add_graph(net, images)
+writer.flush()
+
+
+#########################################################################
+# When you switch over to TensorBoard, you should see a GRAPHS tab.
+# Double-click the “NET” node to see the layers and data flow within your
+# model.
+#
+# Visualizing Your Dataset with Embeddings
+# ----------------------------------------
+#
+# The 28-by-28 image tiles we’re using can be modeled as 784-dimensional
+# vectors (28 \* 28 = 784). It can be instructive to project this to a
+# lower-dimensional representation. The ``add_embedding()`` method will
+# automatically project a set of data onto the three dimensions with
+# highest variance, and display them as an interactive 3D chart.
+#
+# Below, we’ll take a sample of our data, and generate such an embedding:
+#
+
+# Select a random subset of data and corresponding labels
+def select_n_random(data, labels, n=100):
+    assert len(data) == len(labels)
+
+    perm = torch.randperm(len(data))
+    return data[perm][:n], labels[perm][:n]
+
+# Extract a random subset of data
+images, labels = select_n_random(training_set.data, training_set.targets)
+
+# get the class labels for each image
+class_labels = [classes[label] for label in labels]
+
+# log embeddings
+features = images.view(-1, 28 * 28)
+writer.add_embedding(features,
+                     metadata=class_labels,
+                     label_img=images.unsqueeze(1))
+writer.flush()
+writer.close()
+
+
+#######################################################################
+# Now if you switch to TensorBoard and select the PROJECTOR tab, you
+# should see a 3D representation of the projection. You can rotate and
+# zoom the model. Examine it at large and small scales, and see whether
+# you can spot patterns in the projected data and the clustering of
+# labels.
+#
+# For better visibility, it’s recommended to:
+#
+# - Select “label” from the “Color by” drop-down on the left.
+# - Toggle the Night Mode icon along the top to place the
+#   light-colored images on a dark background.
+#
+# Other Resources
+# ---------------
+#
+# For more information, have a look at:
+#
+# - PyTorch documentation on `torch.utils.tensorboard.SummaryWriter `__
+# - Tensorboard tutorial content in the `PyTorch.org Tutorials `__
+# - The `TensorBoard documentation `__
diff --git a/beginner_source/introyt/tensors_deeper_tutorial.py b/beginner_source/introyt/tensors_deeper_tutorial.py
new file mode 100644
index 000000000..1f6d72488
--- /dev/null
+++ b/beginner_source/introyt/tensors_deeper_tutorial.py
@@ -0,0 +1,951 @@
+"""
+`Introduction `_ ||
+**Tensors** ||
+`Autograd `_ ||
+`Building Models `_ ||
+`TensorBoard Support `_ ||
+`Training Models `_ ||
+`Model Understanding `_
+
+Introduction to PyTorch Tensors
+===============================
+
+Follow along with the video below or on `youtube `__.
+
+.. raw:: html
+
+ +Tensors are the central data abstraction in PyTorch. This interactive +notebook provides an in-depth introduction to the ``torch.Tensor`` +class. + +First things first, let’s import the PyTorch module. We’ll also add +Python’s math module to facilitate some of the examples. + +""" + +import torch +import math + + +######################################################################### +# Creating Tensors +# ---------------- +# +# The simplest way to create a tensor is with the ``torch.empty()`` call: +# + +x = torch.empty(3, 4) +print(type(x)) +print(x) + + +########################################################################## +# Let’s unpack what we just did: +# +# - We created a tensor using one of the numerous factory methods +# attached to the ``torch`` module. +# - The tensor itself is 2-dimensional, having 3 rows and 4 columns. +# - The type of the object returned is ``torch.Tensor``, which is an +# alias for ``torch.FloatTensor``; by default, PyTorch tensors are +# populated with 32-bit floating point numbers. (More on data types +# below.) +# - You will probably see some random-looking values when printing your +# tensor. The ``torch.empty()`` call allocates memory for the tensor, +# but does not initialize it with any values - so what you’re seeing is +# whatever was in memory at the time of allocation. +# +# A brief note about tensors and their number of dimensions, and +# terminology: +# +# - You will sometimes see a 1-dimensional tensor called a +# *vector.* +# - Likewise, a 2-dimensional tensor is often referred to as a +# *matrix.* +# - Anything with more than two dimensions is generally just +# called a tensor. +# +# More often than not, you’ll want to initialize your tensor with some +# value. Common cases are all zeros, all ones, or random values, and the +# ``torch`` module provides factory methods for all of these: +# + +zeros = torch.zeros(2, 3) +print(zeros) + +ones = torch.ones(2, 3) +print(ones) + +torch.manual_seed(1729) +random = torch.rand(2, 3) +print(random) + + +######################################################################### +# The factory methods all do just what you’d expect - we have a tensor +# full of zeros, another full of ones, and another with random values +# between 0 and 1. +# +# Random Tensors and Seeding +# ~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Speaking of the random tensor, did you notice the call to +# ``torch.manual_seed()`` immediately preceding it? Initializing tensors, +# such as a model’s learning weights, with random values is common but +# there are times - especially in research settings - where you’ll want +# some assurance of the reproducibility of your results. Manually setting +# your random number generator’s seed is the way to do this. Let’s look +# more closely: +# + +torch.manual_seed(1729) +random1 = torch.rand(2, 3) +print(random1) + +random2 = torch.rand(2, 3) +print(random2) + +torch.manual_seed(1729) +random3 = torch.rand(2, 3) +print(random3) + +random4 = torch.rand(2, 3) +print(random4) + + +############################################################################ +# What you should see above is that ``random1`` and ``random3`` carry +# identical values, as do ``random2`` and ``random4``. Manually setting +# the RNG’s seed resets it, so that identical computations depending on +# random number should, in most settings, provide identical results. +# +# For more information, see the `PyTorch documentation on +# reproducibility `__. 
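+#
+# A related technique, if you want a reproducible stream of random numbers
+# without touching the global RNG state, is to seed a dedicated
+# ``torch.Generator`` and pass it to the factory methods (a small sketch;
+# the examples above use only ``torch.manual_seed()``):
+#
+
+g = torch.Generator().manual_seed(1729)
+print(torch.rand(2, 3, generator=g))   # reproducible, independent of the global seed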
+
+
+##########################################################################
+# Tensor Shapes
+# ~~~~~~~~~~~~~
+#
+# Often, when you’re performing operations on two or more tensors, they
+# will need to be of the same *shape* - that is, having the same number of
+# dimensions and the same number of cells in each dimension. For that, we
+# have the ``torch.*_like()`` methods:
+#
+
+x = torch.empty(2, 2, 3)
+print(x.shape)
+print(x)
+
+empty_like_x = torch.empty_like(x)
+print(empty_like_x.shape)
+print(empty_like_x)
+
+zeros_like_x = torch.zeros_like(x)
+print(zeros_like_x.shape)
+print(zeros_like_x)
+
+ones_like_x = torch.ones_like(x)
+print(ones_like_x.shape)
+print(ones_like_x)
+
+rand_like_x = torch.rand_like(x)
+print(rand_like_x.shape)
+print(rand_like_x)
+
+
+#########################################################################
+# The first new thing in the code cell above is the use of the ``.shape``
+# property on a tensor. This property contains a list of the extent of
+# each dimension of a tensor - in our case, ``x`` is a three-dimensional
+# tensor with shape 2 x 2 x 3.
+#
+# Below that, we call the ``.empty_like()``, ``.zeros_like()``,
+# ``.ones_like()``, and ``.rand_like()`` methods. Using the ``.shape``
+# property, we can verify that each of these methods returns a tensor of
+# identical dimensionality and extent.
+#
+# The last way to create a tensor that we’ll cover is to specify its data
+# directly from a PyTorch collection:
+#
+
+some_constants = torch.tensor([[3.1415926, 2.71828], [1.61803, 0.0072897]])
+print(some_constants)
+
+some_integers = torch.tensor((2, 3, 5, 7, 11, 13, 17, 19))
+print(some_integers)
+
+more_integers = torch.tensor(((2, 4, 6), [3, 6, 9]))
+print(more_integers)
+
+
+######################################################################
+# Using ``torch.tensor()`` is the most straightforward way to create a
+# tensor if you already have data in a Python tuple or list. As shown
+# above, nesting the collections will result in a multi-dimensional
+# tensor.
+#
+# .. note::
+#      ``torch.tensor()`` creates a copy of the data.
+#
+# Tensor Data Types
+# ~~~~~~~~~~~~~~~~~
+#
+# Setting the datatype of a tensor is possible in a couple of ways:
+#
+
+a = torch.ones((2, 3), dtype=torch.int16)
+print(a)
+
+b = torch.rand((2, 3), dtype=torch.float64) * 20.
+print(b)
+
+c = b.to(torch.int32)
+print(c)
+
+
+##########################################################################
+# The simplest way to set the underlying data type of a tensor is with an
+# optional argument at creation time. In the first line of the cell above,
+# we set ``dtype=torch.int16`` for the tensor ``a``. When we print ``a``,
+# we can see that it’s full of ``1`` rather than ``1.`` - Python’s subtle
+# cue that this is an integer type rather than floating point.
+#
+# Another thing to notice about printing ``a`` is that, unlike when we
+# left ``dtype`` as the default (32-bit floating point), printing the
+# tensor also specifies its ``dtype``.
+#
+# You may have also spotted that we went from specifying the tensor’s
+# shape as a series of integer arguments, to grouping those arguments in a
+# tuple. This is not strictly necessary - PyTorch will take a series of
+# initial, unlabeled integer arguments as a tensor shape - but when adding
+# the optional arguments, it can make your intent more readable.
+#
+# The other way to set the datatype is with the ``.to()`` method. In the
+# cell above, we create a random floating point tensor ``b`` in the usual
+# way.
+# Following that, we create ``c`` by converting ``b`` to a 32-bit
+# integer with the ``.to()`` method. Note that ``c`` contains all the same
+# values as ``b``, but truncated to integers.
+#
+# Available data types include:
+#
+# -  ``torch.bool``
+# -  ``torch.int8``
+# -  ``torch.uint8``
+# -  ``torch.int16``
+# -  ``torch.int32``
+# -  ``torch.int64``
+# -  ``torch.half``
+# -  ``torch.float``
+# -  ``torch.double``
+# -  ``torch.bfloat16``
+#
+# Math & Logic with PyTorch Tensors
+# ---------------------------------
+#
+# Now that you know some of the ways to create a tensor… what can you do
+# with them?
+#
+# Let’s look at basic arithmetic first, and how tensors interact with
+# simple scalars:
+#
+
+ones = torch.zeros(2, 2) + 1
+twos = torch.ones(2, 2) * 2
+threes = (torch.ones(2, 2) * 7 - 1) / 2
+fours = twos ** 2
+sqrt2s = twos ** 0.5
+
+print(ones)
+print(twos)
+print(threes)
+print(fours)
+print(sqrt2s)
+
+
+##########################################################################
+# As you can see above, arithmetic operations between tensors and scalars,
+# such as addition, subtraction, multiplication, division, and
+# exponentiation are distributed over every element of the tensor. Because
+# the output of such an operation will be a tensor, you can chain them
+# together with the usual operator precedence rules, as in the line where
+# we create ``threes``.
+#
+# Similar operations between two tensors also behave like you’d
+# intuitively expect:
+#
+
+powers2 = twos ** torch.tensor([[1, 2], [3, 4]])
+print(powers2)
+
+fives = ones + fours
+print(fives)
+
+dozens = threes * fours
+print(dozens)
+
+
+##########################################################################
+# It’s important to note here that all of the tensors in the previous code
+# cell were of identical shape. What happens when we try to perform a
+# binary operation on tensors of dissimilar shape?
+#
+# .. note::
+#      The following cell throws a run-time error. This is intentional.
+#
+#    ::
+#
+#       a = torch.rand(2, 3)
+#       b = torch.rand(3, 2)
+#
+#       print(a * b)
+#
+
+
+##########################################################################
+# In the general case, you cannot operate on tensors of different shape
+# this way, even in a case like the cell above, where the tensors have an
+# identical number of elements.
+#
+# In Brief: Tensor Broadcasting
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# .. note::
+#      If you are familiar with broadcasting semantics in NumPy
+#      ndarrays, you’ll find the same rules apply here.
+#
+# The exception to the same-shapes rule is *tensor broadcasting.* Here’s
+# an example:
+#
+
+rand = torch.rand(2, 4)
+doubled = rand * (torch.ones(1, 4) * 2)
+
+print(rand)
+print(doubled)
+
+
+#########################################################################
+# What’s the trick here? How is it we got to multiply a 2x4 tensor by a
+# 1x4 tensor?
+#
+# Broadcasting is a way to perform an operation between tensors that have
+# similarities in their shapes. In the example above, the one-row,
+# four-column tensor is multiplied by *both rows* of the two-row,
+# four-column tensor.
+#
+# This is an important operation in Deep Learning. The common example is
+# multiplying a tensor of learning weights by a *batch* of input tensors,
+# applying the operation to each instance in the batch separately, and
+# returning a tensor of identical shape - just like our (2, 4) \* (1, 4)
+# example above returned a tensor of shape (2, 4).
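+#
+# Concretely, the weights-times-batch pattern from the paragraph above
+# looks like this (a small sketch with made-up sizes):
+#
+
+batch = torch.rand(16, 4)        # a batch of 16 four-element inputs
+weights = torch.rand(1, 4)       # one weight per input element
+print((batch * weights).shape)   # torch.Size([16, 4]) - weights broadcast over the batch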
+
+
+##########################################################################
+# The rules for broadcasting are:
+#
+# -  Each tensor must have at least one dimension - no empty tensors.
+#
+# -  Comparing the dimension sizes of the two tensors, *going from last to
+#    first:*
+#
+#    -  Each dimension must be equal, *or*
+#
+#    -  One of the dimensions must be of size 1, *or*
+#
+#    -  The dimension does not exist in one of the tensors
+#
+# Tensors of identical shape, of course, are trivially “broadcastable”, as
+# you saw earlier.
+#
+# Here are some examples of situations that honor the above rules and
+# allow broadcasting:
+#
+
+a = torch.ones(4, 3, 2)
+
+b = a * torch.rand(   3, 2) # 3rd & 2nd dims identical to a, dim 1 absent
+print(b)
+
+c = a * torch.rand(   3, 1) # 3rd dim = 1, 2nd dim identical to a
+print(c)
+
+d = a * torch.rand(   1, 2) # 3rd dim identical to a, 2nd dim = 1
+print(d)
+
+
+#############################################################################
+# Look closely at the values of each tensor above:
+#
+# -  The multiplication operation that created ``b`` was
+#    broadcast over every “layer” of ``a``.
+# -  For ``c``, the operation was broadcast over every layer and row of
+#    ``a`` - every 3-element column is identical.
+# -  For ``d``, we switched it around - now every *row* is identical,
+#    across layers and columns.
+#
+# For more information on broadcasting, see the `PyTorch
+# documentation `__
+# on the topic.
+#
+# Here are some examples of attempts at broadcasting that will fail:
+#
+# .. note::
+#      The following cell throws a run-time error. This is intentional.
+#
+#    ::
+#
+#       a = torch.ones(4, 3, 2)
+#
+#       b = a * torch.rand(4, 3)    # dimensions must match last-to-first
+#
+#       c = a * torch.rand(   2, 3) # both 3rd & 2nd dims different
+#
+#       d = a * torch.rand((0, ))   # can't broadcast with an empty tensor
+#
+
+
+###########################################################################
+# More Math with Tensors
+# ~~~~~~~~~~~~~~~~~~~~~~
+#
+# PyTorch tensors have over three hundred operations that can be performed
+# on them.
+#
+# Here is a small sample from some of the major categories of operations:
+#
+
+# common functions
+a = torch.rand(2, 4) * 2 - 1
+print('Common functions:')
+print(torch.abs(a))
+print(torch.ceil(a))
+print(torch.floor(a))
+print(torch.clamp(a, -0.5, 0.5))
+
+# trigonometric functions and their inverses
+angles = torch.tensor([0, math.pi / 4, math.pi / 2, 3 * math.pi / 4])
+sines = torch.sin(angles)
+inverses = torch.asin(sines)
+print('\nSine and arcsine:')
+print(angles)
+print(sines)
+print(inverses)
+
+# bitwise operations
+print('\nBitwise XOR:')
+b = torch.tensor([1, 5, 11])
+c = torch.tensor([2, 7, 10])
+print(torch.bitwise_xor(b, c))
+
+# comparisons:
+print('\nBroadcasted, element-wise equality comparison:')
+d = torch.tensor([[1., 2.], [3., 4.]])
+e = torch.ones(1, 2)       # many comparison ops support broadcasting!
+print(torch.eq(d, e)) # returns a tensor of type bool + +# reductions: +print('\nReduction ops:') +print(torch.max(d)) # returns a single-element tensor +print(torch.max(d).item()) # extracts the value from the returned tensor +print(torch.mean(d)) # average +print(torch.std(d)) # standard deviation +print(torch.prod(d)) # product of all numbers +print(torch.unique(torch.tensor([1, 2, 1, 2, 1, 2]))) # filter unique elements + +# vector and linear algebra operations +v1 = torch.tensor([1., 0., 0.]) # x unit vector +v2 = torch.tensor([0., 1., 0.]) # y unit vector +m1 = torch.rand(2, 2) # random matrix +m2 = torch.tensor([[3., 0.], [0., 3.]]) # three times identity matrix + +print('\nVectors & Matrices:') +print(torch.cross(v2, v1)) # negative of z unit vector (v1 x v2 == -v2 x v1) +print(m1) +m3 = torch.matmul(m1, m2) +print(m3) # 3 times m1 +print(torch.svd(m3)) # singular value decomposition + + +################################################################################## +# This is a small sample of operations. For more details and the full inventory of +# math functions, have a look at the +# `documentation `__. +# +# Altering Tensors in Place +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Most binary operations on tensors will return a third, new tensor. When +# we say ``c = a * b`` (where ``a`` and ``b`` are tensors), the new tensor +# ``c`` will occupy a region of memory distinct from the other tensors. +# +# There are times, though, that you may wish to alter a tensor in place - +# for example, if you’re doing an element-wise computation where you can +# discard intermediate values. For this, most of the math functions have a +# version with an appended underscore (``_``) that will alter a tensor in +# place. +# +# For example: +# + +a = torch.tensor([0, math.pi / 4, math.pi / 2, 3 * math.pi / 4]) +print('a:') +print(a) +print(torch.sin(a)) # this operation creates a new tensor in memory +print(a) # a has not changed + +b = torch.tensor([0, math.pi / 4, math.pi / 2, 3 * math.pi / 4]) +print('\nb:') +print(b) +print(torch.sin_(b)) # note the underscore +print(b) # b has changed + + +####################################################################### +# For arithmetic operations, there are functions that behave similarly: +# + +a = torch.ones(2, 2) +b = torch.rand(2, 2) + +print('Before:') +print(a) +print(b) +print('\nAfter adding:') +print(a.add_(b)) +print(a) +print(b) +print('\nAfter multiplying') +print(b.mul_(b)) +print(b) + + +########################################################################## +# Note that these in-place arithmetic functions are methods on the +# ``torch.Tensor`` object, not attached to the ``torch`` module like many +# other functions (e.g., ``torch.sin()``). As you can see from +# ``a.add_(b)``, *the calling tensor is the one that gets changed in +# place.* +# +# There is another option for placing the result of a computation in an +# existing, allocated tensor. Many of the methods and functions we’ve seen +# so far - including creation methods! - have an ``out`` argument that +# lets you specify a tensor to receive the output. 
If the ``out`` tensor
+# is the correct shape and ``dtype``, this can happen without a new memory
+# allocation:
+#
+
+a = torch.rand(2, 2)
+b = torch.rand(2, 2)
+c = torch.zeros(2, 2)
+old_id = id(c)
+
+print(c)
+d = torch.matmul(a, b, out=c)
+print(c)                 # contents of c have changed
+
+assert c is d            # test c & d are same object, not just containing equal values
+assert id(c) == old_id   # make sure that our new c is the same object as the old one
+
+torch.rand(2, 2, out=c)  # works for creation too!
+print(c)                 # c has changed again
+assert id(c) == old_id   # still the same object!
+
+
+##########################################################################
+# Copying Tensors
+# ---------------
+#
+# As with any object in Python, assigning a tensor to a variable makes the
+# variable a *label* of the tensor, and does not copy it. For example:
+#
+
+a = torch.ones(2, 2)
+b = a
+
+a[0][1] = 561  # we change a...
+print(b)       # ...and b is also altered
+
+
+######################################################################
+# But what if you want a separate copy of the data to work on? The
+# ``clone()`` method is there for you:
+#
+
+a = torch.ones(2, 2)
+b = a.clone()
+
+assert b is not a      # different objects in memory...
+print(torch.eq(a, b))  # ...but still with the same contents!
+
+a[0][1] = 561          # a changes...
+print(b)               # ...but b is still all ones
+
+
+#########################################################################
+# **There is an important thing to be aware of when using ``clone()``.**
+# If your source tensor has autograd enabled, then so will the clone.
+# **This will be covered more deeply in the video on autograd,** but if
+# you want the light version of the details, continue on.
+#
+# *In many cases, this will be what you want.* For example, if your model
+# has multiple computation paths in its ``forward()`` method, and *both*
+# the original tensor and its clone contribute to the model’s output, then
+# to enable model learning you want autograd turned on for both tensors.
+# If your source tensor has autograd enabled (which it generally will if
+# it’s a set of learning weights or derived from a computation involving
+# the weights), then you’ll get the result you want.
+#
+# On the other hand, if you’re doing a computation where *neither* the
+# original tensor nor its clone need to track gradients, then as long as
+# the source tensor has autograd turned off, you’re good to go.
+#
+# *There is a third case,* though: Imagine you’re performing a computation
+# in your model’s ``forward()`` function, where gradients are turned on
+# for everything by default, but you want to pull out some values
+# mid-stream to generate some metrics. In this case, you *don’t* want the
+# cloned copy of your source tensor to track gradients - performance is
+# improved with autograd’s history tracking turned off. For this, you can
+# use the ``.detach()`` method on the source tensor:
+#
+
+a = torch.rand(2, 2, requires_grad=True) # turn on autograd
+print(a)
+
+b = a.clone()
+print(b)
+
+c = a.detach().clone()
+print(c)
+
+print(a)
+
+
+#########################################################################
+# What’s happening here?
+#
+# - We create ``a`` with ``requires_grad=True`` turned on. **We haven’t
+#   covered this optional argument yet, but will during the unit on
+#   autograd.**
+# - When we print ``a``, it informs us that the property
+#   ``requires_grad=True`` is set - this means that autograd and computation
+#   history tracking are turned on.
+# - We clone ``a`` and label it ``b``.
+#   When we print ``b``, we can see
+#   that it’s tracking its computation history - it has inherited
+#   ``a``\ ’s autograd settings, and added to the computation history.
+# - We clone ``a`` into ``c``, but we call ``detach()`` first.
+# - Printing ``c``, we see no computation history, and no
+#   ``requires_grad=True``.
+#
+# The ``detach()`` method *detaches the tensor from its computation
+# history.* It says, “do whatever comes next as if autograd was off.” It
+# does this *without* changing ``a`` - you can see that when we print
+# ``a`` again at the end, it retains its ``requires_grad=True`` property.
+#
+# Moving to GPU
+# -------------
+#
+# One of the major advantages of PyTorch is its robust acceleration on
+# CUDA-compatible Nvidia GPUs. (“CUDA” stands for *Compute Unified Device
+# Architecture*, which is Nvidia’s platform for parallel computing.) So
+# far, everything we’ve done has been on CPU. How do we move to the faster
+# hardware?
+#
+# First, we should check whether a GPU is available, with the
+# ``is_available()`` method.
+#
+# .. note::
+#      If you do not have a CUDA-compatible GPU and CUDA drivers
+#      installed, the executable cells in this section will not execute any
+#      GPU-related code.
+#
+
+if torch.cuda.is_available():
+    print('We have a GPU!')
+else:
+    print('Sorry, CPU only.')
+
+
+##########################################################################
+# Once we’ve determined that one or more GPUs are available, we need to put
+# our data someplace where the GPU can see it. Your CPU does computation
+# on data in your computer’s RAM. Your GPU has dedicated memory attached
+# to it. Whenever you want to perform a computation on a device, you must
+# move *all* the data needed for that computation to memory accessible by
+# that device. (Colloquially, “moving the data to memory accessible by the
+# GPU” is shortened to “moving the data to the GPU”.)
+#
+# There are multiple ways to get your data onto your target device. You
+# may do it at creation time:
+#
+
+if torch.cuda.is_available():
+    gpu_rand = torch.rand(2, 2, device='cuda')
+    print(gpu_rand)
+else:
+    print('Sorry, CPU only.')
+
+
+##########################################################################
+# By default, new tensors are created on the CPU, so we have to specify
+# when we want to create our tensor on the GPU with the optional
+# ``device`` argument. You can see when we print the new tensor, PyTorch
+# informs us which device it’s on (if it’s not on CPU).
+#
+# You can query the number of GPUs with ``torch.cuda.device_count()``. If
+# you have more than one GPU, you can specify them by index:
+# ``device='cuda:0'``, ``device='cuda:1'``, etc.
+#
+# As a coding practice, specifying our devices everywhere with string
+# constants is pretty fragile. In an ideal world, your code would perform
+# robustly whether you’re on CPU or GPU hardware. You can do this by
+# creating a device handle that can be passed to your tensors instead of a
+# string:
+#
+
+if torch.cuda.is_available():
+    my_device = torch.device('cuda')
+else:
+    my_device = torch.device('cpu')
+print('Device: {}'.format(my_device))
+
+x = torch.rand(2, 2, device=my_device)
+print(x)
+
+
+#########################################################################
+# If you have an existing tensor living on one device, you can move it to
+# another with the ``to()`` method. The following line of code creates a
+# tensor on CPU, and moves it to whichever device handle you acquired in
+# the previous cell.
+#
+
+y = torch.rand(2, 2)
+y = y.to(my_device)
+
+
+##########################################################################
+# It is important to know that in order to do computation involving two or
+# more tensors, *all of the tensors must be on the same device*. The
+# following code will throw a runtime error, regardless of whether you
+# have a GPU device available:
+#
+# ::
+#
+#    x = torch.rand(2, 2)
+#    y = torch.rand(2, 2, device='cuda')
+#    z = x + y  # exception will be thrown
+#
+
+
+###########################################################################
+# Manipulating Tensor Shapes
+# --------------------------
+#
+# Sometimes, you’ll need to change the shape of your tensor. Below, we’ll
+# look at a few common cases, and how to handle them.
+#
+# Changing the Number of Dimensions
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# One case where you might need to change the number of dimensions is
+# passing a single instance of input to your model. PyTorch models
+# generally expect *batches* of input.
+#
+# For example, imagine having a model that works on 3 x 226 x 226 images -
+# a 226-pixel square with 3 color channels. When you load and transform
+# it, you’ll get a tensor of shape ``(3, 226, 226)``. Your model, though,
+# is expecting input of shape ``(N, 3, 226, 226)``, where ``N`` is the
+# number of images in the batch. So how do you make a batch of one?
+#
+
+a = torch.rand(3, 226, 226)
+b = a.unsqueeze(0)
+
+print(a.shape)
+print(b.shape)
+
+
+##########################################################################
+# The ``unsqueeze()`` method adds a dimension of extent 1.
+# ``unsqueeze(0)`` adds it as a new zeroth dimension - now you have a
+# batch of one!
+#
+# So if that’s *un*\ squeezing, what do we mean by squeezing? We’re taking
+# advantage of the fact that any dimension of extent 1 *does not* change
+# the number of elements in the tensor.
+#
+
+c = torch.rand(1, 1, 1, 1, 1)
+print(c)
+
+
+##########################################################################
+# Continuing the example above, let’s say the model’s output is a
+# 20-element vector for each input. You would then expect the output to
+# have shape ``(N, 20)``, where ``N`` is the number of instances in the
+# input batch. That means that for our single-input batch, we’ll get an
+# output of shape ``(1, 20)``.
+#
+# What if you want to do some *non-batched* computation with that output -
+# something that’s just expecting a 20-element vector?
+#
+
+a = torch.rand(1, 20)
+print(a.shape)
+print(a)
+
+b = a.squeeze(0)
+print(b.shape)
+print(b)
+
+c = torch.rand(2, 2)
+print(c.shape)
+
+d = c.squeeze(0)
+print(d.shape)
+
+
+#########################################################################
+# You can see from the shapes that our 2-dimensional tensor is now
+# 1-dimensional, and if you look closely at the output of the cell above
+# you’ll see that printing ``a`` shows an “extra” set of square brackets
+# ``[]`` due to having an extra dimension.
+#
+# You may only ``squeeze()`` dimensions of extent 1. See above where we
+# try to squeeze a dimension of size 2 in ``c``, and get back the same
+# shape we started with. Calls to ``squeeze()`` and ``unsqueeze()`` can
+# only act on dimensions of extent 1 because to do otherwise would change
+# the number of elements in the tensor.
+#
+# Another place you might use ``unsqueeze()`` is to ease broadcasting.
+# Recall the example above where we had the following code:
+#
+# ::
+#
+#    a = torch.ones(4, 3, 2)
+#
+#    c = a * torch.rand(   3, 1) # 3rd dim = 1, 2nd dim identical to a
+#    print(c)
+#
+# The net effect of that was to broadcast the operation over dimensions 0
+# and 2, causing the random, 3 x 1 tensor to be multiplied element-wise by
+# every 3-element column in ``a``.
+#
+# What if the random vector had just been a 3-element vector? We’d lose the
+# ability to do the broadcast, because the final dimensions would not
+# match up according to the broadcasting rules. ``unsqueeze()`` comes to
+# the rescue:
+#
+
+a = torch.ones(4, 3, 2)
+b = torch.rand(   3)     # trying to multiply a * b will give a runtime error
+c = b.unsqueeze(1)       # change to a 2-dimensional tensor, adding new dim at the end
+print(c.shape)
+print(a * c)             # broadcasting works again!
+
+
+######################################################################
+# The ``squeeze()`` and ``unsqueeze()`` methods also have in-place
+# versions, ``squeeze_()`` and ``unsqueeze_()``:
+#
+
+batch_me = torch.rand(3, 226, 226)
+print(batch_me.shape)
+batch_me.unsqueeze_(0)
+print(batch_me.shape)
+
+
+##########################################################################
+# Sometimes you’ll want to change the shape of a tensor more radically,
+# while still preserving the number of elements and their contents. One
+# case where this happens is at the interface between a convolutional
+# layer of a model and a linear layer of the model - this is common in
+# image classification models. A convolution kernel will yield an output
+# tensor of shape *features x width x height,* but the following linear
+# layer expects a 1-dimensional input. ``reshape()`` will do this for you,
+# provided that the dimensions you request yield the same number of
+# elements as the input tensor has:
+#
+
+output3d = torch.rand(6, 20, 20)
+print(output3d.shape)
+
+input1d = output3d.reshape(6 * 20 * 20)
+print(input1d.shape)
+
+# can also call it as a method on the torch module:
+print(torch.reshape(output3d, (6 * 20 * 20,)).shape)
+
+
+###############################################################################
+# .. note::
+#      The ``(6 * 20 * 20,)`` argument in the final line of the cell
+#      above is because PyTorch expects a **tuple** when specifying a
+#      tensor shape - but when the shape is the first argument of a method, it
+#      lets us cheat and just use a series of integers. Here, we had to add the
+#      parentheses and comma to convince the method that this is really a
+#      one-element tuple.
+#
+# When it can, ``reshape()`` will return a *view* on the tensor to be
+# changed - that is, a separate tensor object looking at the same
+# underlying region of memory. *This is important:* That means any change
+# made to the source tensor will be reflected in the view on that tensor,
+# unless you ``clone()`` it.
+#
+# There *are* conditions, beyond the scope of this introduction, where
+# ``reshape()`` has to return a tensor carrying a copy of the data. For
+# more information, see the
+# `docs `__.
+#
+
+
+#######################################################################
+# NumPy Bridge
+# ------------
+#
+# In the section above on broadcasting, it was mentioned that PyTorch’s
+# broadcast semantics are compatible with NumPy’s - but the kinship
+# between PyTorch and NumPy goes even deeper than that.
+# +# If you have existing ML or scientific code with data stored in NumPy +# ndarrays, you may wish to express that same data as PyTorch tensors, +# whether to take advantage of PyTorch’s GPU acceleration, or its +# efficient abstractions for building ML models. It’s easy to switch +# between ndarrays and PyTorch tensors: +# + +import numpy as np + +numpy_array = np.ones((2, 3)) +print(numpy_array) + +pytorch_tensor = torch.from_numpy(numpy_array) +print(pytorch_tensor) + + +########################################################################## +# PyTorch creates a tensor of the same shape and containing the same data +# as the NumPy array, going so far as to keep NumPy’s default 64-bit float +# data type. +# +# The conversion can just as easily go the other way: +# + +pytorch_rand = torch.rand(2, 3) +print(pytorch_rand) + +numpy_rand = pytorch_rand.numpy() +print(numpy_rand) + + +########################################################################## +# It is important to know that these converted objects are using *the same +# underlying memory* as their source objects, meaning that changes to one +# are reflected in the other: +# + +numpy_array[1, 1] = 23 +print(pytorch_tensor) + +pytorch_rand[1, 1] = 17 +print(numpy_rand) diff --git a/beginner_source/introyt/tocyt.txt b/beginner_source/introyt/tocyt.txt new file mode 100644 index 000000000..f956671c1 --- /dev/null +++ b/beginner_source/introyt/tocyt.txt @@ -0,0 +1,8 @@ +1. `Introduction to PyTorch `_ +2. `Introduction to PyTorch Tensors `_ +3. `The Fundamentals of Autograd `_ +4. `Building Models with PyTorch `_ +5. `PyTorch TensorBoard Support `_ +6. `Training with PyTorch `_ +7. `Model Understanding with Captum `_ +8. `Production Inference Deployment with PyTorch `_ (video only) diff --git a/beginner_source/introyt/trainingyt.py b/beginner_source/introyt/trainingyt.py new file mode 100644 index 000000000..b1f71b75f --- /dev/null +++ b/beginner_source/introyt/trainingyt.py @@ -0,0 +1,364 @@ +""" +`Introduction `_ || +`Tensors `_ || +`Autograd `_ || +`Building Models `_ || +`TensorBoard Support `_ || +**Training Models** || +`Model Understanding `_ + +Training with PyTorch +===================== + +Follow along with the video below or on `youtube `__. + +.. raw:: html + +
+ +
+
+Introduction
+------------
+
+In past videos, we’ve discussed and demonstrated:
+
+- Building models with the neural network layers and functions of the torch.nn module
+- The mechanics of automated gradient computation, which is central to
+  gradient-based model training
+- Using TensorBoard to visualize training progress and other activities
+
+In this video, we’ll be adding some new tools to your inventory:
+
+- We’ll get familiar with the dataset and dataloader abstractions, and how
+  they ease the process of feeding data to your model during a training loop
+- We’ll discuss specific loss functions and when to use them
+- We’ll look at PyTorch optimizers, which implement algorithms to adjust
+  model weights based on the outcome of a loss function
+
+Finally, we’ll pull all of these together and see a full PyTorch
+training loop in action.
+
+
+Dataset and DataLoader
+----------------------
+
+The ``Dataset`` and ``DataLoader`` classes encapsulate the process of
+pulling your data from storage and exposing it to your training loop in
+batches.
+
+The ``Dataset`` is responsible for accessing and processing single
+instances of data.
+
+The ``DataLoader`` pulls instances of data from the ``Dataset`` (either
+automatically or with a sampler that you define), collects them in
+batches, and returns them for consumption by your training loop. The
+``DataLoader`` works with all kinds of datasets, regardless of the type
+of data they contain.
+
+For this tutorial, we’ll be using the Fashion-MNIST dataset provided by
+TorchVision. We use ``torchvision.transforms.Normalize()`` to
+zero-center and normalize the distribution of the image tile content,
+and download both training and validation data splits.
+
+"""
+
+import torch
+import torchvision
+import torchvision.transforms as transforms
+
+# PyTorch TensorBoard support
+from torch.utils.tensorboard import SummaryWriter
+from datetime import datetime
+
+
+transform = transforms.Compose(
+    [transforms.ToTensor(),
+     transforms.Normalize((0.5,), (0.5,))])
+
+# Create datasets for training & validation, download if necessary
+training_set = torchvision.datasets.FashionMNIST('./data', train=True, transform=transform, download=True)
+validation_set = torchvision.datasets.FashionMNIST('./data', train=False, transform=transform, download=True)
+
+# Create data loaders for our datasets; shuffle for training, not for validation
+training_loader = torch.utils.data.DataLoader(training_set, batch_size=4, shuffle=True, num_workers=2)
+validation_loader = torch.utils.data.DataLoader(validation_set, batch_size=4, shuffle=False, num_workers=2)
+
+# Class labels
+classes = ('T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
+           'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle Boot')
+
+# Report split sizes
+print('Training set has {} instances'.format(len(training_set)))
+print('Validation set has {} instances'.format(len(validation_set)))
+
+
+######################################################################
+# As always, let’s visualize the data as a sanity check:
+#
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+# Helper function for inline image display
+def matplotlib_imshow(img, one_channel=False):
+    if one_channel:
+        img = img.mean(dim=0)
+    img = img / 2 + 0.5     # unnormalize
+    npimg = img.numpy()
+    if one_channel:
+        plt.imshow(npimg, cmap="Greys")
+    else:
+        plt.imshow(np.transpose(npimg, (1, 2, 0)))
+
+dataiter = iter(training_loader)
+images, labels = next(dataiter)
+
+# Create a grid from the images and show them
+img_grid = torchvision.utils.make_grid(images)
+matplotlib_imshow(img_grid, one_channel=True)
+print('  '.join(classes[labels[j]] for j in range(4)))
+
+
+#########################################################################
+# The Model
+# ---------
+#
+# The model we’ll use in this example is a variant of LeNet-5 - it should
+# be familiar if you’ve watched the previous videos in this series.
+#
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+# PyTorch models inherit from torch.nn.Module
+class GarmentClassifier(nn.Module):
+    def __init__(self):
+        super(GarmentClassifier, self).__init__()
+        self.conv1 = nn.Conv2d(1, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 4 * 4, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 4 * 4)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+
+model = GarmentClassifier()
+
+
+##########################################################################
+# Loss Function
+# -------------
+#
+# For this example, we’ll be using a cross-entropy loss. For demonstration
+# purposes, we’ll create batches of dummy output and label values, run
+# them through the loss function, and examine the result.
+#
+
+loss_fn = torch.nn.CrossEntropyLoss()
+
+# NB: Loss functions expect data in batches, so we're creating batches of 4
+# Represents the model's confidence in each of the 10 classes for a given input
+dummy_outputs = torch.rand(4, 10)
+# Represents the correct class among the 10 being tested
+dummy_labels = torch.tensor([1, 5, 3, 7])
+
+print(dummy_outputs)
+print(dummy_labels)
+
+loss = loss_fn(dummy_outputs, dummy_labels)
+print('Total loss for this batch: {}'.format(loss.item()))
+
+
+#################################################################################
+# Optimizer
+# ---------
+#
+# For this example, we’ll be using simple `stochastic gradient
+# descent `__ with momentum.
+#
+# It can be instructive to try some variations on this optimization
+# scheme:
+#
+# - Learning rate determines the size of the steps the optimizer
+#   takes. What does a different learning rate do to your training
+#   results, in terms of accuracy and convergence time?
+# - Momentum nudges the optimizer in the direction of strongest gradient over
+#   multiple steps. What does changing this value do to your results?
+# - Try some different optimization algorithms, such as averaged SGD, Adagrad, or
+#   Adam. How do your results differ?
+#
+
+# Optimizers specified in the torch.optim package
+optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
+
+
+#######################################################################################
+# The Training Loop
+# -----------------
+#
+# Below, we have a function that performs one training epoch. It
+# enumerates data from the DataLoader, and on each pass of the loop does
+# the following:
+#
+# - Gets a batch of training data from the DataLoader
+# - Zeros the optimizer’s gradients
+# - Performs an inference - that is, gets predictions from the model for an input batch
+# - Calculates the loss for that set of predictions vs. the labels on the dataset
+# - Calculates the backward gradients over the learning weights
+# - Tells the optimizer to perform one learning step - that is, adjust the model’s
+#   learning weights based on the observed gradients for this batch, according to the
+#   optimization algorithm we chose
+# - Reports the loss for every 1000 batches
+# - Finally, reports the average per-batch loss for the last
+#   1000 batches, for comparison with a validation run
+#
+
+def train_one_epoch(epoch_index, tb_writer):
+    running_loss = 0.
+    last_loss = 0.
+
+    # Here, we use enumerate(training_loader) instead of
+    # iter(training_loader) so that we can track the batch
+    # index and do some intra-epoch reporting
+    for i, data in enumerate(training_loader):
+        # Every data instance is an input + label pair
+        inputs, labels = data
+
+        # Zero your gradients for every batch!
+        optimizer.zero_grad()
+
+        # Make predictions for this batch
+        outputs = model(inputs)
+
+        # Compute the loss and its gradients
+        loss = loss_fn(outputs, labels)
+        loss.backward()
+
+        # Adjust learning weights
+        optimizer.step()
+
+        # Gather data and report
+        running_loss += loss.item()
+        if i % 1000 == 999:
+            last_loss = running_loss / 1000 # loss per batch
+            print('  batch {} loss: {}'.format(i + 1, last_loss))
+            tb_x = epoch_index * len(training_loader) + i + 1
+            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
+            running_loss = 0.
+
+    return last_loss
+
+
+##################################################################################
+# Per-Epoch Activity
+# ~~~~~~~~~~~~~~~~~~
+#
+# There are a couple of things we’ll want to do once per epoch:
+#
+# - Perform validation by checking our relative loss on a set of data that was not
+#   used for training, and report this
+# - Save a copy of the model
+#
+# Here, we’ll do our reporting in TensorBoard. This will require going to
+# the command line to start TensorBoard, and opening it in another browser
+# tab.
+#
+
+# Initializing in a separate cell so we can easily add more epochs to the same run
+timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))
+epoch_number = 0
+
+EPOCHS = 5
+
+best_vloss = 1_000_000.
+
+for epoch in range(EPOCHS):
+    print('EPOCH {}:'.format(epoch_number + 1))
+
+    # Make sure gradient tracking is on, and do a pass over the data
+    model.train(True)
+    avg_loss = train_one_epoch(epoch_number, writer)
+
+    # We don't need gradients on to do reporting
+    model.train(False)
+
+    running_vloss = 0.0
+    for i, vdata in enumerate(validation_loader):
+        vinputs, vlabels = vdata
+        voutputs = model(vinputs)
+        vloss = loss_fn(voutputs, vlabels)
+        running_vloss += vloss
+
+    avg_vloss = running_vloss / (i + 1)
+    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))
+
+    # Log the running loss averaged per batch
+    # for both training and validation
+    writer.add_scalars('Training vs.
Validation Loss', + { 'Training' : avg_loss, 'Validation' : avg_vloss }, + epoch_number + 1) + writer.flush() + + # Track best performance, and save the model's state + if avg_vloss < best_vloss: + best_vloss = avg_vloss + model_path = 'model_{}_{}'.format(timestamp, epoch_number) + torch.save(model.state_dict(), model_path) + + epoch_number += 1 + + +######################################################################### +# To load a saved version of the model: +# +# :: +# +# saved_model = GarmentClassifier() +# saved_model.load_state_dict(torch.load(PATH)) +# +# Once you’ve loaded the model, it’s ready for whatever you need it for - +# more training, inference, or analysis. +# +# Note that if your model has constructor parameters that affect model +# structure, you’ll need to provide them and configure the model +# identically to the state in which it was saved. +# +# Other Resources +# --------------- +# +# - Docs on the `data +# utilities `__, including +# Dataset and DataLoader, at pytorch.org +# - A `note on the use of pinned +# memory `__ +# for GPU training +# - Documentation on the datasets available in +# `TorchVision `__, +# `TorchText `__, and +# `TorchAudio `__ +# - Documentation on the `loss +# functions `__ +# available in PyTorch +# - Documentation on the `torch.optim +# package `__, which +# includes optimizers and related tools, such as learning rate +# scheduling +# - A detailed `tutorial on saving and loading +# models `__ +# - The `Tutorials section of +# pytorch.org `__ contains tutorials on +# a broad variety of training tasks, including classification in +# different domains, generative adversarial networks, reinforcement +# learning, and more +# diff --git a/beginner_source/nlp/sequence_models_tutorial.py b/beginner_source/nlp/sequence_models_tutorial.py index de92b5101..bb8d97d69 100644 --- a/beginner_source/nlp/sequence_models_tutorial.py +++ b/beginner_source/nlp/sequence_models_tutorial.py @@ -13,7 +13,7 @@ A recurrent neural network is a network that maintains some kind of state. For example, its output could be used as part of the next input, -so that information can propogate along as the network passes over the +so that information can propagate along as the network passes over the sequence. In the case of an LSTM, for each element in the sequence, there is a corresponding *hidden state* :math:`h_t`, which in principle can contain information from arbitrary points earlier in the sequence. diff --git a/beginner_source/nn_tutorial.py b/beginner_source/nn_tutorial.py index 67adc34cc..bdb9d8f25 100644 --- a/beginner_source/nn_tutorial.py +++ b/beginner_source/nn_tutorial.py @@ -138,7 +138,7 @@ def model(xb): return log_softmax(xb @ weights + bias) ############################################################################### -# 위에서, ``@`` 기호는 점곱(dot product) 연산을 나타냅니다. +# 위에서, ``@`` 기호는 행렬 곱셈(matrix multiplication) 연산을 나타냅니다. # 우리는 하나의 배치(batch) 데이터(이 경우에는 64개의 이미지들)에 대하여 함수를 호출할 것입니다. # 이것은 하나의 *포워드 전달(forward pass)* 입니다. 이 단계에서 우리는 무작위(random) 가중치로 # 시작했기 때문에 우리의 예측이 무작위 예측보다 전혀 나은 점이 없을 것입니다. diff --git a/beginner_source/ptcheat.rst b/beginner_source/ptcheat.rst index 542536ddd..55bfb1345 100644 --- a/beginner_source/ptcheat.rst +++ b/beginner_source/ptcheat.rst @@ -10,7 +10,7 @@ General .. 
code-block:: python import torch # root package - from torch.utils.data import Dataset, Dataloader # dataset representation and loading + from torch.utils.data import Dataset, DataLoader # dataset representation and loading Neural Network API ------------------ diff --git a/beginner_source/saving_loading_models.py b/beginner_source/saving_loading_models.py index 969299d35..79f8f635b 100644 --- a/beginner_source/saving_loading_models.py +++ b/beginner_source/saving_loading_models.py @@ -213,6 +213,41 @@ # 정규화를 평가 모드로 설정하여야 합니다. 이것을 하지 않으면 추론 결과가 일관성 # 없게 출력됩니다. # +# Export/Load Model in TorchScript Format +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# One common way to do inference with a trained model is to use +# `TorchScript `__, an intermediate +# representation of a PyTorch model that can be run in Python as well as in a +# high performance environment like C++. TorchScript is actually the recommended model format +# for scaled inference and deployment. +# +# .. note:: +# Using the TorchScript format, you will be able to load the exported model and +# run inference without defining the model class. +# +# **Export:** +# +# .. code:: python +# +# model_scripted = torch.jit.script(model) # Export to TorchScript +# model_scripted.save('model_scripted.pt') # Save +# +# **Load:** +# +# .. code:: python +# +# model = torch.jit.load('model_scripted.pt') +# model.eval() +# +# Remember that you must call ``model.eval()`` to set dropout and batch +# normalization layers to evaluation mode before running inference. +# Failing to do this will yield inconsistent inference results. +# +# For more information on TorchScript, feel free to visit the dedicated +# `tutorials `__. +# You will get familiar with the tracing conversion and learn how to +# run a TorchScript module in a `C++ environment `__. ###################################################################### diff --git a/beginner_source/transfer_learning_tutorial.py b/beginner_source/transfer_learning_tutorial.py index 2dfcd91ce..50fd4a7b8 100644 --- a/beginner_source/transfer_learning_tutorial.py +++ b/beginner_source/transfer_learning_tutorial.py @@ -282,7 +282,7 @@ def visualize_model(model, num_images=6): # --------------------------------------- # # 이제, 마지막 계층을 제외한 신경망의 모든 부분을 고정해야 합니다. -# ``requires_grad == False`` 로 설정하여 매개변수를 고정하여 ``backward()`` 중에 +# ``requires_grad = False`` 로 설정하여 매개변수를 고정하여 ``backward()`` 중에 # 경사도가 계산되지 않도록 해야합니다. # # 이에 대한 문서는 diff --git a/index.rst b/index.rst index 5ed917aae..1887d1cf3 100644 --- a/index.rst +++ b/index.rst @@ -57,6 +57,13 @@ :link: beginner/basics/intro.html :tags: Getting-Started +.. customcarditem:: + :header: Introduction to PyTorch on YouTube + :card_description: An introduction to building a complete ML workflow with PyTorch. Follows the PyTorch Beginner Series on YouTube. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: beginner/introyt.html + :tags: Getting-Started + .. customcarditem:: :header: 예제로 배우는 파이토치(PyTorch) :card_description: 튜토리얼에 포함된 예제들로 PyTorch의 기본 개념을 이해합니다. @@ -355,31 +362,38 @@ .. customcarditem:: :header: Registering a Dispatched Operator in C++ :card_description: The dispatcher is an internal component of PyTorch which is responsible for figuring out what code should actually get run when you call a function like torch::add. - :image: _static/img/thumbnails/cropped/generic-pytorch-logo.PNG + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png :link: advanced/dispatcher.html :tags: Extending-PyTorch,Frontend-APIs,C++ .. 
customcarditem:: :header: Extending Dispatcher For a New Backend in C++ :card_description: Learn how to extend the dispatcher to add a new device living outside of the pytorch/pytorch repo and maintain it to keep in sync with native PyTorch devices. - :image: _static/img/thumbnails/cropped/generic-pytorch-logo.PNG + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png :link: advanced/extend_dispatcher.html :tags: Extending-PyTorch,Frontend-APIs,C++ .. customcarditem:: :header: Custom Function Tutorial: Double Backward :card_description: Learn how to write a custom autograd Function that supports double backward. - :image: _static/img/thumbnails/cropped/generic-pytorch-logo.PNG + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png :link: intermediate/custom_function_double_backward_tutorial.html :tags: Extending-PyTorch,Frontend-APIs .. customcarditem:: :header: Custom Function Tutorial: Fusing Convolution and Batch Norm :card_description: Learn how to create a custom autograd Function that fuses batch norm into a convolution to improve memory usage. - :image: _static/img/thumbnails/cropped/generic-pytorch-logo.PNG + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png :link: intermediate/custom_function_conv_bn_tutorial.html :tags: Extending-PyTorch,Frontend-APIs +.. customcarditem:: + :header: Forward-mode Automatic Differentiation + :card_description: Learn how to use forward-mode automatic differentiation. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: intermediate/forward_ad_usage.html + :tags: Frontend-APIs + .. Model Optimization .. customcarditem:: @@ -645,6 +659,21 @@ beginner/basics/optimization_tutorial beginner/basics/saveloadrun_tutorial +.. toctree:: + :maxdepth: 2 + :hidden: + :includehidden: + :caption: Introduction to PyTorch on YouTube + + beginner/introyt + beginner/introyt/introyt1_tutorial + beginner/introyt/tensors_deeper_tutorial + beginner/introyt/autogradyt_tutorial + beginner/introyt/modelsyt_tutorial + beginner/introyt/tensorboardyt_tutorial + beginner/introyt/trainingyt + beginner/introyt/captumyt + .. toctree:: :maxdepth: 2 :hidden: @@ -735,6 +764,7 @@ :caption: 프론트엔드 API intermediate/memory_format_tutorial + intermediate/forward_ad_usage advanced/cpp_frontend advanced/torch-script-parallelism advanced/cpp_autograd diff --git a/intermediate_source/README.txt b/intermediate_source/README.txt index 9057a106b..f76bfeca2 100644 --- a/intermediate_source/README.txt +++ b/intermediate_source/README.txt @@ -3,7 +3,7 @@ Intermediate tutorials 1. tensorboard_tutorial.py Classifying Names with a Character-Level RNN - https://tutorials.pytorch.kr/beginner/tensorboard_tutorial.html + https://tutorials.pytorch.kr/intermediate/tensorboard_tutorial.html 2. char_rnn_classification_tutorial.py Classifying Names with a Character-Level RNN diff --git a/intermediate_source/forward_ad_usage.py b/intermediate_source/forward_ad_usage.py new file mode 100644 index 000000000..81ac5bcea --- /dev/null +++ b/intermediate_source/forward_ad_usage.py @@ -0,0 +1,176 @@ +# -*- coding: utf-8 -*- +""" +Forward-mode Automatic Differentiation (Beta) +============================================= + +This tutorial demonstrates how to use forward-mode AD to compute +directional derivatives (or equivalently, Jacobian-vector products). + +The tutorial below uses some APIs only available in versions >= 1.11 +(or nightly builds). + +Also note that forward-mode AD is currently in beta. 
The API is
+subject to change and operator coverage is still incomplete.
+
+Basic Usage
+--------------------------------------------------------------------
+Unlike reverse-mode AD, forward-mode AD computes gradients eagerly
+alongside the forward pass. We can use forward-mode AD to compute a
+directional derivative by performing the forward pass as before,
+except we first associate our input with another tensor representing
+the direction of the directional derivative (or equivalently, the ``v``
+in a Jacobian-vector product). When an input, which we call "primal", is
+associated with a "direction" tensor, which we call "tangent", the
+resultant new tensor object is called a "dual tensor" for its connection
+to dual numbers[0].
+
+As the forward pass is performed, if any input tensors are dual tensors,
+extra computation is performed to propagate this "sensitivity" of the
+function.
+
+"""
+
+import torch
+import torch.autograd.forward_ad as fwAD
+
+primal = torch.randn(10, 10)
+tangent = torch.randn(10, 10)
+
+def fn(x, y):
+    return x ** 2 + y ** 2
+
+# All forward AD computation must be performed within a ``dual_level``
+# context. All dual tensors created in such a context
+# will have their tangents destroyed upon exit. This is to ensure that
+# if the output or intermediate results of this computation are reused
+# in a future forward AD computation, their tangents (which are associated
+# with this computation) won't be confused with tangents from the later
+# computation.
+with fwAD.dual_level():
+    # To create a dual tensor we associate a tensor, which we call the
+    # primal, with another tensor of the same size, which we call the tangent.
+    # If the layout of the tangent is different from that of the primal,
+    # the values of the tangent are copied into a new tensor with the same
+    # metadata as the primal. Otherwise, the tangent itself is used as-is.
+    #
+    # It is also important to note that the dual tensor created by
+    # ``make_dual`` is a view of the primal.
+    dual_input = fwAD.make_dual(primal, tangent)
+    assert fwAD.unpack_dual(dual_input).tangent is tangent
+
+    # To demonstrate the case where the copy of the tangent happens,
+    # we pass in a tangent with a layout different from that of the primal
+    dual_input_alt = fwAD.make_dual(primal, tangent.T)
+    assert fwAD.unpack_dual(dual_input_alt).tangent is not tangent
+
+    # Tensors that do not have an associated tangent are automatically
+    # considered to have a zero-filled tangent of the same shape.
+    plain_tensor = torch.randn(10, 10)
+    dual_output = fn(dual_input, plain_tensor)
+
+    # Unpacking the dual returns a namedtuple with ``primal`` and ``tangent``
+    # as attributes
+    jvp = fwAD.unpack_dual(dual_output).tangent
+
+assert fwAD.unpack_dual(dual_output).tangent is None
+
+######################################################################
+# Usage with Modules
+# --------------------------------------------------------------------
+# To use ``nn.Module`` with forward AD, replace the parameters of your
+# model with dual tensors before performing the forward pass. At the
+# time of writing, it is not possible to create dual tensor
+# `nn.Parameter`s. As a workaround, one must register the dual tensor
+# as a non-parameter attribute of the module.
+
+import torch.nn as nn
+
+model = nn.Linear(5, 5)
+input = torch.randn(16, 5)
+
+params = {name: p for name, p in model.named_parameters()}
+tangents = {name: torch.rand_like(p) for name, p in params.items()}
+
+with fwAD.dual_level():
+    for name, p in params.items():
+        delattr(model, name)
+        setattr(model, name, fwAD.make_dual(p, tangents[name]))
+
+    out = model(input)
+    jvp = fwAD.unpack_dual(out).tangent
+
+######################################################################
+# Using the Modules stateless API (experimental)
+# --------------------------------------------------------------------
+# Another way to use ``nn.Module`` with forward AD is to utilize
+# the stateless API. NB: At the time of writing the stateless API is still
+# experimental and may be subject to change.
+
+from torch.nn.utils._stateless import functional_call
+
+# We need a fresh module because the functional call requires the
+# model to have parameters registered.
+model = nn.Linear(5, 5)
+
+dual_params = {}
+with fwAD.dual_level():
+    for name, p in params.items():
+        # Using the same ``tangents`` from the above section
+        dual_params[name] = fwAD.make_dual(p, tangents[name])
+    out = functional_call(model, dual_params, input)
+    jvp2 = fwAD.unpack_dual(out).tangent
+
+# Check our results
+assert torch.allclose(jvp, jvp2)
+
+######################################################################
+# Custom autograd Function
+# --------------------------------------------------------------------
+# Custom Functions also support forward-mode AD. To create a custom Function
+# supporting forward-mode AD, register the ``jvp()`` static method. It is
+# possible, but not mandatory, for custom Functions to support both forward
+# and backward AD. See the
+# `documentation `_
+# for more information.
+
+class Fn(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, foo):
+        result = torch.exp(foo)
+        # Tensors stored in ctx can be used in the subsequent forward grad
+        # computation.
+        ctx.result = result
+        return result
+
+    @staticmethod
+    def jvp(ctx, gI):
+        gO = gI * ctx.result
+        # If the tensor stored in ctx will not also be used in the backward pass,
+        # one can manually free it using ``del``
+        del ctx.result
+        return gO
+
+fn = Fn.apply
+
+primal = torch.randn(10, 10, dtype=torch.double, requires_grad=True)
+tangent = torch.randn(10, 10)
+
+with fwAD.dual_level():
+    dual_input = fwAD.make_dual(primal, tangent)
+    dual_output = fn(dual_input)
+    jvp = fwAD.unpack_dual(dual_output).tangent
+
+# It is important to use ``autograd.gradcheck`` to verify that your
+# custom autograd Function computes the gradients correctly. By default,
+# gradcheck only checks the backward-mode (reverse-mode) AD gradients. Specify
+# ``check_forward_ad=True`` to also check forward grads. If you did not
+# implement the backward formula for your function, you can also tell gradcheck
+# to skip the tests that require backward-mode AD by specifying
+# ``check_backward_ad=False``, ``check_undefined_grad=False``, and
+# ``check_batched_grad=False``.
+
+torch.autograd.gradcheck(Fn.apply, (primal,), check_forward_ad=True,
+                         check_backward_ad=False, check_undefined_grad=False,
+                         check_batched_grad=False)
+
+######################################################################
+# [0] https://en.wikipedia.org/wiki/Dual_number
diff --git a/prototype_source/tracing_based_selective_build.rst b/prototype_source/tracing_based_selective_build.rst
index ac71e258d..273a9db18 100644
--- a/prototype_source/tracing_based_selective_build.rst
+++ b/prototype_source/tracing_based_selective_build.rst
@@ -1,5 +1,5 @@
 (prototype) Tracing-based Selective Build Mobile Interpreter in Android and iOS
-
+===============================================================================
 
 *Author*: Chen Lai , Dhruv Matani 
 
@@ -8,6 +8,7 @@ Tracing-based selective build is a prototype feature to minimize library size. Since the traced result relies on the model input and the traced environment, if the tracer runs in a different environment than the mobile interpreter, the operator list might differ from the actually used operator list, and a missing-operator error might be raised.
 
 Introduction
+------------
 This tutorial introduces a new way to custom build the mobile interpreter to further optimize its size. It restricts the set of operators included in the compiled binary to only the set of operators actually needed by target models. It is a technique to reduce the binary size of PyTorch for mobile deployments. Tracing-Based Selective Build runs a model with specific representative inputs, and records which operators were called. The build then includes just those operators.
 
@@ -83,6 +84,7 @@ Following are the processes to use tracing-based selective approach to build a c
 
 Android
+-------
 
 Get the Image Segmentation demo app in Android: https://github.com/pytorch/android-demo-app/tree/master/ImageSegmentation
 
@@ -149,6 +151,7 @@ Update `all projects` part in ``ImageSegmentation/build.gradle`` to
 
 iOS
+---
 
 Get ImageSegmentation demo app in iOS: https://github.com/pytorch/ios-demo-app/tree/master/ImageSegmentation
diff --git a/recipes_source/intel_neural_compressor_for_pytorch.rst b/recipes_source/intel_neural_compressor_for_pytorch.rst
new file mode 100755
index 000000000..67f1a7f33
--- /dev/null
+++ b/recipes_source/intel_neural_compressor_for_pytorch.rst
@@ -0,0 +1,373 @@
+Ease-of-use quantization for PyTorch with Intel® Neural Compressor
+==================================================================
+
+Overview
+--------
+
+Most deep learning applications use 32-bit floating-point precision
+for inference. But low-precision data types, especially int8, are getting more
+attention due to their significant performance boost. One of the essential concerns
+in adopting low precision is how to easily mitigate the possible accuracy loss
+and reach a predefined accuracy requirement.
+
+Intel® Neural Compressor aims to address the aforementioned concern by extending
+PyTorch with accuracy-driven automatic tuning strategies to help users quickly find
+the best quantized model on Intel hardware, including Intel Deep Learning
+Boost (`Intel DL Boost `_)
+and Intel Advanced Matrix Extensions (`Intel AMX `_).
+
+Intel® Neural Compressor has been released as an open-source project
+at `Github `_.
+
+Features
+--------
+
+- **Ease-of-use Python API:** Intel® Neural Compressor provides simple frontend
+  Python APIs and utilities for users to do neural network compression with only
+  a few lines of code changed.
+  Typically, only 5 to 6 clauses need to be added to the original code.
+
+- **Quantization:** Intel® Neural Compressor supports an accuracy-driven automatic
+  tuning process for post-training static quantization, post-training dynamic
+  quantization, and quantization-aware training in PyTorch fx graph mode and
+  eager mode.
+
+*This tutorial mainly focuses on the quantization part. As for how to use Intel®
+Neural Compressor to do pruning and distillation, please refer to the corresponding
+documents in the Intel® Neural Compressor github repo.*
+
+Getting Started
+---------------
+
+Installation
+~~~~~~~~~~~~
+
+.. code:: bash
+
+   # install stable version from pip
+   pip install neural-compressor
+
+   # install nightly version from pip
+   pip install -i https://test.pypi.org/simple/ neural-compressor
+
+   # install stable version from conda
+   conda install neural-compressor -c conda-forge -c intel
+
+*Supported Python versions are 3.6, 3.7, 3.8, and 3.9.*
+
+Usages
+~~~~~~
+
+Minor code changes are required for users to get started with the Intel® Neural Compressor
+quantization API. Both PyTorch fx graph mode and eager mode are supported.
+
+Intel® Neural Compressor takes an FP32 model and a yaml configuration file as inputs.
+To construct the quantization process, users can either specify the below settings via
+the yaml configuration file or python APIs:
+
+1. Calibration Dataloader (Needed for static quantization)
+2. Evaluation Dataloader
+3. Evaluation Metric
+
+Intel® Neural Compressor supports some popular dataloaders and evaluation metrics. For
+how to configure them in the yaml configuration file, users can refer to `Built-in Datasets
+`_.
+
+If users want to use a self-developed dataloader or evaluation metric, Intel® Neural
+Compressor supports this via registration of a customized dataloader/metric in Python code.
+
+For the yaml configuration file format, please refer to the `yaml template
+`_.
+
+The code changes that are required for *Intel® Neural Compressor* are highlighted with
+comments in the line above.
+
+Model
+^^^^^
+
+In this tutorial, the LeNet model is used to demonstrate how to work with *Intel® Neural Compressor*.
+
+.. code-block:: python3
+
+   # main.py
+   import torch
+   import torch.nn as nn
+   import torch.nn.functional as F
+
+   # LeNet Model definition
+   class Net(nn.Module):
+       def __init__(self):
+           super(Net, self).__init__()
+           self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
+           self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
+           self.conv2_drop = nn.Dropout2d()
+           self.fc1 = nn.Linear(320, 50)
+           self.fc1_drop = nn.Dropout()
+           self.fc2 = nn.Linear(50, 10)
+
+       def forward(self, x):
+           x = F.relu(F.max_pool2d(self.conv1(x), 2))
+           x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
+           x = x.reshape(-1, 320)
+           x = F.relu(self.fc1(x))
+           x = self.fc1_drop(x)
+           x = self.fc2(x)
+           return F.log_softmax(x, dim=1)
+
+   model = Net()
+   model.load_state_dict(torch.load('./lenet_mnist_model.pth'))
+
+The pretrained model weight `lenet_mnist_model.pth` comes from
+`here `_.
+
+Accuracy driven quantization
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Intel® Neural Compressor supports accuracy-driven automatic tuning to generate the optimal
+int8 model which meets a predefined accuracy goal.
+
+Below is an example of how to quantize a simple network on PyTorch
+`FX graph mode `__ by auto-tuning.
+
+.. code-block:: yaml
+
+   # conf.yaml
+   model:
+     name: LeNet
+     framework: pytorch_fx
+
+   evaluation:
+     accuracy:
+       metric:
+         topk: 1
+
+   tuning:
+     accuracy_criterion:
+       relative: 0.01
+
+.. code-block:: python3
+
+   # main.py
+   model.eval()
+
+   from torchvision import datasets, transforms
+   test_loader = torch.utils.data.DataLoader(
+       datasets.MNIST('./data', train=False, download=True,
+                      transform=transforms.Compose([
+                          transforms.ToTensor(),
+                      ])),
+       batch_size=1)
+
+   # launch code for Intel® Neural Compressor
+   from neural_compressor.experimental import Quantization
+   quantizer = Quantization("./conf.yaml")
+   quantizer.model = model
+   quantizer.calib_dataloader = test_loader
+   quantizer.eval_dataloader = test_loader
+   q_model = quantizer()
+   q_model.save('./output')
+
+In the `conf.yaml` file, the built-in metric `top1` of Intel® Neural Compressor is specified as
+the evaluation method, and `1%` relative accuracy loss is set as the accuracy target for auto-tuning.
+Intel® Neural Compressor will traverse all possible quantization config combinations at the per-op
+level to find the optimal int8 model that reaches the predefined accuracy target.
+
+Besides those built-in metrics, Intel® Neural Compressor also supports customized metrics through
+Python code:
+
+.. code-block:: yaml
+
+   # conf.yaml
+   model:
+     name: LeNet
+     framework: pytorch_fx
+
+   tuning:
+     accuracy_criterion:
+       relative: 0.01
+
+.. code-block:: python3
+
+   # main.py
+   model.eval()
+
+   from torchvision import datasets, transforms
+   test_loader = torch.utils.data.DataLoader(
+       datasets.MNIST('./data', train=False, download=True,
+                      transform=transforms.Compose([
+                          transforms.ToTensor(),
+                      ])),
+       batch_size=1)
+
+   # define a customized metric
+   class Top1Metric(object):
+       def __init__(self):
+           self.correct = 0
+       def update(self, output, label):
+           pred = output.argmax(dim=1, keepdim=True)
+           self.correct += pred.eq(label.view_as(pred)).sum().item()
+       def reset(self):
+           self.correct = 0
+       def result(self):
+           return 100. * self.correct / len(test_loader.dataset)
+
+   # launch code for Intel® Neural Compressor
+   from neural_compressor.experimental import Quantization
+   quantizer = Quantization("./conf.yaml")
+   quantizer.model = model
+   quantizer.calib_dataloader = test_loader
+   quantizer.eval_dataloader = test_loader
+   quantizer.metric = Top1Metric()
+   q_model = quantizer()
+   q_model.save('./output')
+
+In the above example, a `class` which contains the `update()` and `result()` functions is implemented
+to record the per-mini-batch results and calculate the final accuracy at the end.
+
+Quantization aware training
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Besides post-training static quantization and post-training dynamic quantization, Intel® Neural
+Compressor supports quantization-aware training with an accuracy-driven automatic tuning mechanism.
+
+Below is an example of how to do quantization-aware training on a simple network on PyTorch
+`FX graph mode `__.
+
+.. code-block:: yaml
+
+   # conf.yaml
+   model:
+     name: LeNet
+     framework: pytorch_fx
+
+   quantization:
+     approach: quant_aware_training
+
+   evaluation:
+     accuracy:
+       metric:
+         topk: 1
+
+   tuning:
+     accuracy_criterion:
+       relative: 0.01
+
+.. code-block:: python3
+
+   # main.py
+   model.eval()
+
+   from torchvision import datasets, transforms
+   train_loader = torch.utils.data.DataLoader(
+       datasets.MNIST('./data', train=True, download=True,
+                      transform=transforms.Compose([
+                          transforms.ToTensor(),
+                          transforms.Normalize((0.1307,), (0.3081,))
+                      ])),
+       batch_size=64, shuffle=True)
+   test_loader = torch.utils.data.DataLoader(
+       datasets.MNIST('./data', train=False, download=True,
+                      transform=transforms.Compose([
+                          transforms.ToTensor(),
+                          transforms.Normalize((0.1307,), (0.3081,))
+                      ])),
+       batch_size=1)
+
+   import torch.optim as optim
+   optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.1)
+
+   def training_func(model):
+       model.train()
+       for epoch in range(1, 3):
+           for batch_idx, (data, target) in enumerate(train_loader):
+               optimizer.zero_grad()
+               output = model(data)
+               loss = F.nll_loss(output, target)
+               loss.backward()
+               optimizer.step()
+               print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
+                   epoch, batch_idx * len(data), len(train_loader.dataset),
+                   100. * batch_idx / len(train_loader), loss.item()))
+
+   # launch code for Intel® Neural Compressor
+   from neural_compressor.experimental import Quantization
+   quantizer = Quantization("./conf.yaml")
+   quantizer.model = model
+   quantizer.q_func = training_func
+   quantizer.eval_dataloader = test_loader
+   q_model = quantizer()
+   q_model.save('./output')
+
+Performance only quantization
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Intel® Neural Compressor supports directly yielding an int8 model with a dummy dataset for
+performance benchmarking purposes.
+
+Below is an example of how to quantize a simple network on PyTorch
+`FX graph mode `__ with a dummy dataset.
+
+.. code-block:: yaml
+
+   # conf.yaml
+   model:
+     name: lenet
+     framework: pytorch_fx
+
+.. code-block:: python3
+
+   # main.py
+   model.eval()
+
+   # launch code for Intel® Neural Compressor
+   from neural_compressor.experimental import Quantization, common
+   from neural_compressor.experimental.data.datasets.dummy_dataset import DummyDataset
+   quantizer = Quantization("./conf.yaml")
+   quantizer.model = model
+   quantizer.calib_dataloader = common.DataLoader(DummyDataset([(1, 1, 28, 28)]))
+   q_model = quantizer()
+   q_model.save('./output')
+
+Quantization outputs
+~~~~~~~~~~~~~~~~~~~~
+
+Users can see how many ops were quantized from the log printed by Intel® Neural Compressor,
+like below:
+
+::
+
+   2021-12-08 14:58:35 [INFO] |********Mixed Precision Statistics*******|
+   2021-12-08 14:58:35 [INFO] +------------------------+--------+-------+
+   2021-12-08 14:58:35 [INFO] |        Op Type         | Total  |  INT8 |
+   2021-12-08 14:58:35 [INFO] +------------------------+--------+-------+
+   2021-12-08 14:58:35 [INFO] |  quantize_per_tensor   |   2    |   2   |
+   2021-12-08 14:58:35 [INFO] |         Conv2d         |   2    |   2   |
+   2021-12-08 14:58:35 [INFO] |       max_pool2d       |   1    |   1   |
+   2021-12-08 14:58:35 [INFO] |          relu          |   1    |   1   |
+   2021-12-08 14:58:35 [INFO] |       dequantize       |   2    |   2   |
+   2021-12-08 14:58:35 [INFO] |       LinearReLU       |   1    |   1   |
+   2021-12-08 14:58:35 [INFO] |         Linear         |   1    |   1   |
+   2021-12-08 14:58:35 [INFO] +------------------------+--------+-------+
+
+The quantized model will be generated under the `./output` directory, which contains two files:
+
+1. best_configure.yaml
+2. best_model_weights.pt
+
+The first file contains the quantization configuration of each op; the second file contains
+the int8 weights, plus the zero-point and scale info of the activations.
+
+Deployment
+~~~~~~~~~~
+
+Users can use the code below to load the quantized model and then run inference or a
+performance benchmark.
+
+.. code-block:: python3
+
+   from neural_compressor.utils.pytorch import load
+   int8_model = load('./output', model)
+
+Tutorials
+---------
+
+Please visit the `Intel® Neural Compressor Github repo `_
+for more tutorials.
diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst
index 09a2bfde3..14a2942c6 100644
--- a/recipes_source/recipes_index.rst
+++ b/recipes_source/recipes_index.rst
@@ -253,6 +253,14 @@ Recipes are bite-sized, actionable examples of how to use specific Py
    :link: ../recipes/intel_extension_for_pytorch.html
    :tags: Model-Optimization
 
+.. Intel(R) Neural Compressor for PyTorch*
+.. customcarditem::
+   :header: Intel® Neural Compressor for PyTorch
+   :card_description: Ease-of-use quantization for PyTorch with Intel® Neural Compressor.
+   :image: ../_static/img/thumbnails/cropped/profiler.png
+   :link: ../recipes/intel_neural_compressor_for_pytorch.html
+   :tags: Quantization,Model-Optimization
+
 .. Distributed Training
 .. customcarditem::
    :header: Shard Optimizer States with ZeroRedundancyOptimizer
diff --git a/requirements.txt b/requirements.txt
index d924426fa..1b0fa1fc7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,7 +18,7 @@ awscli==1.16.35
 flask
 spacy==2.3.2
 ray[tune]
-
+tensorboard
 # PyTorch Theme
 -e git+git://github.com/9bow/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme