Commit d9eb63a

docs: switch links from PDF to abstract

Borda committed Jan 8, 2024
1 parent 37e0798 commit d9eb63a
Showing 9 changed files with 28 additions and 28 deletions.
@@ -523,7 +523,7 @@ def xavier_init(model):
#
# Thus, we see that we have an additional factor of 1/2 in the equation, so that our desired weight variance becomes $2/d_x$.
# This gives us the Kaiming initialization (see [He, K. et al.
# (2015)](https://arxiv.org/pdf/1502.01852.pdf)).
# (2015)](https://arxiv.org/abs/1502.01852)).
# Note that the Kaiming initialization does not use the harmonic mean between input and output size.
# In their paper (Section 2.2, Backward Propagation, last paragraph), they argue that using either $d_x$ or $d_y$ leads to stable gradients throughout the network, which then depend only on the overall input and output size of the network.
# Hence, we can use here only the input $d_x$:
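As a rough code sketch of this scheme (the `kaiming_init` helper below is illustrative, written in the same spirit as the `xavier_init` function named in the hunk header above, and assumes a model built only from linear layers):

```python
import math
import torch.nn as nn

def kaiming_init(model):
    # Fan-in Kaiming initialization: zero biases, weights drawn from a normal
    # distribution with variance 2 / d_x, where d_x is the layer's input size.
    for name, param in model.named_parameters():
        if name.endswith(".bias"):
            param.data.fill_(0)
        else:
            param.data.normal_(std=math.sqrt(2 / param.shape[1]))

kaiming_init(nn.Sequential(nn.Linear(128, 256), nn.ReLU(), nn.Linear(256, 10)))
```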
@@ -1095,7 +1095,7 @@ def comb_func(w1, w2):
# The short answer: no.
# Several papers report that in certain situations, SGD (with momentum) generalizes better, whereas Adam often tends to overfit [5,6].
# This is related to the idea of finding wider optima.
# For instance, see the illustration of different optima below (credit: [Keskar et al., 2017](https://arxiv.org/pdf/1609.04836.pdf)):
# For instance, see the illustration of different optima below (credit: [Keskar et al., 2017](https://arxiv.org/abs/1609.04836)):
#
# <center width="100%"><img src="flat_vs_sharp_minima.svg" width="500px"></center>
#
@@ -1125,7 +1125,7 @@ def comb_func(w1, w2):
# "Understanding the difficulty of training deep feedforward neural networks."
# Proceedings of the thirteenth international conference on artificial intelligence and statistics.
# 2010.
# [link](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf)
# [link](https://proceedings.mlr.press/v9/glorot10a)
#
# [2] He, Kaiming, et al.
# "Delving deep into rectifiers: Surpassing human-level performance on imagenet classification."
@@ -243,7 +243,7 @@ def configure_optimizers(self):
# We will support Adam or SGD as optimizers.
if self.hparams.optimizer_name == "Adam":
# AdamW is Adam with a correct implementation of weight decay (see here
# for details: https://arxiv.org/pdf/1711.05101.pdf)
# for details: https://arxiv.org/abs/1711.05101)
optimizer = optim.AdamW(self.parameters(), **self.hparams.optimizer_hparams)
elif self.hparams.optimizer_name == "SGD":
optimizer = optim.SGD(self.parameters(), **self.hparams.optimizer_hparams)
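For context, a standalone sketch of the two options (the hyperparameter values below are illustrative assumptions, not the ones used in this notebook):

```python
import torch.nn as nn
import torch.optim as optim

net = nn.Linear(8, 8)  # stand-in for the LightningModule's parameters
# optimizer_hparams could then be plain dictionaries such as these:
adamw = optim.AdamW(net.parameters(), lr=1e-3, weight_decay=1e-4)
sgd = optim.SGD(net.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4)
```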
@@ -869,8 +869,8 @@ def forward(self, x):
# One difference from the GoogleNet training is that we explicitly use SGD with momentum as the optimizer instead of Adam.
# Adam often leads to a slightly worse accuracy on plain, shallow ResNets.
# It is not 100% clear why Adam performs worse in this context, but one possible explanation is related to ResNet's loss surface.
# ResNet has been shown to produce smoother loss surfaces than networks without skip connection (see [Li et al., 2018](https://arxiv.org/pdf/1712.09913.pdf) for details).
# A possible visualization of the loss surface with/out skip connections is below (figure credit - [Li et al. ](https://arxiv.org/pdf/1712.09913.pdf)):
# ResNet has been shown to produce smoother loss surfaces than networks without skip connection (see [Li et al., 2018](https://arxiv.org/abs/1712.09913) for details).
# A possible visualization of the loss surface with/out skip connections is below (figure credit - [Li et al. ](https://arxiv.org/abs/1712.09913)):
#
# <center width="100%"><img src="resnet_loss_surface.png" style="display: block; margin-left: auto; margin-right: auto;" width="600px"/></center>
#
@@ -658,7 +658,7 @@ def forward(self, x):
# In fact, training a deep Transformer without learning rate warm-up can cause the model to diverge
# and reach much worse performance on both training and test data.
# Take for instance the following plot by [Liu et al.
# (2019)](https://arxiv.org/pdf/1908.03265.pdf) comparing Adam-vanilla (i.e. Adam without warm-up)
# (2019)](https://arxiv.org/abs/1908.03265) comparing Adam-vanilla (i.e. Adam without warm-up)
# vs Adam with a warm-up:
#
# <center width="100%"><img src="warmup_loss_plot.svg" width="350px"></center>
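A minimal sketch of such a linear warm-up on top of Adam (the `LambdaLR`-based schedule and the `warmup_steps` value are illustrative assumptions, not the scheduler defined later in this notebook):

```python
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(16, 16)  # stand-in for the Transformer
optimizer = optim.Adam(model.parameters(), lr=1e-3)

warmup_steps = 100  # illustrative value
# Scale the learning rate linearly from ~0 up to its target value.
scheduler = optim.lr_scheduler.LambdaLR(
    optimizer, lr_lambda=lambda step: min(1.0, (step + 1) / warmup_steps)
)

for step in range(5):
    optimizer.step()   # gradients would be computed before this in a real loop
    scheduler.step()   # increase the learning rate once per iteration
    print(step, scheduler.get_last_lr())
```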
2 changes: 1 addition & 1 deletion course_UvA-DL/06-graph-neural-networks/GNN_overview.py
@@ -744,7 +744,7 @@ def print_results(result_dict):
# Tutorials and papers for this topic include:
#
# * [PyTorch Geometric example](https://github.com/rusty1s/pytorch_geometric/blob/master/examples/link_pred.py)
# * [Graph Neural Networks: A Review of Methods and Applications](https://arxiv.org/pdf/1812.08434.pdf), Zhou et al.
# * [Graph Neural Networks: A Review of Methods and Applications](https://arxiv.org/abs/1812.08434), Zhou et al.
# 2019
# * [Link Prediction Based on Graph Neural Networks](https://papers.nips.cc/paper/2018/file/53f0d7c537d99b3824f0f99d62ea2428-Paper.pdf), Zhang and Chen, 2018.

2 changes: 1 addition & 1 deletion course_UvA-DL/09-normalizing-flows/NF_image_modeling.py
@@ -1384,7 +1384,7 @@ def visualize_dequant_distribution(model: ImageFlow, imgs: Tensor, title: str =
# and we have the guarantee that every possible input $x$ has a corresponding latent vector $z$.
# However, flows can also be applied beyond continuous inputs and images, allowing us to exploit
# structure in the latent space, e.g. on graphs for the task of molecule generation [6].
# Recent advances in [Neural ODEs](https://arxiv.org/pdf/1806.07366.pdf) allow a flow with infinite number of layers,
# Recent advances in [Neural ODEs](https://arxiv.org/abs/1806.07366) allow a flow with infinite number of layers,
# called Continuous Normalizing Flows, whose potential is yet to be fully explored.
# Overall, normalizing flows are an exciting research area that will continue to develop over the next couple of years.

@@ -18,10 +18,10 @@
# For instance, in autoregressive models, we cannot interpolate between two images because of the lack of a latent representation.
# We will explore and discuss these benefits and drawbacks alongside our implementation.
#
# Our implementation will focus on the [PixelCNN](https://arxiv.org/pdf/1606.05328.pdf) [2] model which has been discussed in detail in the lecture.
# Our implementation will focus on the [PixelCNN](https://arxiv.org/abs/1606.05328) [2] model which has been discussed in detail in the lecture.
# Most current SOTA models use PixelCNN as their fundamental architecture,
# and various additions have been proposed to improve the performance
# (e.g. [PixelCNN++](https://arxiv.org/pdf/1701.05517.pdf) and [PixelSNAIL](http://proceedings.mlr.press/v80/chen18h/chen18h.pdf)).
# (e.g. [PixelCNN++](https://arxiv.org/abs/1701.05517) and [PixelSNAIL](http://proceedings.mlr.press/v80/chen18h/chen18h.pdf)).
# Hence, implementing PixelCNN is a good starting point for our short tutorial.
#
# First of all, we need to import our standard libraries. Similarly as in
@@ -173,7 +173,7 @@ def show_imgs(imgs):
# If we now want to apply this to our convolutions, we need to ensure that the prediction of pixel 1
# is not influenced by its own "true" input, nor by any pixels to its right or in any lower row.
# For convolutions, this means setting to zero those entries of the weight matrix that take pixels to the right and below into account.
# As an example for a 5x5 kernel, see a mask below (figure credit - [Aaron van den Oord](https://arxiv.org/pdf/1606.05328.pdf)):
# As an example for a 5x5 kernel, see a mask below (figure credit - [Aaron van den Oord](https://arxiv.org/abs/1606.05328)):
#
# <center width="100%" style="padding: 10px"><img src="masked_convolution.svg" width="150px"></center>
#
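As a small sketch, one way such a 5x5 mask could be built with PyTorch (not the notebook's own code):

```python
import torch

kernel_size = 5
mask = torch.ones(kernel_size, kernel_size)
mask[kernel_size // 2 + 1:, :] = 0                  # zero out all rows below the center
mask[kernel_size // 2, kernel_size // 2 + 1:] = 0   # and everything right of the center
# For the very first layer, the center pixel itself must be masked as well,
# since a pixel may not see its own "true" value:
first_layer_mask = mask.clone()
first_layer_mask[kernel_size // 2, kernel_size // 2] = 0
print(mask)
```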
@@ -216,10 +216,10 @@ def forward(self, x):
#
# To build our own autoregressive image model, we could simply stack a few masked convolutions on top of each other.
# This was actually the case for the original PixelCNN model, discussed in the paper
# [Pixel Recurrent Neural Networks](https://arxiv.org/pdf/1601.06759.pdf), but this leads to a considerable issue.
# [Pixel Recurrent Neural Networks](https://arxiv.org/abs/1601.06759), but this leads to a considerable issue.
# When sequentially applying a couple of masked convolutions, the receptive field of a pixel
# turns out to have a "blind spot" on the upper right side, as shown in the figure below
# (figure credit - [Aaron van den Oord et al. ](https://arxiv.org/pdf/1606.05328.pdf)):
# (figure credit - [Aaron van den Oord et al. ](https://arxiv.org/abs/1606.05328)):
#
# <center width="100%" style="padding: 10px"><img src="pixelcnn_blind_spot.svg" width="275px"></center>
#
@@ -445,7 +445,7 @@ def show_center_recep_field(img, out):
# For visualizing the receptive field, we assumed a very simplified stack of vertical and horizontal convolutions.
# Obviously, there are more sophisticated ways of doing it, and PixelCNN uses gated convolutions for this.
# Specifically, the Gated Convolution block in PixelCNN looks as follows
# (figure credit - [Aaron van den Oord et al. ](https://arxiv.org/pdf/1606.05328.pdf)):
# (figure credit - [Aaron van den Oord et al. ](https://arxiv.org/abs/1606.05328)):
#
# <center width="100%"><img src="PixelCNN_GatedConv.svg" width="700px" style="padding: 15px"/></center>
#
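A minimal sketch of the gating nonlinearity alone, ignoring the masking and the coupling between the vertical and horizontal stacks shown in the figure (the `GatedActivation` name is made up for illustration):

```python
import torch
import torch.nn as nn

class GatedActivation(nn.Module):
    """One convolution produces 2x the channels, which are split into a tanh
    "feature" half and a sigmoid "gate" half and multiplied elementwise."""

    def __init__(self, channels, kernel_size=3):
        super().__init__()
        self.conv = nn.Conv2d(channels, 2 * channels, kernel_size, padding=kernel_size // 2)

    def forward(self, x):
        val, gate = self.conv(x).chunk(2, dim=1)
        return torch.tanh(val) * torch.sigmoid(gate)
```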
@@ -506,7 +506,7 @@ def forward(self, v_stack, h_stack):
# The architecture consists of multiple stacked GatedMaskedConv blocks, where we add an additional dilation factor to a few convolutions.
# This is used to increase the receptive field of the model and allows it to take a larger context into account during generation.
# As a reminder, dilation in a convolution looks as follows
# (figure credit - [Vincent Dumoulin and Francesco Visin](https://arxiv.org/pdf/1603.07285.pdf)):
# (figure credit - [Vincent Dumoulin and Francesco Visin](https://arxiv.org/abs/1603.07285)):
#
# <center width="100%"><img src="https://raw.githubusercontent.com/vdumoulin/conv_arithmetic/master/gif/dilation.gif" width="250px"></center>
#
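As a quick illustration of dilation in PyTorch (not code from this notebook):

```python
import torch
import torch.nn as nn

# A 3x3 kernel with dilation=2 covers a 5x5 input area with only 9 weights,
# so stacking dilated convolutions grows the receptive field quickly.
conv = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3, dilation=2, padding=2)
x = torch.randn(1, 1, 28, 28)
print(conv(x).shape)  # padding=2 keeps the 28x28 spatial size
```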
@@ -655,7 +655,7 @@ def test_step(self, batch, batch_idx):
# %% [markdown]
# The visualization shows that for predicting any pixel, we can take almost half of the image into account.
# However, keep in mind that this is the "theoretical" receptive field and not necessarily
# the [effective receptive field](https://arxiv.org/pdf/1701.04128.pdf), which is usually much smaller.
# the [effective receptive field](https://arxiv.org/abs/1701.04128), which is usually much smaller.
# For a stronger model, we should therefore try to increase the receptive
# field even further. Especially, for the pixel on the bottom right, the
# very last pixel, we would be allowed to take into account the whole
@@ -869,7 +869,7 @@ def autocomplete_image(img):
# Interestingly, the pixel values 64, 128 and 191 also stand out, which is likely due to the quantization used during the creation of the dataset.
# For RGB images, we would also see two peaks around 0 and 255,
# but the values in between would be much more frequent than in MNIST
# (see Figure 1 in the [PixelCNN++](https://arxiv.org/pdf/1701.05517.pdf) for a visualization on CIFAR10).
# (see Figure 1 in the [PixelCNN++](https://arxiv.org/abs/1701.05517) for a visualization on CIFAR10).
#
# Next, we can visualize the distribution our model predicts (on average):

2 changes: 1 addition & 1 deletion course_UvA-DL/11-vision-transformer/Vision_Transformer.py
@@ -513,7 +513,7 @@ def train_model(**kwargs):
# Dosovitskiy, Alexey, et al.
# "An image is worth 16x16 words: Transformers for image recognition at scale."
# International Conference on Representation Learning (2021).
# [link](https://arxiv.org/pdf/2010.11929.pdf)
# [link](https://arxiv.org/abs/2010.11929)
#
# Chen, Xiangning, et al.
# "When Vision Transformers Outperform ResNets without Pretraining or Strong Data Augmentations."
6 changes: 3 additions & 3 deletions course_UvA-DL/12-meta-learning/Meta_Learning.py
@@ -1,6 +1,6 @@
# %% [markdown]
# <div class="center-wrapper"><div class="video-wrapper"><iframe src="https://www.youtube.com/embed/035rkmT8FfE" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe></div></div>
# Meta-Learning offers solutions to these situations, and we will discuss three popular algorithms: __Prototypical Networks__ ([Snell et al., 2017](https://arxiv.org/pdf/1703.05175.pdf)), __Model-Agnostic Meta-Learning / MAML__ ([Finn et al., 2017](http://proceedings.mlr.press/v70/finn17a.html)), and __Proto-MAML__ ([Triantafillou et al., 2020](https://openreview.net/pdf?id=rkgAGAVKPr)).
# Meta-Learning offers solutions to these situations, and we will discuss three popular algorithms: __Prototypical Networks__ ([Snell et al., 2017](https://arxiv.org/abs/1703.05175)), __Model-Agnostic Meta-Learning / MAML__ ([Finn et al., 2017](http://proceedings.mlr.press/v70/finn17a.html)), and __Proto-MAML__ ([Triantafillou et al., 2020](https://openreview.net/pdf?id=rkgAGAVKPr)).
# We will focus on the task of few-shot classification where the training and test set have distinct sets of classes.
# For instance, we would train the model on the binary classifications of cats-vs-birds and flowers-vs-bikes, but at test time, the model would need to learn, from 4 examples of each, the difference between dogs and otters, two classes it has not seen during training (Figure credit - [Lilian Weng](https://lilianweng.github.io/lil-log/2018/11/30/meta-learning.html)).
#
@@ -417,7 +417,7 @@ def split_batch(imgs, targets):
# $$\mathbf{v}_c=\frac{1}{|S_c|}\sum_{(\mathbf{x}_i,y_i)\in S_c}f_{\theta}(\mathbf{x}_i)$$
#
# where $S_c$ is the part of the support set $S$ for which $y_i=c$, and $\mathbf{v}_c$ represents the _prototype_ of class $c$.
# The prototype calculation is visualized below for a 2-dimensional feature space and 3 classes (Figure credit - [Snell et al.](https://arxiv.org/pdf/1703.05175.pdf)).
# The prototype calculation is visualized below for a 2-dimensional feature space and 3 classes (Figure credit - [Snell et al.](https://arxiv.org/abs/1703.05175)).
# The colored dots represent encoded support elements with color-corresponding class label, and the black dots next to the class label are the averaged prototypes.
#
# <center width="100%"><img src="protonet_classification.svg" width="300px"></center>
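A minimal sketch of this prototype computation (naming and batching are illustrative, not necessarily the notebook's implementation):

```python
import torch

def compute_prototypes(features, targets):
    # features: [N, d] support embeddings f_theta(x_i); targets: [N] labels y_i.
    # The prototype v_c of class c is the mean embedding of its support examples.
    classes = torch.unique(targets)
    prototypes = torch.stack([features[targets == c].mean(dim=0) for c in classes])
    return prototypes, classes

feats = torch.randn(12, 64)          # 12 encoded support images
labels = torch.randint(0, 3, (12,))  # up to 3 classes
protos, classes = compute_prototypes(feats, labels)
print(protos.shape)                  # [number of classes present, 64]
```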
@@ -1324,7 +1324,7 @@ def test_protomaml(model, dataset, k_shot=4):
# [1] Snell, Jake, Kevin Swersky, and Richard S. Zemel.
# "Prototypical networks for few-shot learning."
# NeurIPS 2017.
# ([link](https://arxiv.org/pdf/1703.05175.pdf))
# ([link](https://arxiv.org/abs/1703.05175))
#
# [2] Chelsea Finn, Pieter Abbeel, Sergey Levine.
# "Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks."
12 changes: 6 additions & 6 deletions lightning_examples/finetuning-scheduler/finetuning-scheduler.py
@@ -609,18 +609,18 @@ def train() -> None:
# %% [markdown]
# ## Footnotes
#
# - [Howard, J., & Ruder, S. (2018)](https://arxiv.org/pdf/1801.06146.pdf). Fine-tuned Language
# - [Howard, J., & Ruder, S. (2018)](https://arxiv.org/abs/1801.06146). Fine-tuned Language
# Models for Text Classification. ArXiv, abs/1801.06146. [↩](#Scheduled-Fine-Tuning-with-the-Fine-Tuning-Scheduler-Extension)
# - [Chronopoulou, A., Baziotis, C., & Potamianos, A. (2019)](https://arxiv.org/pdf/1902.10547.pdf).
# - [Chronopoulou, A., Baziotis, C., & Potamianos, A. (2019)](https://arxiv.org/abs/1902.10547).
# An embarrassingly simple approach for transfer learning from pretrained language models. arXiv
# preprint arXiv:1902.10547. [↩](#Scheduled-Fine-Tuning-with-the-Fine-Tuning-Scheduler-Extension)
# - [Peters, M. E., Ruder, S., & Smith, N. A. (2019)](https://arxiv.org/pdf/1903.05987.pdf). To tune or not to
# - [Peters, M. E., Ruder, S., & Smith, N. A. (2019)](https://arxiv.org/abs/1903.05987). To tune or not to
# tune? adapting pretrained representations to diverse tasks. arXiv preprint arXiv:1903.05987. [↩](#Scheduled-Fine-Tuning-with-the-Fine-Tuning-Scheduler-Extension)
# - [Sivaprasad, P. T., Mai, F., Vogels, T., Jaggi, M., & Fleuret, F. (2020)](https://arxiv.org/pdf/1910.11758.pdf).
# - [Sivaprasad, P. T., Mai, F., Vogels, T., Jaggi, M., & Fleuret, F. (2020)](https://arxiv.org/abs/1910.11758).
# Optimizer benchmarking needs to account for hyperparameter tuning. In International Conference on Machine Learning
# (pp. 9036-9045). PMLR. [↩](#Optimizer-Configuration)
# - [Mosbach, M., Andriushchenko, M., & Klakow, D. (2020)](https://arxiv.org/pdf/2006.04884.pdf). On the stability of
# - [Mosbach, M., Andriushchenko, M., & Klakow, D. (2020)](https://arxiv.org/abs/2006.04884). On the stability of
# fine-tuning bert: Misconceptions, explanations, and strong baselines. arXiv preprint arXiv:2006.04884. [↩](#Optimizer-Configuration)
# - [Loshchilov, I., & Hutter, F. (2016)](https://arxiv.org/pdf/1608.03983.pdf). Sgdr: Stochastic gradient descent with
# - [Loshchilov, I., & Hutter, F. (2016)](https://arxiv.org/abs/1608.03983). Sgdr: Stochastic gradient descent with
# warm restarts. arXiv preprint arXiv:1608.03983. [↩](#LR-Scheduler-Configuration)
#
