diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md index 10fe1f03f0f..78355ffa11f 100644 --- a/docs/developer-guide/operators.md +++ b/docs/developer-guide/operators.md @@ -1,168 +1,177 @@ - -* [AbsVal](#absval) -* [ArgMax](#argmax) -* [BatchNorm](#batchnorm) -* [Bias](#bias) -* [BinaryOp](#binaryop) -* [BNLL](#bnll) -* [Cast](#cast) -* [CELU](#celu) -* [Clip](#clip) -* [Concat](#concat) -* [Convolution](#convolution) -* [Convolution1D](#convolution1d) -* [Convolution3D](#convolution3d) -* [ConvolutionDepthWise](#convolutiondepthwise) -* [ConvolutionDepthWise1D](#convolutiondepthwise1d) -* [ConvolutionDepthWise3D](#convolutiondepthwise3d) -* [CopyTo](#copyto) -* [Crop](#crop) -* [CumulativeSum](#cumulativesum) -* [Deconvolution](#deconvolution) -* [Deconvolution1D](#deconvolution1d) -* [Deconvolution3D](#deconvolution3d) -* [DeconvolutionDepthWise](#deconvolutiondepthwise) -* [DeconvolutionDepthWise1D](#deconvolutiondepthwise1d) -* [DeconvolutionDepthWise3D](#deconvolutiondepthwise3d) -* [DeformableConv2D](#deformableconv2d) -* [Dequantize](#dequantize) -* [Diag](#diag) -* [Dropout](#dropout) -* [Eltwise](#eltwise) -* [ELU](#elu) -* [Embed](#embed) -* [Exp](#exp) -* [Flatten](#flatten) -* [Fold](#fold) -* [GELU](#gelu) -* [GLU](#glu) -* [Gemm](#gemm) -* [GridSample](#gridsample) -* [GroupNorm](#groupnorm) -* [GRU](#gru) -* [HardSigmoid](#hardsigmoid) -* [HardSwish](#hardswish) -* [InnerProduct](#innerproduct) -* [Input](#input) -* [InstanceNorm](#instancenorm) -* [Interp](#interp) -* [InverseSpectrogram](#inversespectrogram) -* [LayerNorm](#layernorm) -* [Log](#log) -* [LRN](#lrn) -* [LSTM](#lstm) -* [MemoryData](#memorydata) -* [Mish](#mish) -* [MultiHeadAttention](#multiheadattention) -* [MVN](#mvn) -* [Noop](#noop) -* [Normalize](#normalize) -* [Packing](#packing) -* [Padding](#padding) -* [Permute](#permute) -* [PixelShuffle](#pixelshuffle) -* [Pooling](#pooling) -* [Pooling1D](#pooling1d) -* [Pooling3D](#pooling3d) -* [Power](#power) -* [PReLU](#prelu) -* [Quantize](#quantize) -* [Reduction](#reduction) -* [ReLU](#relu) -* [Reorg](#reorg) -* [Requantize](#requantize) -* [Reshape](#reshape) -* [RMSNorm](#rmsnorm) -* [RNN](#rnn) -* [Scale](#scale) -* [SELU](#selu) -* [Shrink](#shrink) -* [ShuffleChannel](#shufflechannel) -* [Sigmoid](#sigmoid) -* [Slice](#slice) -* [Softmax](#softmax) -* [Softplus](#softplus) -* [Spectrogram](#spectrogram) -* [Split](#split) -* [Swish](#swish) -* [TanH](#tanh) -* [Threshold](#threshold) -* [Tile](#tile) -* [UnaryOp](#unaryop) -* [Unfold](#unfold) +- [AbsVal](#absval) +- [ArgMax](#argmax) +- [BatchNorm](#batchnorm) +- [Bias](#bias) +- [BinaryOp](#binaryop) +- [BNLL](#bnll) +- [Cast](#cast) +- [CELU](#celu) +- [Clip](#clip) +- [Concat](#concat) +- [Convolution](#convolution) +- [Convolution1D](#convolution1d) +- [Convolution3D](#convolution3d) +- [ConvolutionDepthWise](#convolutiondepthwise) +- [ConvolutionDepthWise1D](#convolutiondepthwise1d) +- [ConvolutionDepthWise3D](#convolutiondepthwise3d) +- [CopyTo](#copyto) +- [Crop](#crop) +- [CumulativeSum](#cumulativesum) +- [Deconvolution](#deconvolution) +- [Deconvolution1D](#deconvolution1d) +- [Deconvolution3D](#deconvolution3d) +- [DeconvolutionDepthWise](#deconvolutiondepthwise) +- [DeconvolutionDepthWise1D](#deconvolutiondepthwise1d) +- [DeconvolutionDepthWise3D](#deconvolutiondepthwise3d) +- [DeformableConv2D](#deformableconv2d) +- [Dequantize](#dequantize) +- [Diag](#diag) +- [Dropout](#dropout) +- [Eltwise](#eltwise) +- [ELU](#elu) +- [Embed](#embed) +- [Exp](#exp) +- 
[Flatten](#flatten) +- [Fold](#fold) +- [GELU](#gelu) +- [GLU](#glu) +- [Gemm](#gemm) +- [GridSample](#gridsample) +- [GroupNorm](#groupnorm) +- [GRU](#gru) +- [HardSigmoid](#hardsigmoid) +- [HardSwish](#hardswish) +- [InnerProduct](#innerproduct) +- [Input](#input) +- [InstanceNorm](#instancenorm) +- [Interp](#interp) +- [InverseSpectrogram](#inversespectrogram) +- [LayerNorm](#layernorm) +- [Log](#log) +- [LRN](#lrn) +- [LSTM](#lstm) +- [MemoryData](#memorydata) +- [Mish](#mish) +- [MultiHeadAttention](#multiheadattention) +- [MVN](#mvn) +- [Noop](#noop) +- [Normalize](#normalize) +- [Packing](#packing) +- [Padding](#padding) +- [Permute](#permute) +- [PixelShuffle](#pixelshuffle) +- [Pooling](#pooling) +- [Pooling1D](#pooling1d) +- [Pooling3D](#pooling3d) +- [Power](#power) +- [PReLU](#prelu) +- [Quantize](#quantize) +- [Reduction](#reduction) +- [ReLU](#relu) +- [Reorg](#reorg) +- [Requantize](#requantize) +- [Reshape](#reshape) +- [RMSNorm](#rmsnorm) +- [RNN](#rnn) +- [Scale](#scale) +- [SELU](#selu) +- [Shrink](#shrink) +- [ShuffleChannel](#shufflechannel) +- [Sigmoid](#sigmoid) +- [Slice](#slice) +- [Softmax](#softmax) +- [Softplus](#softplus) +- [Spectrogram](#spectrogram) +- [Split](#split) +- [Swish](#swish) +- [TanH](#tanh) +- [Threshold](#threshold) +- [TopK](#topk) +- [Tile](#tile) +- [UnaryOp](#unaryop) +- [Unfold](#unfold) # AbsVal + ``` y = abs(x) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # ArgMax + ``` y = argmax(x, out_max_val, topk) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | out_max_val | int | 0 | | -| 1 | topk | int | 1 | | +| param id | name | type | default | description | +| -------- | ----------- | ---- | ------- | ----------- | +| 0 | out_max_val | int | 0 | | +| 1 | topk | int | 1 | | # BatchNorm + ``` y = (x - mean) / sqrt(var + eps) * slope + bias ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | channels | int | 0 | | -| 1 | eps | float | 0.f | | +| param id | name | type | default | description | +| -------- | -------- | ----- | ------- | ----------- | +| 0 | channels | int | 0 | | +| 1 | eps | float | 0.f | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| slope_data | float | [channels] | -| mean_data | float | [channels] | -| var_data | float | [channels] | -| bias_data | float | [channels] | +| weight | type | shape | +| ---------- | ----- | ---------- | +| slope_data | float | [channels] | +| mean_data | float | [channels] | +| var_data | float | [channels] | +| bias_data | float | [channels] | # Bias + ``` y = x + bias ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | bias_data_size| int | 0 | | +| param id | name | type | default | description | +| -------- | -------------- | ---- | ------- | ----------- | +| 0 | bias_data_size | int | 0 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| bias_data | float | [channels] | +| weight | type | shape | +| --------- | ----- | ---------- | +| bias_data | float | [channels] | # BinaryOp - This operation is used for binary computation, and the calculation rule depends on the 
[broadcasting rule](https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting). + +This operation is used for binary computation, and the calculation rule depends on the [broadcasting rule](https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting). + ``` C = binaryop(A, B) ``` + if with_scalar = 1: + - one_blob_only - support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | op_type | int | 0 | Operation type as follows | -| 1 | with_scalar | int | 0 | with_scalar=0 B is a matrix, with_scalar=1 B is a scalar | -| 2 | b | float | 0.f | When B is a scalar, B = b | +| param id | name | type | default | description | +| -------- | ----------- | ----- | ------- | -------------------------------------------------------- | +| 0 | op_type | int | 0 | Operation type as follows | +| 1 | with_scalar | int | 0 | with_scalar=0 B is a matrix, with_scalar=1 B is a scalar | +| 2 | b | float | 0.f | When B is a scalar, B = b | Operation type: + - 0 = ADD - 1 = SUB - 2 = MUL @@ -177,28 +186,31 @@ Operation type: - 11 = RATAN2 # BNLL + ``` y = log(1 + e^(-x)) , x > 0 y = log(1 + e^x), x < 0 ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # Cast + ``` y = cast(x) ``` -* one_blob_only -* support_packing +- one_blob_only +- support_packing -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | type_from | int | 0 | | -| 1 | type_to | int | 0 | | +| param id | name | type | default | description | +| -------- | --------- | ---- | ------- | ----------- | +| 0 | type_from | int | 0 | | +| 1 | type_to | int | 0 | | Element type: + - 0 = auto - 1 = float32 - 2 = float16 @@ -206,293 +218,304 @@ Element type: - 4 = bfloat16 # CELU + ``` if x < 0 y = (exp(x / alpha) - 1.f) * alpha else y = x ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | alpha | float | 1.f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | alpha | float | 1.f | | # Clip + ``` y = clamp(x, min, max) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | min | float | -FLT_MAX | | -| 1 | max | float | FLT_MAX | | +| param id | name | type | default | description | +| -------- | ---- | ----- | -------- | ----------- | +| 0 | min | float | -FLT_MAX | | +| 1 | max | float | FLT_MAX | | # Concat + ``` y = concat(x0, x1, x2, ...) 
by axis ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | axis | int | 0 | | +| param id | name | type | default | description | +| -------- | ---- | ---- | ------- | ----------- | +| 0 | axis | int | 0 | | # Convolution + ``` x2 = pad(x, pads, pad_value) x3 = conv(x2, weight, kernel, stride, dilation) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 8 | int8_scale_term| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 18 | pad_value | float | 0.f | | -| 19 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] | -| bias_data | float | [num_output] | -| weight_data_int8_scales| float | [num_output] | -| bottom_blob_int8_scales| float | [1] | -| top_blob_int8_scales| float | [1] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 8 | int8_scale_term | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 18 | pad_value | float | 0.f | | +| 19 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------------------- | --------------- | ------------------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] | +| bias_data | float | [num_output] | +| weight_data_int8_scales | float | [num_output] | +| bottom_blob_int8_scales | float | [1] | +| top_blob_int8_scales | float | [1] | # Convolution1D + ``` x2 = pad(x, pads, pad_value) x3 = conv1d(x2, weight, kernel, stride, dilation) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 15 | pad_right | int | pad_left | | -| 18 | pad_value | float | 0.f | | -| 19 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, 
num_input, num_output] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | -------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 15 | pad_right | int | pad_left | | +| 18 | pad_value | float | 0.f | | +| 19 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | --------------- | --------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, num_input, num_output] | +| bias_data | float | [num_output] | # Convolution3D + ``` x2 = pad(x, pads, pad_value) x3 = conv3d(x2, weight, kernel, stride, dilation) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 17 | pad_behind | int | pad_front | | -| 18 | pad_value | float | 0.f | | -| 21 | kernel_d | int | kernel_w | | -| 22 | dilation_d | int | dilation_w | | -| 23 | stride_d | int | stride_w | | -| 24 | pad_front | int | pad_left | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, kernel_h, kernel_d, num_input, num_output] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 17 | pad_behind | int | pad_front | | +| 18 | pad_value | float | 0.f | | +| 21 | kernel_d | int | kernel_w | | +| 22 | dilation_d | int | dilation_w | | +| 23 | stride_d | int | stride_w | | +| 24 | pad_front | int | pad_left | | + +| weight | type | shape | +| ----------- | --------------- | ----------------------------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, kernel_h, kernel_d, num_input, num_output] | +| bias_data | float | [num_output] | # ConvolutionDepthWise + ``` x2 = pad(x, pads, pad_value) x3 = conv(x2, weight, kernel, stride, dilation, group) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | 
----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 8 | int8_scale_term| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 18 | pad_value | float | 0.f | | -| 19 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | -| weight_data_int8_scales| float | [group] | -| bottom_blob_int8_scales| float | [1] | -| top_blob_int8_scales| float | [1] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 8 | int8_scale_term | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 18 | pad_value | float | 0.f | | +| 19 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------------------- | --------------- | ------------------------------------------------------------------ | +| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | +| weight_data_int8_scales | float | [group] | +| bottom_blob_int8_scales | float | [1] | +| top_blob_int8_scales | float | [1] | # ConvolutionDepthWise1D + ``` x2 = pad(x, pads, pad_value) x3 = conv1d(x2, weight, kernel, stride, dilation, group) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 15 | pad_right | int | pad_left | | -| 18 | pad_value | float | 0.f | | -| 19 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | -------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | 
+| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 15 | pad_right | int | pad_left | | +| 18 | pad_value | float | 0.f | | +| 19 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | --------------- | -------------------------------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | # ConvolutionDepthWise3D + ``` x2 = pad(x, pads, pad_value) x3 = conv3d(x2, weight, kernel, stride, dilation, group) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 17 | pad_behind | int | pad_front | | -| 18 | pad_value | float | 0.f | | -| 21 | kernel_d | int | kernel_w | | -| 22 | dilation_d | int | dilation_w | | -| 23 | stride_d | int | stride_w | | -| 24 | pad_front | int | pad_left | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, kernel_h, kernel_d, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 17 | pad_behind | int | pad_front | | +| 18 | pad_value | float | 0.f | | +| 21 | kernel_d | int | kernel_w | | +| 22 | dilation_d | int | dilation_w | | +| 23 | stride_d | int | stride_w | | +| 24 | pad_front | int | pad_left | | + +| weight | type | shape | +| ----------- | --------------- | ---------------------------------------------------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, kernel_h, kernel_d, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | # CopyTo + ``` self[offset] = src ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | woffset | int | 0 | | -| 1 | hoffset | int | 0 | | -| 13 | doffset | int | 0 | | -| 2 | coffset | int | 0 | | -| 9 | starts | array | [ ] | | -| 11 | axes | array | [ ] | | +| param id | name | type | default | description | +| -------- | ------- | ----- | ------- | 
----------- | +| 0 | woffset | int | 0 | | +| 1 | hoffset | int | 0 | | +| 13 | doffset | int | 0 | | +| 2 | coffset | int | 0 | | +| 9 | starts | array | [ ] | | +| 11 | axes | array | [ ] | | # Crop + ``` y = crop(x) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | woffset | int | 0 | | -| 1 | hoffset | int | 0 | | -| 13 | doffset | int | 0 | | -| 2 | coffset | int | 0 | | -| 3 | outw | int | 0 | | -| 4 | outh | int | 0 | | -| 14 | outd | int | 0 | | -| 5 | outc | int | 0 | | -| 6 | woffset2 | int | 0 | | -| 7 | hoffset2 | int | 0 | | -| 15 | doffset2 | int | 0 | | -| 8 | coffset2 | int | 0 | | -| 9 | starts | array | [ ] | | -| 10 | ends | array | [ ] | | -| 11 | axes | array | [ ] | | +- one_blob_only + +| param id | name | type | default | description | +| -------- | -------- | ----- | ------- | ----------- | +| 0 | woffset | int | 0 | | +| 1 | hoffset | int | 0 | | +| 13 | doffset | int | 0 | | +| 2 | coffset | int | 0 | | +| 3 | outw | int | 0 | | +| 4 | outh | int | 0 | | +| 14 | outd | int | 0 | | +| 5 | outc | int | 0 | | +| 6 | woffset2 | int | 0 | | +| 7 | hoffset2 | int | 0 | | +| 15 | doffset2 | int | 0 | | +| 8 | coffset2 | int | 0 | | +| 9 | starts | array | [ ] | | +| 10 | ends | array | [ ] | | +| 11 | axes | array | [ ] | | # CumulativeSum @@ -500,408 +523,425 @@ If axis < 0, we use axis = x.dims + axis It implements https://pytorch.org/docs/stable/generated/torch.cumsum.html -* one_blob_only -* support_inplace - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | axis | int | 0 | | +- one_blob_only +- support_inplace +| param id | name | type | default | description | +| -------- | ---- | ---- | ------- | ----------- | +| 0 | axis | int | 0 | | # Deconvolution + ``` x2 = deconv(x, weight, kernel, stride, dilation) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 18 | output_pad_right| int | 0 | | -| 19 | output_pad_bottom| int | output_pad_right | | -| 20 | output_w | int | 0 | | -| 21 | output_h | int | output_w | | -| 28 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, kernel_h, num_input, num_output] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | 
kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 18 | output_pad_right | int | 0 | | +| 19 | output_pad_bottom | int | output_pad_right | | +| 20 | output_w | int | 0 | | +| 21 | output_h | int | output_w | | +| 28 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | ---------- | ------------------------------------------- | +| weight_data | float/fp16 | [kernel_w, kernel_h, num_input, num_output] | +| bias_data | float | [num_output] | # Deconvolution1D + ``` x2 = deconv1d(x, weight, kernel, stride, dilation) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 15 | pad_right | int | pad_left | | -| 18 | output_pad_right| int | 0 | | -| 20 | output_w | int | 0 | | -| 28 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, num_input, num_output] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | -------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 15 | pad_right | int | pad_left | | +| 18 | output_pad_right | int | 0 | | +| 20 | output_w | int | 0 | | +| 28 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | ---------- | --------------------------------- | +| weight_data | float/fp16 | [kernel_w, num_input, num_output] | +| bias_data | float | [num_output] | # Deconvolution3D + ``` x2 = deconv3d(x, weight, kernel, stride, dilation) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 17 | pad_behind | int | pad_front | | -| 18 | output_pad_right| int | 0 | | -| 19 | output_pad_bottom| int | output_pad_right | | -| 20 | output_pad_behind| int | output_pad_right | | -| 21 | kernel_d | int | kernel_w | | -| 22 | dilation_d | int | dilation_w | | -| 23 | stride_d | int | stride_w | | -| 24 | pad_front | int | pad_left | | -| 25 | 
output_w | int | 0 | | -| 26 | output_h | int | output_w | | -| 27 | output_d | int | output_w | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, kernel_h, kernel_d, num_input, num_output] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 17 | pad_behind | int | pad_front | | +| 18 | output_pad_right | int | 0 | | +| 19 | output_pad_bottom | int | output_pad_right | | +| 20 | output_pad_behind | int | output_pad_right | | +| 21 | kernel_d | int | kernel_w | | +| 22 | dilation_d | int | dilation_w | | +| 23 | stride_d | int | stride_w | | +| 24 | pad_front | int | pad_left | | +| 25 | output_w | int | 0 | | +| 26 | output_h | int | output_w | | +| 27 | output_d | int | output_w | | + +| weight | type | shape | +| ----------- | ---------- | ----------------------------------------------------- | +| weight_data | float/fp16 | [kernel_w, kernel_h, kernel_d, num_input, num_output] | +| bias_data | float | [num_output] | # DeconvolutionDepthWise + ``` x2 = deconv(x, weight, kernel, stride, dilation, group) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 18 | output_pad_right| int | 0 | | -| 19 | output_pad_bottom| int | output_pad_right | | -| 20 | output_w | int | 0 | | -| 21 | output_h | int | output_w | | -| 28 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, kernel_h, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | 
stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 18 | output_pad_right | int | 0 | | +| 19 | output_pad_bottom | int | output_pad_right | | +| 20 | output_w | int | 0 | | +| 21 | output_h | int | output_w | | +| 28 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | ---------- | ------------------------------------------------------------------ | +| weight_data | float/fp16 | [kernel_w, kernel_h, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | # DeconvolutionDepthWise1D + ``` x2 = deconv1d(x, weight, kernel, stride, dilation, group) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 15 | pad_right | int | pad_left | | -| 18 | output_pad_right| int | 0 | | -| 20 | output_w | int | 0 | | -| 28 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | -------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 15 | pad_right | int | pad_left | | +| 18 | output_pad_right | int | 0 | | +| 20 | output_w | int | 0 | | +| 28 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | ---------- | -------------------------------------------------------- | +| weight_data | float/fp16 | [kernel_w, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | # DeconvolutionDepthWise3D + ``` x2 = deconv3d(x, weight, kernel, stride, dilation, group) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 17 | pad_behind | int | pad_front | | -| 18 | output_pad_right| int | 0 | | -| 19 | output_pad_bottom| int | output_pad_right | | -| 20 | output_pad_behind| int | output_pad_right | | -| 21 | kernel_d | int | kernel_w | | -| 
22 | dilation_d | int | dilation_w | | -| 23 | stride_d | int | stride_w | | -| 24 | pad_front | int | pad_left | | -| 25 | output_w | int | 0 | | -| 26 | output_h | int | output_w | | -| 27 | output_d | int | output_w | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, kernel_h, kernel_d, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 17 | pad_behind | int | pad_front | | +| 18 | output_pad_right | int | 0 | | +| 19 | output_pad_bottom | int | output_pad_right | | +| 20 | output_pad_behind | int | output_pad_right | | +| 21 | kernel_d | int | kernel_w | | +| 22 | dilation_d | int | dilation_w | | +| 23 | stride_d | int | stride_w | | +| 24 | pad_front | int | pad_left | | +| 25 | output_w | int | 0 | | +| 26 | output_h | int | output_w | | +| 27 | output_d | int | output_w | | + +| weight | type | shape | +| ----------- | ---------- | ---------------------------------------------------------------------------- | +| weight_data | float/fp16 | [kernel_w, kernel_h, kernel_d, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | # DeformableConv2D + ``` x2 = deformableconv2d(x, offset, mask, weight, kernel, stride, dilation) + bias y = activation(x2, act_type, act_params) ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] | -| bias_data | float | [num_output] | +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | 
pad_top | | + +| weight | type | shape | +| ----------- | --------------- | ------------------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] | +| bias_data | float | [num_output] | # Dequantize + ``` y = x * scale + bias ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | scale_data_size| int | 1 | | -| 1 | bias_data_size| int | 0 | | +| param id | name | type | default | description | +| -------- | --------------- | ---- | ------- | ----------- | +| 0 | scale_data_size | int | 1 | | +| 1 | bias_data_size | int | 0 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| scale_data | float | [scale_data_size] | -| bias_data | float | [bias_data_size] | +| weight | type | shape | +| ---------- | ----- | ----------------- | +| scale_data | float | [scale_data_size] | +| bias_data | float | [bias_data_size] | # Diag + ``` y = diag(x, diagonal) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | diagonal | int | 0 | | +| param id | name | type | default | description | +| -------- | -------- | ---- | ------- | ----------- | +| 0 | diagonal | int | 0 | | # Dropout + ``` y = x * scale ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | scale | float | 1.f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | scale | float | 1.f | | # Eltwise + ``` y = elementwise_op(x0, x1, ...) 
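+if op_type == SUM and coeffs are given
+    y = coeffs[0] * x0 + coeffs[1] * x1 + ...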
``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | op_type | int | 0 | | -| 1 | coeffs | array | [ ] | | +| param id | name | type | default | description | +| -------- | ------- | ----- | ------- | ----------- | +| 0 | op_type | int | 0 | | +| 1 | coeffs | array | [ ] | | Operation type: + - 0 = PROD - 1 = SUM - 2 = MAX # ELU + ``` if x < 0 y = (exp(x) - 1) * alpha else y = x ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | alpha | float | 0.1f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | alpha | float | 0.1f | | # Embed + ``` y = embedding(x) ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | input_dim | int | 0 | | -| 2 | bias_term | int | 0 | | -| 3 | weight_data_size | int | 0 | | -| 18 | int8_scale_term| int | 0 | | +| param id | name | type | default | description | +| -------- | ---------------- | ---- | ------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | input_dim | int | 0 | | +| 2 | bias_term | int | 0 | | +| 3 | weight_data_size | int | 0 | | +| 18 | int8_scale_term | int | 0 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float | [weight_data_size] | -| bias_term | float | [num_output] | -| weight_data_int8_scales| float | [1] | +| weight | type | shape | +| ----------------------- | ----- | ------------------ | +| weight_data | float | [weight_data_size] | +| bias_term | float | [num_output] | +| weight_data_int8_scales | float | [1] | # Exp + ``` if base == -1 y = exp(shift + x * scale) else y = pow(base, (shift + x * scale)) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | base | float | -1.f | | -| 1 | scale | float | 1.f | | -| 2 | shift | float | 0.f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | base | float | -1.f | | +| 1 | scale | float | 1.f | | +| 2 | shift | float | 0.f | | # Flatten + Reshape blob to 1 dimension -* one_blob_only +- one_blob_only # Fold + ``` y = fold(x) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 20 | output_w | int | 0 | | -| 21 | output_h | int | output_w | | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ---------- | ---- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 
14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 20 | output_w | int | 0 | | +| 21 | output_h | int | output_w | | # GELU + ``` if fast_gelu == 1 y = 0.5 * x * (1 + tanh(0.79788452 * (x + 0.044715 * x * x * x))); else y = 0.5 * x * erfc(-0.70710678 * x) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | fast_gelu | int | 0 | use approximation | +| param id | name | type | default | description | +| -------- | --------- | ---- | ------- | ----------------- | +| 0 | fast_gelu | int | 0 | use approximation | # GLU @@ -913,13 +953,14 @@ where a is the first half of the input matrix and b is the second half. axis specifies the dimension to split the input -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | axis | int | 0 | | +| param id | name | type | default | description | +| -------- | ---- | ---- | ------- | ----------- | +| 0 | axis | int | 0 | | # Gemm + ``` a = transA ? transpose(x0) : x0 b = transb ? transpose(x1) : x1 @@ -927,88 +968,91 @@ c = x2 y = (gemm(a, b) + c * beta) * alpha ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | alpha | float | 1.f | | -| 1 | beta | float | 1.f | | -| 2 | transA | int | 0 | | -| 3 | transb | int | 0 | | -| 4 | constantA | int | 0 | | -| 5 | constantB | int | 0 | | -| 6 | constantC | int | 0 | | -| 7 | constantM | int | 0 | | -| 8 | constantN | int | 0 | | -| 9 | constantK | int | 0 | | -| 10 | constant_broadcast_type_C | int | 0 | | -| 11 | output_N1M | int | 0 | | -| 12 | output_elempack | int | 0 | | -| 13 | output_elemtype | int | 0 | | -| 14 | output_transpose | int| 0 | | -| 18 | int8_scale_term | int | 0 | | -| 20 | constant_TILE_M | int | 0 | | -| 21 | constant_TILE_N | int | 0 | | -| 22 | constant_TILE_K | int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| A_data | float/fp16/int8 | [M, K] or [K, M] | -| B_data | float/fp16/int8 | [N, K] or [K, N] | -| C_data | float | [1], [M] or [N] or [1, M] or [N,1] or [N, M] | -| A_data_int8_scales| float | [M] | -| B_data_int8_scales| float | [1] | +| param id | name | type | default | description | +| -------- | ------------------------- | ----- | ------- | ----------- | +| 0 | alpha | float | 1.f | | +| 1 | beta | float | 1.f | | +| 2 | transA | int | 0 | | +| 3 | transb | int | 0 | | +| 4 | constantA | int | 0 | | +| 5 | constantB | int | 0 | | +| 6 | constantC | int | 0 | | +| 7 | constantM | int | 0 | | +| 8 | constantN | int | 0 | | +| 9 | constantK | int | 0 | | +| 10 | constant_broadcast_type_C | int | 0 | | +| 11 | output_N1M | int | 0 | | +| 12 | output_elempack | int | 0 | | +| 13 | output_elemtype | int | 0 | | +| 14 | output_transpose | int | 0 | | +| 18 | int8_scale_term | int | 0 | | +| 20 | constant_TILE_M | int | 0 | | +| 21 | constant_TILE_N | int | 0 | | +| 22 | constant_TILE_K | int | 0 | | + +| weight | type | shape | +| ------------------ | --------------- | -------------------------------------------- | +| A_data | float/fp16/int8 | [M, K] or [K, M] | +| B_data | float/fp16/int8 | [N, K] or [K, N] | +| C_data | float | [1], [M] or [N] or [1, M] or [N,1] or [N, M] | +| A_data_int8_scales | float | [M] | +| B_data_int8_scales | 
float | [1] | # GridSample + ``` Given an input and a flow-field grid, computes the output using input values and pixel locations from grid. -For each output location output[:, h2, w2], the size-2 vector grid[h2, w2, 2] specifies input pixel[:, h1, w1] locations x and y, +For each output location output[:, h2, w2], the size-2 vector grid[h2, w2, 2] specifies input pixel[:, h1, w1] locations x and y, which are used to interpolate the output value output[:, h2, w2] This function is often used in conjunction with affine_grid() to build Spatial Transformer Networks . ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | sample_type | int | 1 | | -| 1 | padding_mode | int | 1 | | -| 2 | align_corner | int | 0 | | -| 3 | permute_fusion| int | 0 | fuse with permute | - +| param id | name | type | default | description | +| -------- | -------------- | ---- | ------- | ----------------- | +| 0 | sample_type | int | 1 | | +| 1 | padding_mode | int | 1 | | +| 2 | align_corner | int | 0 | | +| 3 | permute_fusion | int | 0 | fuse with permute | Sample type: + - 1 = Nearest - 2 = Bilinear - 3 = Bicubic Padding mode: + - 1 = zeros - 2 = border - 3 = reflection - # GroupNorm + ``` split x along channel axis into group x0, x1 ... l2 normalize for each group x0, x1 ... y = x * gamma + beta ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | group | int | 1 | | -| 1 | channels | int | 0 | | -| 2 | eps | float | 0.001f | x = x / sqrt(var + eps) | -| 3 | affine | int | 1 | | +| param id | name | type | default | description | +| -------- | -------- | ----- | ------- | ----------------------- | +| 0 | group | int | 1 | | +| 1 | channels | int | 0 | | +| 2 | eps | float | 0.001f | x = x / sqrt(var + eps) | +| 3 | affine | int | 1 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| gamma_data | float | [channels] | -| beta_data | float | [channels] | +| weight | type | shape | +| ---------- | ----- | ---------- | +| gamma_data | float | [channels] | +| beta_data | float | [channels] | # GRU + Apply a single-layer GRU to a feature sequence of `T` timesteps. The input blob shape is `[w=input_size, h=T]` and the output blob shape is `[w=num_output, h=T]`. 
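+
+As a concrete illustration of these blob shapes, here is a minimal C++ sketch that feeds a `T`-step sequence through a network containing a GRU layer; the pseudo-formula follows below. The file names `gru.param`/`gru.bin` and the blob names `in0`/`out0` are placeholders for a real model, not part of this specification.
+
+```cpp
+#include <string.h>
+
+#include "net.h"
+
+// Hypothetical helper: run a GRU model on a T-step feature sequence.
+static int run_gru(const float* features, int input_size, int T)
+{
+    ncnn::Net net;
+    if (net.load_param("gru.param") != 0 || net.load_model("gru.bin") != 0)
+        return -1;
+
+    // one row per timestep: w = input_size, h = T
+    ncnn::Mat in(input_size, T);
+    for (int t = 0; t < T; t++)
+        memcpy(in.row(t), features + t * input_size, sizeof(float) * input_size);
+
+    ncnn::Extractor ex = net.create_extractor();
+    ex.input("in0", in);
+
+    ncnn::Mat out; // w = num_output, h = T, one output row per timestep
+    return ex.extract("out0", out);
+}
+```
+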
``` @@ -1016,134 +1060,143 @@ y = gru(x) y0, hidden y1 = gru(x0, hidden x1) ``` -* one_blob_only if bidirectional +- one_blob_only if bidirectional -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | hidden size of output | -| 1 | weight_data_size| int | 0 | total size of weight matrix | -| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | +| param id | name | type | default | description | +| -------- | ---------------- | ---- | ------- | ------------------------------------- | +| 0 | num_output | int | 0 | hidden size of output | +| 1 | weight_data_size | int | 0 | total size of weight matrix | +| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_xc_data| float/fp16/int8 | [input_size, num_output * 3, num_directions] | -| bias_c_data | float/fp16/int8 | [num_output, 4, num_directions] | -| weight_hc_data| float/fp16/int8 | [num_output, num_output * 3, num_directions] | +| weight | type | shape | +| -------------- | --------------- | -------------------------------------------- | +| weight_xc_data | float/fp16/int8 | [input_size, num_output * 3, num_directions] | +| bias_c_data | float/fp16/int8 | [num_output, 4, num_directions] | +| weight_hc_data | float/fp16/int8 | [num_output, num_output * 3, num_directions] | Direction flag: + - 0 = forward only - 1 = reverse only - 2 = bidirectional # HardSigmoid + ``` y = clamp(x * alpha + beta, 0, 1) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | alpha | float | 0.2f | | -| 1 | beta | float | 0.5f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | alpha | float | 0.2f | | +| 1 | beta | float | 0.5f | | # HardSwish + ``` y = x * clamp(x * alpha + beta, 0, 1) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | alpha | float | 0.2f | | -| 1 | beta | float | 0.5f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | alpha | float | 0.2f | | +| 1 | beta | float | 0.5f | | # InnerProduct + ``` x2 = innerproduct(x, weight) + bias y = activation(x2, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | bias_term | int | 0 | | -| 2 | weight_data_size| int | 0 | | -| 8 | int8_scale_term| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | +- one_blob_only -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [num_input, num_output] | -| bias_data | float | [num_output] | -| weight_data_int8_scales| float | [num_output] | -| bottom_blob_int8_scales| float | [1] | +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | bias_term | int | 0 | | +| 2 | weight_data_size | int | 0 | | +| 8 | int8_scale_term | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | 
activation_params | array | [ ] | | + +| weight | type | shape | +| ----------------------- | --------------- | ----------------------- | +| weight_data | float/fp16/int8 | [num_input, num_output] | +| bias_data | float | [num_output] | +| weight_data_int8_scales | float | [num_output] | +| bottom_blob_int8_scales | float | [1] | # Input + ``` y = input ``` -* support_inplace +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | w | int | 0 | | -| 1 | h | int | 0 | | -| 11 | d | int | 0 | | -| 2 | c | int | 0 | | +| param id | name | type | default | description | +| -------- | ---- | ---- | ------- | ----------- | +| 0 | w | int | 0 | | +| 1 | h | int | 0 | | +| 11 | d | int | 0 | | +| 2 | c | int | 0 | | # InstanceNorm + ``` split x along channel axis into instance x0, x1 ... l2 normalize for each channel instance x0, x1 ... y = x * gamma + beta ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | channels | int | 0 | | -| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | -| 2 | affine | int | 1 | | +| param id | name | type | default | description | +| -------- | -------- | ----- | ------- | ----------------------- | +| 0 | channels | int | 0 | | +| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | +| 2 | affine | int | 1 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| gamma_data | float | [channels] | -| beta_data | float | [channels] | +| weight | type | shape | +| ---------- | ----- | ---------- | +| gamma_data | float | [channels] | +| beta_data | float | [channels] | # Interp + ``` if dynamic_target_size == 0 y = resize(x) by fixed size or scale else y = resize(x0, size(x1)) ``` -* one_blob_only if dynamic_target_size == 0 +- one_blob_only if dynamic_target_size == 0 -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | resize_type | int | 0 | | -| 1 | height_scale | float | 1.f | | -| 2 | width_scale | float | 1.f | | -| 3 | output_height | int | 0 | | -| 4 | output_width | int | 0 | | -| 5 | dynamic_target_size| int | 0 | | -| 6 | align_corner | int | 0 | | +| param id | name | type | default | description | +| -------- | ------------------- | ----- | ------- | ----------- | +| 0 | resize_type | int | 0 | | +| 1 | height_scale | float | 1.f | | +| 2 | width_scale | float | 1.f | | +| 3 | output_height | int | 0 | | +| 4 | output_width | int | 0 | | +| 5 | dynamic_target_size | int | 0 | | +| 6 | align_corner | int | 0 | | Resize type: + - 1 = Nearest - 2 = Bilinear - 3 = Bicubic # InverseSpectrogram + ``` x1 = x as complex x1 = x1 * sqrt(norm) if normalized @@ -1155,77 +1208,82 @@ if returns == 1 return y1 real if returns == 2 return y1 imag ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | n_fft | int | 0 | | -| 1 | returns | int | 1 | | -| 2 | hoplen | int | n_fft / 4 | | -| 3 | winlen | int | n_fft | | -| 4 | window_type | int | 0 | 0=ones 1=hann 2=hamming | -| 5 | center | int | 1 | | -| 7 | normalized | int | 0 | 0=no 1=n_fft 2=window-l2-energy | +| param id | name | type | default | description | +| -------- | ----------- | ---- | --------- | ------------------------------- | +| 0 | n_fft | int | 0 
| | +| 1 | returns | int | 1 | | +| 2 | hoplen | int | n_fft / 4 | | +| 3 | winlen | int | n_fft | | +| 4 | window_type | int | 0 | 0=ones 1=hann 2=hamming | +| 5 | center | int | 1 | | +| 7 | normalized | int | 0 | 0=no 1=n_fft 2=window-l2-energy | # LayerNorm + ``` split x along outmost axis into part x0, x1 ... l2 normalize for each part x0, x1 ... y = x * gamma + beta by elementwise ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | affine_size | int | 0 | | -| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | -| 2 | affine | int | 1 | | +| param id | name | type | default | description | +| -------- | ----------- | ----- | ------- | ----------------------- | +| 0 | affine_size | int | 0 | | +| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | +| 2 | affine | int | 1 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| gamma_data | float | [affine_size] | -| beta_data | float | [affine_size] | +| weight | type | shape | +| ---------- | ----- | ------------- | +| gamma_data | float | [affine_size] | +| beta_data | float | [affine_size] | # Log + ``` if base == -1 y = log(shift + x * scale) else y = log(shift + x * scale) / log(base) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | base | float | -1.f | | -| 1 | scale | float | 1.f | | -| 2 | shift | float | 0.f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | base | float | -1.f | | +| 1 | scale | float | 1.f | | +| 2 | shift | float | 0.f | | # LRN + ``` if region_type == ACROSS_CHANNELS square_sum = sum of channel window of local_size if region_type == WITHIN_CHANNEL square_sum = sum of spatial window of local_size y = x * pow(bias + alpha * square_sum / (local_size * local_size), -beta) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | region_type | int | 0 | | -| 1 | local_size | int | 5 | | -| 2 | alpha | float | 1.f | | -| 3 | beta | float | 0.75f | | -| 4 | bias | float | 1.f | | +| param id | name | type | default | description | +| -------- | ----------- | ----- | ------- | ----------- | +| 0 | region_type | int | 0 | | +| 1 | local_size | int | 5 | | +| 2 | alpha | float | 1.f | | +| 3 | beta | float | 0.75f | | +| 4 | bias | float | 1.f | | Region type: + - 0 = ACROSS_CHANNELS - 1 = WITHIN_CHANNEL # LSTM + Apply a single-layer LSTM to a feature sequence of `T` timesteps. The input blob shape is `[w=input_size, h=T]` and the output blob shape is `[w=num_output, h=T]`. 
``` @@ -1233,53 +1291,57 @@ y = lstm(x) y0, hidden y1, cell y2 = lstm(x0, hidden x1, cell x2) ``` -* one_blob_only if bidirectional +- one_blob_only if bidirectional -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | output size of output | -| 1 | weight_data_size| int | 0 | total size of IFOG weight matrix | -| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | -| 3 | hidden_size | int | num_output| hidden size | +| param id | name | type | default | description | +| -------- | ---------------- | ---- | ---------- | ------------------------------------- | +| 0 | num_output | int | 0 | output size of output | +| 1 | weight_data_size | int | 0 | total size of IFOG weight matrix | +| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | +| 3 | hidden_size | int | num_output | hidden size | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_xc_data| float/fp16/int8 | [input_size, hidden_size * 4, num_directions] | -| bias_c_data | float/fp16/int8 | [hidden_size, 4, num_directions] | -| weight_hc_data| float/fp16/int8 | [num_output, hidden_size * 4, num_directions] | -| weight_hr_data| float/fp16/int8 | [hidden_size, num_output, num_directions] | +| weight | type | shape | +| -------------- | --------------- | --------------------------------------------- | +| weight_xc_data | float/fp16/int8 | [input_size, hidden_size * 4, num_directions] | +| bias_c_data | float/fp16/int8 | [hidden_size, 4, num_directions] | +| weight_hc_data | float/fp16/int8 | [num_output, hidden_size * 4, num_directions] | +| weight_hr_data | float/fp16/int8 | [hidden_size, num_output, num_directions] | Direction flag: + - 0 = forward only - 1 = reverse only - 2 = bidirectional # MemoryData + ``` y = data ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | w | int | 0 | | -| 1 | h | int | 0 | | -| 11 | d | int | 0 | | -| 2 | c | int | 0 | | -| 21 | load_type | int | 1 | 1=fp32 | +| param id | name | type | default | description | +| -------- | --------- | ---- | ------- | ----------- | +| 0 | w | int | 0 | | +| 1 | h | int | 0 | | +| 11 | d | int | 0 | | +| 2 | c | int | 0 | | +| 21 | load_type | int | 1 | 1=fp32 | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| data | float | [w, h, d, c] | +| weight | type | shape | +| ------ | ----- | ------------ | +| data | float | [w, h, d, c] | # Mish + ``` y = x * tanh(log(exp(x) + 1)) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # MultiHeadAttention + ``` split q k v into num_head part q0, k0, v0, q1, k1, v1 ... 
for each num_head part @@ -1294,33 +1356,34 @@ for each num_head part y = affine(out) ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | embed_dim | int | 0 | | -| 1 | num_heads | int | 1 | | -| 2 | weight_data_size| int | 0 | qdim = weight_data_size / embed_dim | -| 3 | kdim | int | embed_dim | | -| 4 | vdim | int | embed_dim | | -| 5 | attn_mask | int | 0 | | -| 6 | scale | float | 1.f / sqrt(embed_dim / num_heads) | | -| 18 | int8_scale_term | int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| q_weight_data | float/fp16/int8 | [embed_dim * qdim] | -| q_bias_data | float | [embed_dim] | -| k_weight_data | float/fp16/int8 | [embed_dim * kdim] | -| k_bias_data | float | [embed_dim] | -| v_weight_data | float/fp16/int8 | [embed_dim * vdim] | -| v_bias_data | float | [embed_dim] | -| out_weight_data| float/fp16/int8 | [qdim * embed_dim] | -| out_bias_data | float | [qdim] | -| q_weight_data_int8_scales| float | [embed_dim] | -| k_weight_data_int8_scales| float | [embed_dim] | -| v_weight_data_int8_scales| float | [embed_dim] | -| out_weight_data_int8_scales| float | [1] | +| param id | name | type | default | description | +| -------- | ---------------- | ----- | --------------------------------- | ----------------------------------- | +| 0 | embed_dim | int | 0 | | +| 1 | num_heads | int | 1 | | +| 2 | weight_data_size | int | 0 | qdim = weight_data_size / embed_dim | +| 3 | kdim | int | embed_dim | | +| 4 | vdim | int | embed_dim | | +| 5 | attn_mask | int | 0 | | +| 6 | scale | float | 1.f / sqrt(embed_dim / num_heads) | | +| 18 | int8_scale_term | int | 0 | | + +| weight | type | shape | +| --------------------------- | --------------- | ------------------ | +| q_weight_data | float/fp16/int8 | [embed_dim * qdim] | +| q_bias_data | float | [embed_dim] | +| k_weight_data | float/fp16/int8 | [embed_dim * kdim] | +| k_bias_data | float | [embed_dim] | +| v_weight_data | float/fp16/int8 | [embed_dim * vdim] | +| v_bias_data | float | [embed_dim] | +| out_weight_data | float/fp16/int8 | [qdim * embed_dim] | +| out_bias_data | float | [qdim] | +| q_weight_data_int8_scales | float | [embed_dim] | +| k_weight_data_int8_scales | float | [embed_dim] | +| v_weight_data_int8_scales | float | [embed_dim] | +| out_weight_data_int8_scales | float | [1] | # MVN + ``` if normalize_variance == 1 && across_channels == 1 y = (x - mean) / (sqrt(var) + eps) of whole blob if normalize_variance == 1 && across_channels == 0 y = (x - mean) / (sqrt(var) + eps) of each channel @@ -1328,20 +1391,22 @@ if normalize_variance == 0 && across_channels == 1 y = x - mean of whole bl if normalize_variance == 0 && across_channels == 0 y = x - mean of each channel ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | normalize_variance| int | 0 | | -| 1 | across_channels| int | 0 | | -| 2 | eps | float | 0.0001f | x = x / (sqrt(var) + eps) | +| param id | name | type | default | description | +| -------- | ------------------ | ----- | ------- | ------------------------- | +| 0 | normalize_variance | int | 0 | | +| 1 | across_channels | int | 0 | | +| 2 | eps | float | 0.0001f | x = x / (sqrt(var) + eps) | # Noop + ``` y = x ``` # Normalize + ``` if across_spatial == 1 && across_channel == 1 x2 = normalize(x) of whole blob if across_spatial == 1 && across_channel == 0 x2 = normalize(x) of 
each channel @@ -1349,79 +1414,85 @@ if across_spatial == 0 && across_channel == 1 x2 = normalize(x) of each pos y = x2 * scale ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | across_spatial| int | 0 | | -| 1 | channel_shared| int | 0 | | -| 2 | eps | float | 0.0001f | see eps mode | -| 3 | scale_data_size| int | 0 | | -| 4 | across_channel| int | 0 | | -| 9 | eps_mode | int | 0 | | +| param id | name | type | default | description | +| -------- | --------------- | ----- | ------- | ------------ | +| 0 | across_spatial | int | 0 | | +| 1 | channel_shared | int | 0 | | +| 2 | eps | float | 0.0001f | see eps mode | +| 3 | scale_data_size | int | 0 | | +| 4 | across_channel | int | 0 | | +| 9 | eps_mode | int | 0 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| scale_data | float | [scale_data_size] | +| weight | type | shape | +| ---------- | ----- | ----------------- | +| scale_data | float | [scale_data_size] | Eps Mode: -- 0 = caffe/mxnet x = x / sqrt(var + eps) -- 1 = pytorch x = x / max(sqrt(var), eps) -- 2 = tensorflow x = x / sqrt(max(var, eps)) + +- 0 = caffe/mxnet x = x / sqrt(var + eps) +- 1 = pytorch x = x / max(sqrt(var), eps) +- 2 = tensorflow x = x / sqrt(max(var, eps)) # Packing + ``` y = wrap_packing(x) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | out_elempack | int | 1 | | -| 1 | use_padding | int | 0 | | -| 2 | cast_type_from| int | 0 | | -| 3 | cast_type_to | int | 0 | | -| 4 | storage_type_from| int | 0 | | -| 5 | storage_type_to| int | 0 | | +| param id | name | type | default | description | +| -------- | ----------------- | ---- | ------- | ----------- | +| 0 | out_elempack | int | 1 | | +| 1 | use_padding | int | 0 | | +| 2 | cast_type_from | int | 0 | | +| 3 | cast_type_to | int | 0 | | +| 4 | storage_type_from | int | 0 | | +| 5 | storage_type_to | int | 0 | | # Padding + ``` y = pad(x, pads) ``` -| param id | name | type | default | description | -| --------- | ------------- | ---- | --------- | ----------------- | -| 0 | top | int | 0 | | -| 1 | bottom | int | 0 | | -| 2 | left | int | 0 | | -| 3 | right | int | 0 | | -| 4 | type | int | 0 | | -| 5 | value | float | 0 | | -| 6 | per_channel_pad_data_size| int | 0 | | -| 7 | front | int | stride_w | | -| 8 | behind | int | pad_left | | +| param id | name | type | default | description | +| -------- | ------------------------- | ----- | -------- | ----------- | +| 0 | top | int | 0 | | +| 1 | bottom | int | 0 | | +| 2 | left | int | 0 | | +| 3 | right | int | 0 | | +| 4 | type | int | 0 | | +| 5 | value | float | 0 | | +| 6 | per_channel_pad_data_size | int | 0 | | +| 7 | front | int | stride_w | | +| 8 | behind | int | pad_left | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| per_channel_pad_data| float | [per_channel_pad_data_size] | +| weight | type | shape | +| -------------------- | ----- | --------------------------- | +| per_channel_pad_data | float | [per_channel_pad_data_size] | Padding type: + - 0 = CONSTANT - 1 = REPLICATE - 2 = REFLECT # Permute + ``` y = reorder(x) ``` -| param id | name | type | default | description | -| --------- | ------------- | ---- | --------- | ----------------- | -| 0 | order_type | int | 0 | | +| param id | name | type | default | 
description | +| -------- | ---------- | ---- | ------- | ----------- | +| 0 | order_type | int | 0 | | Order Type: + - 0 = WH WHC WHDC - 1 = HW HWC HWDC - 2 = WCH WDHC @@ -1448,183 +1519,198 @@ Order Type: - 23 = CDHW # PixelShuffle + ``` if mode == 0 y = depth_to_space(x) where x channel order is sw-sh-outc if mode == 1 y = depth_to_space(x) where x channel order is outc-sw-sh ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ---- | --------- | ----------------- | -| 0 | upscale_factor| int | 1 | | -| 1 | mode | int | 0 | | +| param id | name | type | default | description | +| -------- | -------------- | ---- | ------- | ----------- | +| 0 | upscale_factor | int | 1 | | +| 1 | mode | int | 0 | | # Pooling + ``` x2 = pad(x, pads) x3 = pooling(x2, kernel, stride) ``` -| param id | name | type | default | description | -| --------- | --------------| ---- | --------- | ----------------- | -| 0 | pooling_type | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | stride_w | int | 1 | | -| 3 | pad_left | int | 0 | | -| 4 | global_pooling| int | 0 | | -| 5 | pad_mode | int | 0 | | -| 6 | avgpool_count_include_pad| int | 0 | | -| 7 | adaptive_pooling| int | 0 | | -| 8 | out_w | int | 0 | | -| 11 | kernel_h | int | kernel_w | | -| 12 | stride_h | int | stride_w | | -| 13 | pad_top | int | pad_left | | -| 14 | pad_right | int | pad_left | | -| 15 | pad_bottom | int | pad_top | | -| 18 | out_h | int | out_w | | +| param id | name | type | default | description | +| -------- | ------------------------- | ---- | -------- | ----------- | +| 0 | pooling_type | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | stride_w | int | 1 | | +| 3 | pad_left | int | 0 | | +| 4 | global_pooling | int | 0 | | +| 5 | pad_mode | int | 0 | | +| 6 | avgpool_count_include_pad | int | 0 | | +| 7 | adaptive_pooling | int | 0 | | +| 8 | out_w | int | 0 | | +| 11 | kernel_h | int | kernel_w | | +| 12 | stride_h | int | stride_w | | +| 13 | pad_top | int | pad_left | | +| 14 | pad_right | int | pad_left | | +| 15 | pad_bottom | int | pad_top | | +| 18 | out_h | int | out_w | | Pooling type: + - 0 = MAX - 1 = AVG Pad mode: + - 0 = full padding - 1 = valid padding - 2 = tensorflow padding=SAME or onnx padding=SAME_UPPER - 3 = onnx padding=SAME_LOWER # Pooling1D + ``` x2 = pad(x, pads) x3 = pooling1d(x2, kernel, stride) ``` -| param id | name | type | default | description | -| --------- | --------------| ---- | --------- | ----------------- | -| 0 | pooling_type | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | stride_w | int | 1 | | -| 3 | pad_left | int | 0 | | -| 4 | global_pooling| int | 0 | | -| 5 | pad_mode | int | 0 | | -| 6 | avgpool_count_include_pad| int | 0 | | -| 7 | adaptive_pooling| int | 0 | | -| 8 | out_w | int | 0 | | -| 14 | pad_right | int | pad_left | | +| param id | name | type | default | description | +| -------- | ------------------------- | ---- | -------- | ----------- | +| 0 | pooling_type | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | stride_w | int | 1 | | +| 3 | pad_left | int | 0 | | +| 4 | global_pooling | int | 0 | | +| 5 | pad_mode | int | 0 | | +| 6 | avgpool_count_include_pad | int | 0 | | +| 7 | adaptive_pooling | int | 0 | | +| 8 | out_w | int | 0 | | +| 14 | pad_right | int | pad_left | | Pooling type: + - 0 = MAX - 1 = AVG Pad mode: + - 0 = full padding - 1 = valid padding - 2 = tensorflow padding=SAME or onnx padding=SAME_UPPER - 3 = onnx padding=SAME_LOWER # Pooling3D + ``` x2 = pad(x, pads) x3 = pooling3d(x2, 
kernel, stride) ``` -| param id | name | type | default | description | -| --------- | --------------| ---- | --------- | ----------------- | -| 0 | pooling_type | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | stride_w | int | 1 | | -| 3 | pad_left | int | 0 | | -| 4 | global_pooling| int | 0 | | -| 5 | pad_mode | int | 0 | | -| 6 | avgpool_count_include_pad| int | 0 | | -| 7 | adaptive_pooling| int | 0 | | -| 8 | out_w | int | 0 | | -| 11 | kernel_h | int | kernel_w | | -| 12 | stride_h | int | stride_w | | -| 13 | pad_top | int | pad_left | | -| 14 | pad_right | int | pad_left | | -| 15 | pad_bottom | int | pad_top | | -| 16 | pad_behind | int | pad_front | | -| 18 | out_h | int | out_w | | -| 21 | kernel_d | int | kernel_w | | -| 22 | stride_d | int | stride_w | | -| 23 | pad_front | int | pad_left | | -| 28 | out_d | int | out_w | | +| param id | name | type | default | description | +| -------- | ------------------------- | ---- | --------- | ----------- | +| 0 | pooling_type | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | stride_w | int | 1 | | +| 3 | pad_left | int | 0 | | +| 4 | global_pooling | int | 0 | | +| 5 | pad_mode | int | 0 | | +| 6 | avgpool_count_include_pad | int | 0 | | +| 7 | adaptive_pooling | int | 0 | | +| 8 | out_w | int | 0 | | +| 11 | kernel_h | int | kernel_w | | +| 12 | stride_h | int | stride_w | | +| 13 | pad_top | int | pad_left | | +| 14 | pad_right | int | pad_left | | +| 15 | pad_bottom | int | pad_top | | +| 16 | pad_behind | int | pad_front | | +| 18 | out_h | int | out_w | | +| 21 | kernel_d | int | kernel_w | | +| 22 | stride_d | int | stride_w | | +| 23 | pad_front | int | pad_left | | +| 28 | out_d | int | out_w | | Pooling type: + - 0 = MAX - 1 = AVG Pad mode: + - 0 = full padding - 1 = valid padding - 2 = tensorflow padding=SAME or onnx padding=SAME_UPPER - 3 = onnx padding=SAME_LOWER # Power + ``` y = pow((shift + x * scale), power) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | power | float | 1.f | | -| 1 | scale | float | 1.f | | -| 2 | shift | float | 0.f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | power | float | 1.f | | +| 1 | scale | float | 1.f | | +| 2 | shift | float | 0.f | | # PReLU + ``` if x < 0 y = x * slope else y = x ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_slope | int | 0 | | +| param id | name | type | default | description | +| -------- | --------- | ---- | ------- | ----------- | +| 0 | num_slope | int | 0 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| slope_data | float | [num_slope] | +| weight | type | shape | +| ---------- | ----- | ----------- | +| slope_data | float | [num_slope] | # Quantize + ``` y = float2int8(x * scale) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | scale_data_size| int | 1 | | +| param id | name | type | default | description | +| -------- | --------------- | ---- | ------- | ----------- | +| 0 | scale_data_size | int | 1 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| scale_data | float | 
[scale_data_size] | +| weight | type | shape | +| ---------- | ----- | ----------------- | +| scale_data | float | [scale_data_size] | # Reduction + ``` y = reduce_op(x * coeff) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | operation | int | 0 | | -| 1 | reduce_all | int | 1 | | -| 2 | coeff | float | 1.f | | -| 3 | axes | array | [ ] | | -| 4 | keepdims | int | 0 | | -| 5 | fixbug0 | int | 0 | hack for bug fix, should be 1 | +| param id | name | type | default | description | +| -------- | ---------- | ----- | ------- | ----------------------------- | +| 0 | operation | int | 0 | | +| 1 | reduce_all | int | 1 | | +| 2 | coeff | float | 1.f | | +| 3 | axes | array | [ ] | | +| 4 | keepdims | int | 0 | | +| 5 | fixbug0 | int | 0 | hack for bug fix, should be 1 | Operation type: + - 0 = SUM - 1 = ASUM - 2 = SUMSQ @@ -1638,96 +1724,103 @@ Operation type: - 10 = LogSumExp # ReLU + ``` if x < 0 y = x * slope else y = x ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | slope | float | 0.f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | slope | float | 0.f | | # Reorg + ``` if mode == 0 y = space_to_depth(x) where x channel order is sw-sh-outc if mode == 1 y = space_to_depth(x) where x channel order is outc-sw-sh ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ---- | --------- | ----------------- | -| 0 | stride | int | 1 | | -| 1 | mode | int | 0 | | +| param id | name | type | default | description | +| -------- | ------ | ---- | ------- | ----------- | +| 0 | stride | int | 1 | | +| 1 | mode | int | 0 | | # Requantize + ``` x2 = x * scale_in + bias x3 = activation(x2) y = float2int8(x3 * scale_out) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | scale_in_data_size| int | 1 | | -| 1 | scale_out_data_size| int | 1 | | -| 2 | bias_data_size| int | 0 | | -| 3 | activation_type| int | 0 | | -| 4 | activation_params| int | [ ] | | +| param id | name | type | default | description | +| -------- | ------------------- | ---- | ------- | ----------- | +| 0 | scale_in_data_size | int | 1 | | +| 1 | scale_out_data_size | int | 1 | | +| 2 | bias_data_size | int | 0 | | +| 3 | activation_type | int | 0 | | +| 4 | activation_params | int | [ ] | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| scale_in_data | float | [scale_in_data_size] | -| scale_out_data| float | [scale_out_data_size] | -| bias_data | float | [bias_data_size] | +| weight | type | shape | +| -------------- | ----- | --------------------- | +| scale_in_data | float | [scale_in_data_size] | +| scale_out_data | float | [scale_out_data_size] | +| bias_data | float | [bias_data_size] | # Reshape + ``` if permute == 1 y = hwc2chw(reshape(chw2hwc(x))) else y = reshape(x) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | w | int | -233 | | -| 1 | h | int | -233 | | -| 11 | d | int | -233 | | -| 2 | c | int | -233 | | -| 3 | permute | int | 0 | | +| param id | name | 
type | default | description | +| -------- | ------- | ---- | ------- | ----------- | +| 0 | w | int | -233 | | +| 1 | h | int | -233 | | +| 11 | d | int | -233 | | +| 2 | c | int | -233 | | +| 3 | permute | int | 0 | | Reshape flag: + - 0 = copy from bottom - -1 = remaining - -233 = drop this dim(default) # RMSNorm + ``` split x along outmost axis into part x0, x1 ... root mean square normalize for each part x0, x1 ... y = x * gamma by elementwise ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | affine_size | int | 0 | | -| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | -| 2 | affine | int | 1 | | +| param id | name | type | default | description | +| -------- | ----------- | ----- | ------- | ----------------------- | +| 0 | affine_size | int | 0 | | +| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | +| 2 | affine | int | 1 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| gamma_data | float | [affine_size] | +| weight | type | shape | +| ---------- | ----- | ------------- | +| gamma_data | float | [affine_size] | # RNN + Apply a single-layer RNN to a feature sequence of `T` timesteps. The input blob shape is `[w=input_size, h=T]` and the output blob shape is `[w=num_output, h=T]`. ``` @@ -1735,127 +1828,137 @@ y = rnn(x) y0, hidden y1 = rnn(x0, hidden x1) ``` -* one_blob_only if bidirectional +- one_blob_only if bidirectional -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | hidden size of output | -| 1 | weight_data_size| int | 0 | total size of weight matrix | -| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | +| param id | name | type | default | description | +| -------- | ---------------- | ---- | ------- | ------------------------------------- | +| 0 | num_output | int | 0 | hidden size of output | +| 1 | weight_data_size | int | 0 | total size of weight matrix | +| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_xc_data| float/fp16/int8 | [input_size, num_output, num_directions] | -| bias_c_data | float/fp16/int8 | [num_output, 1, num_directions] | -| weight_hc_data| float/fp16/int8 | [num_output, num_output, num_directions] | +| weight | type | shape | +| -------------- | --------------- | ---------------------------------------- | +| weight_xc_data | float/fp16/int8 | [input_size, num_output, num_directions] | +| bias_c_data | float/fp16/int8 | [num_output, 1, num_directions] | +| weight_hc_data | float/fp16/int8 | [num_output, num_output, num_directions] | Direction flag: + - 0 = forward only - 1 = reverse only - 2 = bidirectional # Scale + ``` if scale_data_size == -233 y = x0 * x1 else y = x * scale + bias ``` -* one_blob_only if scale_data_size != -233 -* support_inplace +- one_blob_only if scale_data_size != -233 +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | scale_data_size| int | 0 | | -| 1 | bias_term | int | 0 | | +| param id | name | type | default | description | +| -------- | --------------- | ---- | ------- | ----------- | +| 0 | scale_data_size | int | 0 | | +| 1 | bias_term | int | 0 | | -| weight | type | shape | -| ------------- | ----- | 
--------------------- | -| scale_data | float | [scale_data_size] | -| bias_data | float | [scale_data_size] | +| weight | type | shape | +| ---------- | ----- | ----------------- | +| scale_data | float | [scale_data_size] | +| bias_data | float | [scale_data_size] | # SELU + ``` if x < 0 y = (exp(x) - 1.f) * alpha * lambda else y = x * lambda ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | alpha | float | 1.67326324f| | -| 1 | lambda | float | 1.050700987f| | +| param id | name | type | default | description | +| -------- | ------ | ----- | ------------ | ----------- | +| 0 | alpha | float | 1.67326324f | | +| 1 | lambda | float | 1.050700987f | | # Shrink + ``` if x < -lambd y = x + bias if x > lambd y = x - bias else y = x ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | bias | float | 0.0f | | -| 1 | lambd | float | 0.5f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | bias | float | 0.0f | | +| 1 | lambd | float | 0.5f | | # ShuffleChannel + ``` if reverse == 0 y = shufflechannel(x) by group if reverse == 1 y = shufflechannel(x) by channel / group ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ---- | --------- | ----------------- | -| 0 | group | int | 1 | | -| 1 | reverse | int | 0 | | +| param id | name | type | default | description | +| -------- | ------- | ---- | ------- | ----------- | +| 0 | group | int | 1 | | +| 1 | reverse | int | 0 | | # Sigmoid + ``` y = 1 / (1 + exp(-x)) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # Slice + ``` split x along axis into slices, each part slice size is based on slices array ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | slices | array | [ ] | | -| 1 | axis | int | 0 | | -| 2 | indices | array | [ ] | | +| param id | name | type | default | description | +| -------- | ------- | ----- | ------- | ----------- | +| 0 | slices | array | [ ] | | +| 1 | axis | int | 0 | | +| 2 | indices | array | [ ] | | # Softmax + ``` softmax(x, axis) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | axis | int | 0 | | -| 1 | fixbug0 | int | 0 | hack for bug fix, should be 1 | +| param id | name | type | default | description | +| -------- | ------- | ---- | ------- | ----------------------------- | +| 0 | axis | int | 0 | | +| 1 | fixbug0 | int | 0 | hack for bug fix, should be 1 | # Softplus + ``` y = log(exp(x) + 1) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # Spectrogram + ``` x1 = pad(x) if center y = stft(x1) @@ -1866,68 +1969,89 @@ if power == 1 return magnitude if power == 2 return square of magnitude ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | n_fft | int | 0 | | -| 1 | power | int | 0 | | -| 2 | hoplen | int | n_fft / 4 | | -| 3 | winlen | int | n_fft | | -| 4 | window_type | int 
| 0 | 0=ones 1=hann 2=hamming | -| 5 | center | int | 1 | | -| 6 | pad_type | int | 2 | 0=CONSTANT 1=REPLICATE 2=REFLECT | -| 7 | normalized | int | 0 | 0=no 1=n_fft 2=window-l2-energy | -| 8 | onesided | int | 1 | | +| param id | name | type | default | description | +| -------- | ----------- | ---- | --------- | -------------------------------- | +| 0 | n_fft | int | 0 | | +| 1 | power | int | 0 | | +| 2 | hoplen | int | n_fft / 4 | | +| 3 | winlen | int | n_fft | | +| 4 | window_type | int | 0 | 0=ones 1=hann 2=hamming | +| 5 | center | int | 1 | | +| 6 | pad_type | int | 2 | 0=CONSTANT 1=REPLICATE 2=REFLECT | +| 7 | normalized | int | 0 | 0=no 1=n_fft 2=window-l2-energy | +| 8 | onesided | int | 1 | | # Split + ``` y0, y1 ... = x ``` # Swish + ``` y = x / (1 + exp(-x)) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # TanH + ``` y = tanh(x) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # Threshold + ``` if x > threshold y = 1 else y = 0 ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace + +| param id | name | type | default | description | +| -------- | --------- | ----- | ------- | ----------- | +| 0 | threshold | float | 0.f | | + +# TopK + +``` +y = topk(x, k, axis, largest, sorted) +``` + +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | threshold | float | 0.f | | +| param id | name | type | default | description | +| -------- | ------- | ---- | ------- | ---------------------------------- | +| 0 | k | int | 1 | number of top entries | +| 1 | axis | int | 0 | dimension along which to compute k | +| 2 | largest | int | 1 | 1 for largest, 0 for smallest | +| 3 | sorted | int | 1 | 1 to return sorted results | # Tile + ``` y = repeat tiles along axis for x ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | axis | int | 0 | | -| 1 | tiles | int | 1 | | -| 2 | repeats | array | [ ] | | +| param id | name | type | default | description | +| -------- | ------- | ----- | ------- | ----------- | +| 0 | axis | int | 0 | | +| 1 | tiles | int | 1 | | +| 2 | repeats | array | [ ] | | # UnaryOp + ``` y = unaryop(x) ``` @@ -1935,11 +2059,12 @@ y = unaryop(x) - one_blob_only - support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | op_type | int | 0 | Operation type as follows | +| param id | name | type | default | description | +| -------- | ------- | ---- | ------- | ------------------------- | +| 0 | op_type | int | 0 | Operation type as follows | Operation type: + - 0 = ABS - 1 = NEG - 2 = FLOOR @@ -1962,22 +2087,23 @@ Operation type: - 19 = TRUNC # Unfold + ``` y = unfold(x) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ---------- | ---- | ---------- 
| ----------- |
+| 0        | num_output | int  | 0          |             |
+| 1        | kernel_w   | int  | 0          |             |
+| 2        | dilation_w | int  | 1          |             |
+| 3        | stride_w   | int  | 1          |             |
+| 4        | pad_left   | int  | 0          |             |
+| 11       | kernel_h   | int  | kernel_w   |             |
+| 12       | dilation_h | int  | dilation_w |             |
+| 13       | stride_h   | int  | stride_w   |             |
+| 14       | pad_top    | int  | pad_left   |             |
+| 15       | pad_right  | int  | pad_left   |             |
+| 16       | pad_bottom | int  | pad_top    |             |
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index c97235d97a0..49f61c8b8c2 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -169,6 +169,7 @@ ncnn_add_layer(Shrink)
 ncnn_add_layer(RMSNorm)
 ncnn_add_layer(Spectrogram)
 ncnn_add_layer(InverseSpectrogram)
+ncnn_add_layer(TopK)
 
 if(NCNN_VULKAN)
     ncnn_add_shader(${CMAKE_CURRENT_SOURCE_DIR}/convert_ycbcr.comp)
diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp
new file mode 100644
index 00000000000..f7e2aa4f297
--- /dev/null
+++ b/src/layer/topk.cpp
@@ -0,0 +1,420 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "topk.h"
+#if !NCNN_SIMPLESTL
+// for compatibility with the VS compiler
+#include <algorithm>
+#endif
+
+namespace ncnn {
+
+// auto comp = [this](const std::pair<float, int>& a, const std::pair<float, int>& b)
+// {
+//     if (a.first == b.first)
+//         return a.second < b.second; // ties are broken by ascending index
+//     return this->largest ? (a.first > b.first) : (a.first < b.first);
+// };
+
+// simplestl-compatible functor version of the comparator above
+struct TopK::CompareFunc
+{
+    bool largest;
+    CompareFunc(bool l)
+        : largest(l)
+    {
+    }
+    bool operator()(const std::pair<float, int>& a, const std::pair<float, int>& b) const
+    {
+        if (a.first == b.first)
+            return a.second < b.second;
+        return largest ? (a.first > b.first) : (a.first < b.first);
+    }
+};
+
+void TopK::do_sort(std::vector<std::pair<float, int> >& vec, int k, bool sorted) const
+{
+    CompareFunc comp(largest);
+    if (sorted)
+    {
+        std::partial_sort(vec.begin(), vec.begin() + k, vec.end(), comp);
+    }
+    else
+    {
+#if !NCNN_SIMPLESTL
+        std::nth_element(vec.begin(), vec.begin() + k - 1, vec.end(), comp);
+        std::sort(vec.begin(), vec.begin() + k, comp);
+#else
+        // simplestl fallback: select the top k by repeated bubble passes
+        for (int i = 0; i < k; i++)
+        {
+            for (int j = vec.size() - 1; j > i; j--)
+            {
+                if (comp(vec[j], vec[j - 1]))
+                {
+                    std::swap(vec[j], vec[j - 1]);
+                }
+            }
+        }
+#endif
+    }
+}
+
+TopK::TopK()
+{
+    // one_blob_only = true;   // single input blob and single output blob
+    // support_inplace = true; // input and output may share one blob
+    one_blob_only = false;   // one input but two outputs (values and indices)
+    support_inplace = false; // no in-place computation
+}
+
+int TopK::load_param(const ParamDict& pd)
+{
+    k = pd.get(0, 1); // default k = 1
+    axis = pd.get(1, 0);
+    largest = pd.get(2, 1);
+    sorted = pd.get(3, 1);
+    // printf("load_param k=%d, axis=%d, largest=%d, sorted=%d\n", k, axis, largest, sorted);
+    return 0;
+}
+
+int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    int dims = bottom_blob.dims;
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int d = bottom_blob.d;
+    int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
+
+    // printf("dims=%d, w=%d, h=%d, d=%d, channels=%d, elemsize=%zu\n", dims, w, h, d, channels, elemsize);
+    // sanity check on k
+    if (k <= 0 || k > w * h * channels)
+    {
+        return -1;
+    }
+
+    // output Mats
+    Mat& top_blob_values = top_blobs[0];  // values
+    Mat& top_blob_indices = top_blobs[1]; // indices
+
+    // create outputs of matching rank according to dims
+    if (dims == 1)
+    {
+        // create output blobs
+        top_blob_values.create(k, elemsize, opt.blob_allocator);
+        top_blob_indices.create(k, sizeof(int), opt.blob_allocator);
+
+        const float* ptr = bottom_blob;
+        float* outptr = top_blob_values;
+        int* indices = top_blob_indices;
+        // build (value, index) pairs for sorting
+        std::vector<std::pair<float, int> > vec(w);
+        for (int i = 0; i < w; i++)
+        {
+            vec[i] = std::make_pair(ptr[i], i);
+        }
+
+        // sort according to the sorted parameter
+        do_sort(vec, k, sorted);
+
+        // store results
+        for (int i = 0; i < k; i++)
+        {
+            outptr[i] = vec[i].first;
+            indices[i] = vec[i].second;
+        }
+    }
+    else if (dims == 2)
+    {
+        // axis == 0: TopK along the column (h) direction
+        if (axis == 0)
+        {
+            top_blob_values.create(w, k, elemsize, opt.blob_allocator);
+            top_blob_indices.create(w, k, sizeof(int), opt.blob_allocator);
+
+            // #pragma omp parallel for
+            for (int j = 0; j < w; j++) // process each column
+            {
+                std::vector<std::pair<float, int> > vec(h);
+                // collect all elements of the current column
+                for (int i = 0; i < h; i++)
+                {
+                    vec[i] = std::make_pair(bottom_blob.row(i)[j], i);
+                }
+
+                do_sort(vec, k, sorted);
+
+                // store results back into the same column
+                for (int i = 0; i < k; i++)
+                {
+                    top_blob_values.row(i)[j] = vec[i].first;
+                    top_blob_indices.row<int>(i)[j] = vec[i].second;
+                }
+            }
+        }
+        // TopK along each row; axis=-1 is equivalent to axis=1
+        else
+        {
+            top_blob_values.create(k, h, elemsize, opt.blob_allocator);
+            top_blob_indices.create(k, h, sizeof(int), opt.blob_allocator);
+
+            for (int i = 0; i < h; i++)
+            {
+                const float* ptr = bottom_blob.row(i);
+                float* outptr = top_blob_values.row(i);
+                int* indices = top_blob_indices.row<int>(i);
+
+                std::vector<std::pair<float, int> > vec(w);
+                for (int j = 0; j < w; j++)
+                {
+                    vec[j] = std::make_pair(ptr[j], j);
+                }
+
+                do_sort(vec, k, sorted);
+
+                for (int j = 0; j < k; j++)
+                {
+                    outptr[j] = vec[j].first;
+                    indices[j] = vec[j].second;
+                }
+            }
+        }
+    }
+    else if (dims == 3)
+    {
+        if (axis == 0)
+        {
+            // across the channel direction: w and h unchanged, channel count becomes k
+            top_blob_values.create(w, h, k, elemsize, opt.blob_allocator);
+            top_blob_indices.create(w, h, k, sizeof(int), opt.blob_allocator);
+            // #pragma omp parallel for collapse(2)
+            for (int i = 0; i < h; i++)
+            {
+                for (int j = 0; j < w; j++)
+                {
+                    // collect the values of every channel at this position
+                    std::vector<std::pair<float, int> > channel_values(channels);
+                    for (int c = 0; c < channels; c++)
+                    {
+                        const float* ptr = bottom_blob.channel(c);
+                        channel_values[c] = std::make_pair(ptr[i * w + j], c);
+                    }
+
+                    // sort
+                    do_sort(channel_values, k, sorted);
+
+                    // write back results
+                    for (int c = 0; c < k; c++)
+                    {
+                        float* outptr = top_blob_values.channel(c);
+                        int* indices = (int*)top_blob_indices.channel(c);
+                        outptr[i * w + j] = channel_values[c].first;
+                        indices[i * w + j] = channel_values[c].second;
+                    }
+                }
+            }
+        }
+        else if (axis == 1)
+        {
+            // TopK inside each channel: w unchanged, h becomes k
+            top_blob_values.create(w, k, channels, elemsize, opt.blob_allocator);
+            top_blob_indices.create(w, k, channels, sizeof(int), opt.blob_allocator);
+            for (int q = 0; q < channels; q++)
+            {
+                // walk the rows of each channel
+                std::vector<std::pair<float, int> > row_scores(h);
+                for (int j = 0; j < w; j++)
+                {
+                    // process each column separately
+                    for (int i = 0; i < h; i++)
+                    {
+                        row_scores[i] = std::make_pair(bottom_blob.channel(q).row(i)[j], i);
+                    }
+
+                    // find the top row indices
+                    do_sort(row_scores, k, sorted);
+
+                    // store the results of this column
+                    for (int i = 0; i < k; i++)
+                    {
+                        float* outptr = top_blob_values.channel(q).row(i);
+                        int* indices = (int*)top_blob_indices.channel(q).row(i);
+                        outptr[j] = row_scores[i].first;
+                        indices[j] = row_scores[i].second;
+                    }
+                }
+            }
+        }
+        else if (axis == 2 || axis == -1)
+        {
+            // output rows of length k; h unchanged
+            top_blob_values.create(k, h, channels, elemsize, opt.blob_allocator);
+            top_blob_indices.create(k, h, channels, sizeof(int), opt.blob_allocator);
+            for (int q = 0; q < channels; q++)
+            {
+                for (int j = 0; j < h; j++)
+                {
+                    const float* ptr = bottom_blob.channel(q).row(j);
+                    float* outptr = top_blob_values.channel(q).row(j);
+                    int* indices = top_blob_indices.channel(q).row<int>(j);
+
+                    std::vector<std::pair<float, int> > vec(w);
+                    for (int i = 0; i < w; i++)
+                    {
+                        vec[i] = std::make_pair(ptr[i], i);
+                    }
+
+                    do_sort(vec, k, sorted);
+
+                    for (int i = 0; i < k; i++)
+                    {
+                        outptr[i] = vec[i].first;
+                        indices[i] = vec[i].second;
+                    }
+                }
+            }
+        }
+    }
+    else if (dims == 4)
+    {
+        // 4D data handling
+        if (axis == 0)
+        {
+            // PyTorch: batch -> channel -> height -> width
+            // ncnn:    channels -> depth -> height -> width
+            top_blob_values.create(w, h, k, channels, elemsize, opt.blob_allocator);
+            top_blob_indices.create(w, h, k, channels, sizeof(int), opt.blob_allocator);
+
+            // In pytorch, given x of torch.Size([3, 2, 6, 7]), topk along the N dimension
+            // compares and sorts x[0], x[1], x[2] and directly outputs the selected x[i].
+            // In ncnn, iterating channels and then d yields 2*3 = 6 slices, and it is
+            // unclear how to map them back. Stuck here, not sure how to handle it.
+            // need help !!!
+        }
+        else if (axis == 1)
+        {
+            // TopK along the channel dimension
+            top_blob_values.create(w, h, d, k, elemsize, opt.blob_allocator);
+            top_blob_indices.create(w, h, d, k, sizeof(int), opt.blob_allocator);
+
+            // need help !!!
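+            // One possible approach (untested sketch, not the final implementation):
+            // pytorch dim=1 maps to the ncnn depth axis here, so the create() calls
+            // above would likely need to become (w, h, k, channels). Then, for each
+            // channel q and each position (i, j), gather the d values at offset
+            // (z * h + i) * w + j for z in [0, d), run do_sort on them, and scatter
+            // the top k back along the depth axis, mirroring the dims == 3 axis == 0
+            // code with one extra loop.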
+
+        }
+        else if (axis == 2)
+        {
+            // TopK along the h dimension
+            top_blob_values.create(w, k, d, channels, elemsize, opt.blob_allocator);
+            top_blob_indices.create(w, k, d, channels, sizeof(int), opt.blob_allocator);
+
+            for (int q = 0; q < channels; q++)
+            {
+                const float* ptr = bottom_blob.channel(q);
+                float* outptr = top_blob_values.channel(q);
+                int* indices = top_blob_indices.channel(q);
+
+                for (int z = 0; z < d; z++)
+                {
+                    for (int i = 0; i < w; i++)
+                    {
+                        std::vector<std::pair<float, int> > row_scores(h);
+                        for (int j = 0; j < h; j++)
+                        {
+                            int offset = (z * h + j) * w + i;
+                            row_scores[j] = std::make_pair(ptr[offset], j);
+                        }
+
+                        do_sort(row_scores, k, sorted);
+
+                        // write back the top k values
+                        for (int kk = 0; kk < k; kk++)
+                        {
+                            outptr[(z * k + kk) * w + i] = row_scores[kk].first;
+                            indices[(z * k + kk) * w + i] = row_scores[kk].second;
+                        }
+                    }
+                }
+            }
+        }
+        else if (axis == 3 || axis == -1)
+        {
+            // TopK along the w dimension
+            top_blob_values.create(k, h, d, channels, elemsize, opt.blob_allocator);
+            top_blob_indices.create(k, h, d, channels, sizeof(int), opt.blob_allocator);
+
+            for (int q = 0; q < channels; q++)
+            {
+                for (int z = 0; z < d; z++)
+                {
+                    for (int i = 0; i < h; i++)
+                    {
+                        std::vector<std::pair<float, int> > row_values(w);
+                        // gather data along the width dimension
+                        for (int j = 0; j < w; j++)
+                        {
+                            const float* ptr = bottom_blob.channel(q).row(i * d + z);
+                            row_values[j] = std::make_pair(ptr[j], j);
+                        }
+
+                        do_sort(row_values, k, sorted);
+
+                        // write back results
+                        for (int j = 0; j < k; j++)
+                        {
+                            float* outptr = top_blob_values.channel(q).row(i * d + z);
+                            int* indices = top_blob_indices.channel(q).row<int>(i * d + z);
+                            outptr[j] = row_values[j].first;
+                            indices[j] = row_values[j].second;
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+} // namespace ncnn
\ No newline at end of file
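To make the ordering rule above concrete, here is a small self-contained sketch (plain C++, independent of ncnn) of the comparator semantics used by `do_sort`: ties are broken by ascending index, and `largest` flips the value comparison. The data and `k` here are made up for illustration.

```
#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

int main()
{
    // values with a tie at 3.f -- the smaller index must win
    std::vector<std::pair<float, int> > vec;
    const float data[6] = {1.f, 3.f, 2.f, 3.f, 0.f, 5.f};
    for (int i = 0; i < 6; i++)
        vec.push_back(std::make_pair(data[i], i));

    const bool largest = true;
    const int k = 3;

    // same ordering rule as TopK::CompareFunc
    std::partial_sort(vec.begin(), vec.begin() + k, vec.end(),
                      [largest](const std::pair<float, int>& a, const std::pair<float, int>& b) {
                          if (a.first == b.first)
                              return a.second < b.second;
                          return largest ? (a.first > b.first) : (a.first < b.first);
                      });

    for (int i = 0; i < k; i++)
        printf("value=%f index=%d\n", vec[i].first, vec[i].second);
    // prints 5 (index 5), then 3 (index 1) before 3 (index 3)
    return 0;
}
```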
diff --git a/src/layer/topk.h b/src/layer/topk.h
new file mode 100644
index 00000000000..a75c2959e42
--- /dev/null
+++ b/src/layer/topk.h
@@ -0,0 +1,44 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_TOPK_H
+#define LAYER_TOPK_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+class TopK : public Layer
+{
+public:
+    TopK();
+
+    virtual int load_param(const ParamDict& pd);
+
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
+public:
+    int k;
+    int axis;
+    int largest;
+    int sorted;
+
+private:
+    struct CompareFunc; // forward declaration
+    void do_sort(std::vector<std::pair<float, int> >& vec, int k, bool sorted) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_TOPK_H
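As a sanity check of the interface declared above, a minimal driving sketch follows. It mirrors what the test harness does internally; `ncnn::create_layer("TopK")` only resolves once the layer is registered via the `ncnn_add_layer(TopK)` line added to src/CMakeLists.txt, and the input values here are arbitrary.

```
#include "layer.h"

#include <vector>

// minimal sketch: run TopK(k=2, axis=0, largest=1, sorted=1) on a 1D blob
static int run_topk_once()
{
    ncnn::Layer* op = ncnn::create_layer("TopK");

    ncnn::ParamDict pd;
    pd.set(0, 2); // k
    pd.set(1, 0); // axis
    pd.set(2, 1); // largest
    pd.set(3, 1); // sorted
    op->load_param(pd);

    ncnn::Option opt;
    op->create_pipeline(opt);

    ncnn::Mat in(6);
    for (int i = 0; i < 6; i++)
        in[i] = (float)(i * i % 7);

    std::vector<ncnn::Mat> inputs(1);
    inputs[0] = in;
    std::vector<ncnn::Mat> outputs(2); // values + indices

    int ret = op->forward(inputs, outputs, opt);

    op->destroy_pipeline(opt);
    delete op;
    return ret;
}
```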
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index f55859e736e..254ca9aa7bb 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -159,6 +159,7 @@ ncnn_add_layer_test(Spectrogram)
 ncnn_add_layer_test(Squeeze)
 ncnn_add_layer_test(Swish)
 ncnn_add_layer_test(TanH)
+ncnn_add_layer_test(TopK)
 ncnn_add_layer_test(Tile)
 ncnn_add_layer_test(UnaryOp)
 ncnn_add_layer_test(Unfold)
diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp
new file mode 100644
index 00000000000..aa18baea3a2
--- /dev/null
+++ b/tests/test_topk.cpp
@@ -0,0 +1,80 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "layer.h"
+#include "testutil.h"
+
+static int test_topk(const ncnn::Mat& a, int k, int axis, int largest, int sorted)
+{
+    ncnn::ParamDict pd;
+    pd.set(0, k);       // k
+    pd.set(1, axis);    // axis
+    pd.set(2, largest); // largest
+    pd.set(3, sorted);  // sorted
+
+    std::vector<ncnn::Mat> weights(0);
+
+    std::vector<ncnn::Mat> a0(1);
+    a0[0] = a;
+
+    int ret = test_layer("TopK", pd, weights, a0, 2);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_topk failed a.dims=%d a=(%d %d %d) k=%d axis=%d largest=%d sorted=%d\n", a.dims, a.w, a.h, a.c, k, axis, largest, sorted);
+    }
+
+    return ret;
+}
+
+static int test_topk_0()
+{
+    return 0
+           // || test_topk(RandomMat(3, 2, 6, 7), 1, 0, 1, 1) // axis=0 not implemented yet
+           // || test_topk(RandomMat(3, 4, 2, 5), 2, 1, 0, 1) // axis=1 not implemented yet
+           || test_topk(RandomMat(3, 6, 4, 2), 2, 2, 1, 0)
+           || test_topk(RandomMat(5, 3, 5, 3), 1, 3, 1, 1);
+}
+
+static int test_topk_1()
+{
+    return 0
+           || test_topk(RandomMat(2, 3, 5), 1, 0, 1, 1)
+           || test_topk(RandomMat(4, 2, 5), 1, 1, 0, 1)
+           || test_topk(RandomMat(3, 4, 2), 3, 2, 1, 0);
+}
+
+static int test_topk_2()
+{
+    return 0
+           || test_topk(RandomMat(8, 2), 2, 0, 1, 1)
+           || test_topk(RandomMat(16, 3), 5, 1, 0, 1);
+}
+
+static int test_topk_3()
+{
+    return 0
+           || test_topk(RandomMat(16), 5, 0, 1, 1)
+           || test_topk(RandomMat(32), 10, 0, 0, 1);
+}
+
+int main()
+{
+    SRAND(7767517);
+
+    return 0
+           || test_topk_0()
+           || test_topk_1()
+           || test_topk_2()
+           || test_topk_3();
+}
\ No newline at end of file
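For reference, once the pass below has rewritten a graph, the generated .param file would carry a TopK line along these lines (hand-written illustration, blob names invented, showing k=2 on axis 1 with largest and sorted enabled):

```
TopK             topk                     1 2 in0 out0 out1 0=2 1=1 2=1 3=1
```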
diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt
index b1ac6f5c024..17a9ff245e9 100644
--- a/tools/pnnx/src/CMakeLists.txt
+++ b/tools/pnnx/src/CMakeLists.txt
@@ -590,6 +590,7 @@ set(pnnx_pass_ncnn_SRCS
     pass_ncnn/torch_sum.cpp
     pass_ncnn/torch_stft.cpp
     pass_ncnn/torch_t.cpp
+    pass_ncnn/torch_topk.cpp
     pass_ncnn/torch_transpose.cpp
     pass_ncnn/torch_unsqueeze.cpp
     pass_ncnn/torchaudio_F_inverse_spectrogram.cpp
diff --git a/tools/pnnx/src/pass_ncnn/torch_topk.cpp b/tools/pnnx/src/pass_ncnn/torch_topk.cpp
new file mode 100644
index 00000000000..fb0a0f08b02
--- /dev/null
+++ b/tools/pnnx/src/pass_ncnn/torch_topk.cpp
@@ -0,0 +1,69 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "pass_ncnn.h"
+
+namespace pnnx {
+
+namespace ncnn {
+
+class torch_topk : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+3 2
+pnnx.Input              input       0 1 input
+torch.topk              op_0        1 2 input out indices dim=%dim k=%k largest=%largest sorted=%sorted
+pnnx.Output             output      2 0 out indices
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "TopK";
+    }
+
+    const char* name_str() const
+    {
+        return "topk";
+    }
+
+    void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const
+    {
+        int k = captured_params.at("k").i;
+        int dim = captured_params.at("dim").i;
+        int largest = captured_params.at("largest").b ? 1 : 0;
+        int sorted = captured_params.at("sorted").b ? 1 : 0;
+
+        // set ncnn layer parameters
+        op->params["0"] = k;
+        op->params["1"] = dim;
+        op->params["2"] = largest;
+        op->params["3"] = sorted;
+
+        // note: unfinished cases in the ncnn layer
+        int input_rank = (int)op->inputs[0]->shape.size();
+        if (input_rank == 4 && (dim == 0 || dim == 1))
+        {
+            printf("error: 4D with dim = 0 or 1 is not supported yet\n");
+        }
+    }
+};
+
+REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_topk, 20)
+
+} // namespace ncnn
+
+} // namespace pnnx
\ No newline at end of file
diff --git a/tools/pnnx/tests/ncnn/CMakeLists.txt b/tools/pnnx/tests/ncnn/CMakeLists.txt
index 42c3bed32e0..2e95d2ada2b 100644
--- a/tools/pnnx/tests/ncnn/CMakeLists.txt
+++ b/tools/pnnx/tests/ncnn/CMakeLists.txt
@@ -203,6 +203,7 @@ pnnx_ncnn_add_test(torch_square)
 pnnx_ncnn_add_test(torch_tan)
 pnnx_ncnn_add_test(torch_tanh)
 pnnx_ncnn_add_test(torch_trunc)
+pnnx_ncnn_add_test(torch_topk)
 
 pnnx_ncnn_add_test(convnext_tiny)
 pnnx_ncnn_add_test(mobilenet_v2)
diff --git a/tools/pnnx/tests/ncnn/test_torch_topk.py b/tools/pnnx/tests/ncnn/test_torch_topk.py
new file mode 100644
index 00000000000..8bc3c68a300
--- /dev/null
+++ b/tools/pnnx/tests/ncnn/test_torch_topk.py
@@ -0,0 +1,94 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, d):
+        x0, i0 = torch.topk(x, 4)
+        y1, i1 = torch.topk(y, k=2, dim=0, largest=True)
+        y2, i2 = torch.topk(y, k=2, dim=1, largest=False)
+        # 3D
+        z1, i3 = torch.topk(z, k=2, dim=0)
+        z1, i4 = torch.topk(z, k=3, dim=1)
+        z1, i5 = torch.topk(z, k=1, dim=2)
+        # 4D
+        # d0, i6 = torch.topk(
+        #     d,
+        #     k=2,
+        #     dim=0,
+        # )
+        # d1, i7 = torch.topk(
+        #     d,
+        #     k=2,
+        #     dim=1,
+        # )
+        d2, i8 = torch.topk(
+            d,
+            k=2,
+            dim=2,
+        )
+        d3, i9 = torch.topk(d, k=2, dim=3, sorted=True)
+        # return x0, y1, y2, z1, i3, i4, i5, d0, d1, d2, d3, i6, i7, i8, i9
+        return x0, y1, y2, i0, i1, i2, z1, i3, i4, i5, d2, d3, i8, i9
+
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(36)  # 1D
+    y = torch.rand(4, 7)  # 2D
+    z = torch.rand(3, 4, 5)  # 3D
+    d = torch.rand(4, 2, 6, 7)  # 4D
+
+    a = net(x, y, z, d)
+
+    # export torchscript
+    mod = torch.jit.trace(net, (x, y, z, d))
+    mod.save("test_torch_topk.pt")
+
+    # torchscript to pnnx
+    import os
+
+    os.system(
+        "../../src/pnnx test_torch_topk.pt inputshape=[36],[4,7],[3,4,5],[4,2,6,7]"
+    )
+
+    # pnnx inference
+    import test_torch_topk_ncnn
+
+    b = test_torch_topk_ncnn.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if a0.dtype != torch.float:
+            a0 = a0.to(torch.int32)  # i64 --> i32
+            b0 = b0.view(torch.int32)  # f32 --> i32
+        if not torch.allclose(a0, b0, 1e-3, 1e-3):
+            return False
+    return True
+
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
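One practical note on consuming the results: the indices blob produced by the layer stores raw 32-bit integers even though ncnn blobs carry no dtype tag, which is why the Python test above reinterprets bits with `view(torch.int32)` rather than casting values. In C++ the same reinterpretation is just a pointer conversion; a short sketch, continuing the hypothetical `outputs` from the earlier usage sketch:

```
const ncnn::Mat& values = outputs[0];
const ncnn::Mat& indices = outputs[1];

const int* idx = indices; // Mat::operator T*() exposes the raw int32 storage
for (int i = 0; i < indices.w; i++)
    fprintf(stderr, "top-%d: value=%f index=%d\n", i, values[i], idx[i]);
```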