From 6b9c1b632a544653446926b08b2459a1bb37afd6 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Sun, 10 Apr 2022 14:38:17 +0200 Subject: [PATCH 1/4] Updates for Flux 0.13 mainly: Partial handling of groups in Conv due to deprecation of DepthwiseConv Flux.Diagonal => Flux.Scale --- Project.toml | 4 ++-- src/NaiveNASflux.jl | 2 +- src/constraints.jl | 42 +++++++++++++++++++++++++++++++----------- src/mutable.jl | 24 ++++++++++++++++-------- src/neuronutility.jl | 15 ++++++++++----- src/select.jl | 6 +++--- src/types.jl | 22 ++++++++++++++-------- src/util.jl | 30 ++++++++++++++++++------------ src/vertex.jl | 4 ++-- test/mutable.jl | 18 ++++++++++-------- test/neuronutility.jl | 2 +- test/runtests.jl | 7 ++----- test/util.jl | 8 ++++++++ test/vertex.jl | 18 +++++++++--------- 14 files changed, 127 insertions(+), 75 deletions(-) diff --git a/Project.toml b/Project.toml index f64566f..2e920f8 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "NaiveNASflux" uuid = "85610aed-7d32-5e57-bb50-4c2e1c9e7997" -version = "2.0.4" +version = "2.0.5" [deps] Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" @@ -14,7 +14,7 @@ Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [compat] -Flux = "0.12" +Flux = "0.13" Functors = "0.2" JuMP = "0.19, 0.20, 0.21, 0.22, 0.23, 1" NaiveNASlib = "2" diff --git a/src/NaiveNASflux.jl b/src/NaiveNASflux.jl index f38f1eb..63c4dd7 100644 --- a/src/NaiveNASflux.jl +++ b/src/NaiveNASflux.jl @@ -4,7 +4,7 @@ using Reexport @reexport using NaiveNASlib using NaiveNASlib.Extend, NaiveNASlib.Advanced import Flux -using Flux: Dense, Conv, ConvTranspose, DepthwiseConv, CrossCor, LayerNorm, BatchNorm, InstanceNorm, GroupNorm, +using Flux: Dense, Conv, ConvTranspose, CrossCor, LayerNorm, BatchNorm, InstanceNorm, GroupNorm, MaxPool, MeanPool, Dropout, AlphaDropout, GlobalMaxPool, GlobalMeanPool, cpu import Functors using Functors: @functor diff --git a/src/constraints.jl b/src/constraints.jl index 09e8500..ab161f3 100644 --- a/src/constraints.jl +++ b/src/constraints.jl @@ -86,10 +86,11 @@ function NaiveNASlib.compconstraint!(case, s::DecoratingJuMPΔSizeStrategy, lt:: NaiveNASlib.compconstraint!(case, NaiveNASlib.base(s), lt, data) end # To avoid ambiguity -function NaiveNASlib.compconstraint!(case::NaiveNASlib.ScalarSize, s::DecoratingJuMPΔSizeStrategy, lt::FluxDepthwiseConv, data) +function NaiveNASlib.compconstraint!(case::NaiveNASlib.ScalarSize, s::DecoratingJuMPΔSizeStrategy, lt::FluxConvolutional, data) NaiveNASlib.compconstraint!(case, NaiveNASlib.base(s), lt, data) end -function NaiveNASlib.compconstraint!(::NaiveNASlib.ScalarSize, s::AbstractJuMPΔSizeStrategy, ::FluxDepthwiseConv, data, ms=allowed_multipliers(s)) +function NaiveNASlib.compconstraint!(::NaiveNASlib.ScalarSize, s::AbstractJuMPΔSizeStrategy, ::FluxConvolutional, data, ms=allowed_multipliers(s)) + ngroups(data.vertex) == 1 && return # Add constraint that nout(l) == n * nin(l) where n is integer ins = filter(vin -> vin in keys(data.noutdict), inputs(data.vertex)) @@ -119,14 +120,15 @@ allowed_multipliers(s::DepthwiseConvSimpleΔSizeStrategy) = s.allowed_multiplier allowed_multipliers(::AbstractJuMPΔSizeStrategy) = 1:10 -function NaiveNASlib.compconstraint!(case::NaiveNASlib.NeuronIndices, s::DecoratingJuMPΔSizeStrategy, t::FluxDepthwiseConv, data) +function NaiveNASlib.compconstraint!(case::NaiveNASlib.NeuronIndices, s::DecoratingJuMPΔSizeStrategy, t::FluxConvolutional, data) NaiveNASlib.compconstraint!(case, base(s), t, data) end -function 
NaiveNASlib.compconstraint!(case::NaiveNASlib.NeuronIndices, s::AbstractJuMPΔSizeStrategy, t::FluxDepthwiseConv, data) +function NaiveNASlib.compconstraint!(case::NaiveNASlib.NeuronIndices, s::AbstractJuMPΔSizeStrategy, t::FluxConvolutional, data) + ngroups(data.vertex) == 1 && return # Fallbacks don't matter here since we won't call it from below here, just add default so we don't accidentally crash due to some # strategy which hasn't defined a fallback if 15 < sum(keys(data.outselectvars)) do v - layertype(v) isa FluxDepthwiseConv || return 0 + ngroups(v) == 1 && return 0 return log2(nout(v)) # Very roughly determined... end return NaiveNASlib.compconstraint!(case, DepthwiseConvSimpleΔSizeStrategy(10, s, NaiveNASlib.DefaultJuMPΔSizeStrategy()), t, data) @@ -154,22 +156,29 @@ function NaiveNASlib.compconstraint!(case::NaiveNASlib.NeuronIndices, s::Abstrac =# end -function NaiveNASlib.compconstraint!(::NaiveNASlib.NeuronIndices, s::DepthwiseConvSimpleΔSizeStrategy, t::FluxDepthwiseConv, data) +function NaiveNASlib.compconstraint!(::NaiveNASlib.NeuronIndices, s::DepthwiseConvSimpleΔSizeStrategy, t::FluxConvolutional, data) model = data.model v = data.vertex select = data.outselectvars[v] insert = data.outinsertvars[v] + + ngroups(v) == 1 && return nin(v)[] == 1 && return # Special case, no restrictions as we only need to be an integer multple of 1 - ngroups = div(nout(v), nin(v)[]) + if size(weights(layer(v)), indim(v)) != 1 + @warn "Handling of convolutional layers with groups != nin not implemented. Model might not be size aligned after mutation!" + end + # Neurons mapped to the same weight are interleaved, i.e layer.weight[:,:,1,:] maps to y[1:ngroups:end] where y = layer(x) - for group in 1:ngroups - neurons_in_group = select[group : ngroups : end] + ngrps = div(nout(v), nin(v)[]) + + for group in 1:ngrps + neurons_in_group = select[group : ngrps : end] @constraint(model, neurons_in_group[1] == neurons_in_group[end]) @constraint(model, [i=2:length(neurons_in_group)], neurons_in_group[i] == neurons_in_group[i-1]) - insert_in_group = insert[group : ngroups : end] + insert_in_group = insert[group : ngrps : end] @constraint(model, insert_in_group[1] == insert_in_group[end]) @constraint(model, [i=2:length(insert_in_group)], insert_in_group[i] == insert_in_group[i-1]) end @@ -177,14 +186,18 @@ function NaiveNASlib.compconstraint!(::NaiveNASlib.NeuronIndices, s::DepthwiseCo NaiveNASlib.compconstraint!(NaiveNASlib.ScalarSize(), s, t, data, allowed_multipliers(s)) end -function NaiveNASlib.compconstraint!(case::NaiveNASlib.NeuronIndices, s::DepthwiseConvAllowNinChangeStrategy, t::FluxDepthwiseConv, data) +function NaiveNASlib.compconstraint!(case::NaiveNASlib.NeuronIndices, s::DepthwiseConvAllowNinChangeStrategy, t::FluxConvolutional, data) model = data.model v = data.vertex select = data.outselectvars[v] insert = data.outinsertvars[v] + ngroups(v) == 1 && return nin(v)[] == 1 && return # Special case, no restrictions as we only need to be an integer multple of 1? + # Step 0: + # Flux 0.13 changed the grouping of weigths so that size(layer.weight) = (..., nin / ngroups, nout) + # We can get back the shape expected here through weightgroups = reshape(layer.weight, ..., nout / groups, nin) # Step 1: # Neurons mapped to the same weight are interleaved, i.e layer.weight[:,:,1,:] maps to y[1:ngroups:end] where y = layer(x) # where ngroups = nout / nin. 
For example, nout = 12 and nin = 4 mean size(layer.weight) == (..,3, 4) @@ -199,6 +212,9 @@ function NaiveNASlib.compconstraint!(case::NaiveNASlib.NeuronIndices, s::Depthwi ininsert = data.outinsertvars[ins[]] #ngroups = div(nout(v), nin(v)[]) + if size(weights(layer(v)), indim(v)) != 1 + @warn "Handling of convolutional layers with groups != nin not implemented. Model might not be size aligned after mutation!" + end ningroups = nin(v)[] add_depthwise_constraints(model, inselect, ininsert, select, insert, ningroups, s.allowed_new_outgroups, s.allowed_multipliers) end @@ -213,6 +229,10 @@ function add_depthwise_constraints(model, inselect, ininsert, select, insert, ni # Inserting one new input element at position i will get us noutgroups new consecutive outputputs at position i # Thus nout change by Δ * noutgroups. + # Note: Flux 0.13 changed the grouping of weigths so that size(layer.weight) = (..., nin / ngroups, nout) + # We can get back the shape expected here through weightgroups = reshape(layer.weight, ..., nout / groups, nin) + # All examples below assume the pre-0.13 representation! + # Example: # dc = DepthwiseConv((1,1), 3 => 9; bias=false); diff --git a/src/mutable.jl b/src/mutable.jl index 902a1ae..4789876 100644 --- a/src/mutable.jl +++ b/src/mutable.jl @@ -63,7 +63,9 @@ function mutate(m::MutableLayer; inputs, outputs, other = l -> (), insert=neuron end end -function mutate(lt::FluxParLayer, m::MutableLayer; inputs=1:nin(m)[], outputs=1:nout(m), other= l -> (), insert=neuroninsert) +mutate(lt::FluxParLayer, m::MutableLayer; kwargs...) = _mutate(lt, m; kwargs...) + +function _mutate(lt::FluxParLayer, m::MutableLayer; inputs=1:nin(m)[], outputs=1:nout(m), other= l -> (), insert=neuroninsert) l = layer(m) otherdims = other(l) w = select(weights(l), indim(l) => inputs, outdim(l) => outputs, otherdims...; newfun=insert(lt, WeightParam())) @@ -72,19 +74,25 @@ function mutate(lt::FluxParLayer, m::MutableLayer; inputs=1:nin(m)[], outputs=1: end otherpars(o, l) = () -function mutate(lt::FluxDepthwiseConv{N}, m::MutableLayer; inputs=1:nin(m)[], outputs=1:nout(m), other= l -> (), insert=neuroninsert) where N +function mutate(lt::FluxConvolutional{N}, m::MutableLayer; inputs=1:nin(m)[], outputs=1:nout(m), other= l -> (), insert=neuroninsert) where N + + if ngroups(lt, layer(m)) == 1 + return _mutate(lt, m; inputs, outputs, other, insert) + end + l = layer(m) otherdims = other(l) - ngroups = div(length(outputs), length(inputs)) + # TODO: Handle other cases than ngroups == nin + newingroups = 1 # inputs and outputs are coupled through the constraints (which hopefully were enforced) so we only need to consider outputs currsize =size(weights(l)) wo = select(reshape(weights(l), currsize[1:N]...,:), N+1 => outputs, otherdims...; newfun=insert(lt, WeightParam())) newks = size(wo)[1:N] - w = collect(reshape(wo, newks...,ngroups, :)) + w = collect(reshape(wo, newks...,newingroups, :)) b = select(bias(l), 1 => outputs; newfun=insert(lt, BiasParam())) - newlayer(m, w, b, otherpars(other, l)) + newlayer(m, w, b, (;groups= length(inputs) ÷ newingroups, otherpars(other, l)...)) end function mutate(lt::FluxRecurrent, m::MutableLayer; inputs=1:nin(m)[], outputs=1:nout(m), other=missing, insert=neuroninsert) @@ -131,7 +139,7 @@ function mutate(t::FluxParInvLayer, m::MutableLayer; inputs=missing, outputs=mis ismissing(outputs) || return mutate(t, m, outputs; insert=insert) end -function mutate(lt::FluxDiagonal, m::MutableLayer, inds; insert=neuroninsert) +function mutate(lt::FluxScale, m::MutableLayer, 
inds; insert=neuroninsert) l = layer(m) w = select(weights(l), 1 => inds, newfun=insert(lt, WeightParam())) b = select(bias(l), 1 => inds; newfun=insert(lt, BiasParam())) @@ -139,7 +147,7 @@ function mutate(lt::FluxDiagonal, m::MutableLayer, inds; insert=neuroninsert) end function mutate(::FluxLayerNorm, m::MutableLayer, inds; insert=neuroninsert) - # LayerNorm is only a wrapped Diagonal. Just mutate the Diagonal and make a new LayerNorm of it + # LayerNorm is only a wrapped Scale. Just mutate the Scale and make a new LayerNorm of it proxy = MutableLayer(layer(m).diag) mutate(proxy; inputs=inds, outputs=inds, other=l->(), insert=insert) @@ -197,7 +205,7 @@ newlayer(m::MutableLayer, w, b, other=nothing) = m.layer = newlayer(layertype(m) newlayer(::FluxDense, m::MutableLayer, w, b, other) = Dense(w, b, deepcopy(layer(m).σ)) newlayer(::FluxConvolutional, m::MutableLayer, w, b, other) = setproperties(layer(m), (weight=w, bias=b, σ=deepcopy(layer(m).σ), other...)) -newlayer(::FluxDiagonal, m::MutableLayer, w, b, other) = Flux.Diagonal(w, b) +newlayer(::FluxScale, m::MutableLayer, w, b, other) = Flux.Scale(w, b) """ diff --git a/src/neuronutility.jl b/src/neuronutility.jl index 6dfece2..be94a76 100644 --- a/src/neuronutility.jl +++ b/src/neuronutility.jl @@ -62,7 +62,7 @@ function l2_squeeze(x, dimskeep=1:ndims(x)) dims = filter(i -> i ∉ dimskeep, 1:ndims(x)) return sqrt.(dropdims(sum(x -> x^2, x, dims=dims), dims=Tuple(dims))) end -l2_squeeze(z::Flux.Zeros, args...) = z +l2_squeeze(z::Number, args...) = z """ mean_squeeze(f, x, dimkeep) @@ -90,14 +90,19 @@ neuronutility(l) = neuronutility(layertype(l), l) # Default: mean of abs of weights + bias. Not a very good metric, but should be better than random # Maybe do something about state in recurrent layers as well, but CBA to do it right now neuronutility(::FluxParLayer, l) = l2_squeeze(weights(l), outdim(l)) .+ l2_squeeze(bias(l)) -function neuronutility(::FluxDepthwiseConv, l) - wm = l2_squeeze(weights(l), outdim(l)) +function neuronutility(::FluxConvolutional{N}, l) where N + ngroups(l) == 1 && return l2_squeeze(weights(l), outdim(l)) .+ l2_squeeze(bias(l)) + + kernelsize = size(weights(l))[1:N] + weightgroups = reshape(weights(l), kernelsize..., nout(l) ÷ ngroups(l), nin(l)[]) + + wm = l2_squeeze(weightgroups, indim(l)) bm = l2_squeeze(bias(l)) (length(wm) == 1 || length(wm) == length(bm)) && return wm .+ bm # use this to get insight on whether to repeat inner or outer: - # cc = DepthwiseConv(reshape([1 1 1 1;2 2 2 2], 1, 1, 2, 4), [0,0,0,0,1,1,1,1]) - # cc(fill(10, (1,1,4,1))) + # cc = DepthwiseConv(reshape(Float32[1 1 1 1;2 2 2 2], 1, 1, 4, 2), Float32[0,0,0,0,1,1,1,1]) + # cc(fill(10f0, (1,1,4,1))) return repeat(wm, length(bm) ÷ length(wm)) .+ bm end diff --git a/src/select.jl b/src/select.jl index b1cd1dd..6933ed2 100644 --- a/src/select.jl +++ b/src/select.jl @@ -1,7 +1,7 @@ select(pars::AbstractArray{T,N}, elements_per_dim...; newfun = randoutzeroin) where {T, N} = NaiveNASlib.parselect(pars, elements_per_dim...; newfun) select(::Missing, args...;kwargs...) = missing -select(::Flux.Zeros, args...;kwargs...) = Flux.Zeros() +select(s::Number, args...;kwargs...) = s struct WeightParam end struct BiasParam end @@ -21,8 +21,8 @@ neuroninsert(lt::FluxParNorm, t::Val) = norminsert(lt, t) norminsert(::FluxParNorm, ::Union{Val{:β},Val{:μ}}) = (args...) -> 0 norminsert(::FluxParNorm, ::Union{Val{:γ},Val{:σ²}}) = (args...) -> 1 -# Coupling between input and output weights make it difficult to do anything else? 
-neuroninsert(::FluxDepthwiseConv, partype) = (args...) -> 0 +# Coupling between input and output weights when grouped make it difficult to do anything else? +neuroninsert(lt::FluxConvolutional, partype) = ngroups(lt) == 1 ? randoutzeroin : (args...) -> 0 randoutzeroin(T, d, s...) = _randoutzeroin(T,d,s) _randoutzeroin(T, d, s) = 0 diff --git a/src/types.jl b/src/types.jl index 6bdf539..f83765e 100644 --- a/src/types.jl +++ b/src/types.jl @@ -28,13 +28,19 @@ NaiveNASlib.shapetrait(::Flux.GRUCell) = FluxGru() abstract type FluxConvolutional{N} <: FluxParLayer end struct GenericFluxConvolutional{N} <: FluxConvolutional{N} end -struct FluxConv{N} <: FluxConvolutional{N} end -struct FluxConvTranspose{N} <: FluxConvolutional{N} end -struct FluxDepthwiseConv{N} <: FluxConvolutional{N} end +# Groups here is an eyesore. Its just to not have to tag a breaking version for Flux 0.13 due +# to some functions needing to tell the number of groups from the layertype alone +struct FluxConv{N} <: FluxConvolutional{N} + groups::Int +end +FluxConv{N}() where N = FluxConv{N}(1) +struct FluxConvTranspose{N} <: FluxConvolutional{N} + groups::Int +end +FluxConvTranspose{N}() where N = FluxConvTranspose{N}(1) struct FluxCrossCor{N} <: FluxConvolutional{N} end -NaiveNASlib.shapetrait(::Conv{N}) where N = FluxConv{N}() -NaiveNASlib.shapetrait(::ConvTranspose{N}) where N = FluxConvTranspose{N}() -NaiveNASlib.shapetrait(::DepthwiseConv{N}) where N = FluxDepthwiseConv{N}() +NaiveNASlib.shapetrait(l::Conv{N}) where N = FluxConv{N}(l.groups) +NaiveNASlib.shapetrait(l::ConvTranspose{N}) where N = FluxConvTranspose{N}(l.groups) NaiveNASlib.shapetrait(::CrossCor{N}) where N = FluxCrossCor{N}() @@ -42,14 +48,14 @@ abstract type FluxTransparentLayer <: FluxLayer end # Invariant layers with parameters, i.e nin == nout always and parameter selection must # be performed abstract type FluxParInvLayer <: FluxTransparentLayer end -struct FluxDiagonal <: FluxParInvLayer end +struct FluxScale <: FluxParInvLayer end struct FluxLayerNorm <: FluxParInvLayer end abstract type FluxParNorm <: FluxParInvLayer end struct FluxBatchNorm <: FluxParNorm end struct FluxInstanceNorm <: FluxParNorm end struct FluxGroupNorm <: FluxParNorm end -NaiveNASlib.shapetrait(::Flux.Diagonal) = FluxDiagonal() +NaiveNASlib.shapetrait(::Flux.Scale) = FluxScale() NaiveNASlib.shapetrait(::LayerNorm) = FluxLayerNorm() NaiveNASlib.shapetrait(::BatchNorm) = FluxBatchNorm() NaiveNASlib.shapetrait(::InstanceNorm) = FluxInstanceNorm() diff --git a/src/util.jl b/src/util.jl index e16f236..0809aa9 100644 --- a/src/util.jl +++ b/src/util.jl @@ -2,16 +2,15 @@ NaiveNASlib.nin(t::FluxLayer, l) = throw(ArgumentError("Not implemented for $t") NaiveNASlib.nout(t::FluxLayer, l) = throw(ArgumentError("Not implemented for $t")) NaiveNASlib.nin(::FluxParLayer, l) = [size(weights(l), indim(l))] -NaiveNASlib.nout(::FluxParLayer, l) = size(weights(l), outdim(l)) -NaiveNASlib.nout(::FluxDepthwiseConv, l) = size(weights(l), outdim(l)) * nin(l)[] - +NaiveNASlib.nin(::FluxConvolutional, l) = [size(weights(l), indim(l)) * ngroups(l)] NaiveNASlib.nin(::FluxParInvLayer, l) = [nout(l)] -NaiveNASlib.nout(::FluxDiagonal, l) = length(weights(l)) +NaiveNASlib.nout(::FluxParLayer, l) = size(weights(l), outdim(l)) + +NaiveNASlib.nout(::FluxScale, l) = length(weights(l)) NaiveNASlib.nout(::FluxParInvLayer, l::LayerNorm) = nout(l.diag) NaiveNASlib.nout(::FluxParNorm, l) = l.chs - NaiveNASlib.nout(::FluxRecurrent, l) = div(size(weights(l), outdim(l)), outscale(l)) outscale(l) = outscale(layertype(l)) 
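# -- Illustration only, not part of the diff above or below -------------------------
# A minimal sketch of why the new `NaiveNASlib.nin` method for FluxConvolutional
# multiplies by `ngroups`: in Flux 0.13 a grouped Conv stores its weight as
# (kernel..., nin ÷ groups, nout), so nin can no longer be read straight off the
# weight's input dimension. Assumes Flux 0.13; `c` is just a throwaway example layer.
using Flux

c = Conv((3, 3), 4 => 8; groups=2)
size(c.weight)                    # (3, 3, 2, 8): the input dim holds nin ÷ groups
size(c.weight, 3) * c.groups      # 4 == nin, which is what the new nin method returns
size(c.weight, 4)                 # 8 == nout, unchanged from before
# ------------------------------------------------------------------------------------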
@@ -34,10 +33,10 @@ outdim(::Flux2D) = 1 actdim(::Flux2D) = 1 actrank(::Flux2D) = 1 -indim(::FluxDiagonal) = 1 -outdim(::FluxDiagonal) = 1 -actdim(::FluxDiagonal) = 1 -actrank(::FluxDiagonal) = 1 +indim(::FluxScale) = 1 +outdim(::FluxScale) = 1 +actdim(::FluxScale) = 1 +actrank(::FluxScale) = 1 indim(::FluxRecurrent) = 2 outdim(::FluxRecurrent) = 1 @@ -50,7 +49,7 @@ actdim(::FluxConvolutional{N}) where N = 1+N actrank(::FluxConvolutional{N}) where N = 1+N indim(::Union{FluxConv{N}, FluxCrossCor{N}}) where N = 1+N outdim(::Union{FluxConv{N}, FluxCrossCor{N}}) where N = 2+N -# Note: Absence of bias mean that bias is of type Flux.Zeros which mostly behaves like a normal array, mostly... +# Note: Absence of bias mean that bias is a Bool (false), so beware! weights(l) = weights(layertype(l), l) bias(l) = bias(layertype(l), l) @@ -60,8 +59,8 @@ bias(::FluxDense, l) = l.bias weights(::FluxConvolutional, l) = l.weight bias(::FluxConvolutional, l) = l.bias -weights(::FluxDiagonal, l) = l.α -bias(::FluxDiagonal, l) = l.β +weights(::FluxScale, l) = l.scale +bias(::FluxScale, l) = l.bias weights(lt::FluxRecurrent, l::Flux.Recur) = weights(lt, l.cell) bias(lt::FluxRecurrent, l::Flux.Recur) = bias(lt, l.cell) @@ -80,3 +79,10 @@ hiddenstate(::FluxLstm, cell::Flux.LSTMCell) = [h for h in cell.state0] state(l) = state(layertype(l), l) state(::FluxRecurrent, l) = l.state state(::FluxLstm, l) = [h for h in l.state] + +ngroups(l) = ngroups(layertype(l), l) +ngroups(lt, l) = 1 +ngroups(lt::FluxConvolutional, l) = ngroups(lt) +ngroups(::FluxConvolutional) = 1 +ngroups(lt::FluxConv) = lt.groups +ngroups(lt::FluxConvTranspose) = lt.groups diff --git a/src/vertex.jl b/src/vertex.jl index 7400f6c..380294e 100644 --- a/src/vertex.jl +++ b/src/vertex.jl @@ -103,7 +103,7 @@ layertype(l::LayerTypeWrapper) = l.t Trait for computations for which a change in output size results in a change in input size but which is not fully `SizeTransparent`. -Example of this is DepthWiseConv where output size must be an integer multiple of the input size. +Example of this is grouped convolutions where output size must be an integer multiple of the input size. Does not create any constraints or objectives, only signals that vertices after a `SizeNinNoutConnected` might need to change size if the size of the `SizeNinNoutConnected` vertex changes. @@ -147,7 +147,7 @@ fluxvertex(name::AbstractString, l, in::AbstractVertex; layerfun=LazyMutable, tr fluxvertex(::FluxParLayer, l, in::AbstractVertex, layerfun, traitfun) = absorbvertex(layerfun(MutableLayer(l)), in, traitdecoration = traitfun) -fluxvertex(::FluxDepthwiseConv, l, in::AbstractVertex, layerfun, traitfun) = absorbvertex(layerfun(MutableLayer(l)), in; traitdecoration=traitfun ∘ SizeNinNoutConnected) +fluxvertex(::FluxConvolutional, l, in::AbstractVertex, layerfun, traitfun) = absorbvertex(layerfun(MutableLayer(l)), in; traitdecoration= ngroups(l) == 1 ? 
traitfun : traitfun ∘ SizeNinNoutConnected) fluxvertex(::FluxParInvLayer, l, in::AbstractVertex, layerfun, traitfun) = invariantvertex(layerfun(MutableLayer(l)), in, traitdecoration=traitfun ∘ FixedSizeTrait) diff --git a/test/mutable.jl b/test/mutable.jl index 0ee64b2..abb618c 100644 --- a/test/mutable.jl +++ b/test/mutable.jl @@ -50,8 +50,8 @@ end @testset "No bias" begin - m = MutableLayer(Dense(rand(3,2), Flux.Zeros())) - @test bias(layer(m)) == Flux.Zeros() + m = MutableLayer(Dense(rand(3,2), false)) + @test bias(layer(m)) == false @test nin(m) == [2] @test nout(m) == 3 @@ -59,7 +59,7 @@ inds = [2,3] Wexp = weights(layer(m))[inds, :] NaiveNASlib.Δsize!(m,_nins(m), inds) - assertlayer(layer(m), Wexp, Flux.Zeros()) + assertlayer(layer(m), Wexp, false) end end @testset "Convolutional layers" begin @@ -120,8 +120,8 @@ end @testset "No bias" begin - m = MutableLayer(Conv(Flux.convfilter((2,3), 4=>5), Flux.Zeros())) - @test bias(layer(m)) == Flux.Zeros() + m = MutableLayer(Conv(Flux.convfilter((2,3), 4=>5), false)) + @test bias(layer(m)) == false @test nin(m) == [4] @test nout(m) == 5 @@ -129,7 +129,7 @@ inds = [2,3] Wexp = weights(layer(m))[:,:,:,inds] NaiveNASlib.Δsize!(m, _nins(m), inds) - assertlayer(layer(m), Wexp, Flux.Zeros()) + assertlayer(layer(m), Wexp, false) end end @@ -162,7 +162,8 @@ wins = [1, 3] wouts = [1, 2, 5, 6] outputs = mapreduce(i -> wouts .+ (i-1) .* 6, vcat, wins) - Wexp, bexp = weights(m.layer)[:,:,wouts,wins], bias(m.layer)[outputs] + Wexp = reshape(reshape(weights(m.layer), 2, 2, 6, 3)[:,:,wouts,wins], 2, 2, 1, :) + bexp = bias(m.layer)[outputs] NaiveNASlib.Δsize!(m, [wins], outputs) assertlayer(m.layer, Wexp, bexp) @test size(m(ones(Float32, 3,3,2,2)))[3:4] == (8, 2) @@ -497,7 +498,8 @@ wins = [1, 3] wouts = [1, 2, 5, 6] outs = mapreduce(i -> wouts .+ (i-1) .* 6, vcat, wins) - Wexp, bexp = weights(layer(m))[:,:,wouts,wins], bias(layer(m))[outs] + Wexp = reshape(reshape(weights(layer(m)), 2, 2, 6, 3)[:,:,wouts,wins], 2, 2, 1, :) + bexp = bias(layer(m))[outs] NaiveNASlib.Δsize!(m, [wins], outs) @test size(m(ones(Float32, 3,3,2,2)))[3:4] == (8, 2) diff --git a/test/neuronutility.jl b/test/neuronutility.jl index 1bacc43..d3b5afe 100644 --- a/test/neuronutility.jl +++ b/test/neuronutility.jl @@ -50,7 +50,7 @@ end @testset "Neuron utility Dense default no bias" begin - l = ml(Dense(ones(5, 3), Flux.Zeros())) + l = ml(Dense(ones(5, 3), false)) @test size(neuronutility(l)) == (5,) @test neuronutility(l) ≈ neuronutility_safe(l) end diff --git a/test/runtests.jl b/test/runtests.jl index 13db396..dda280b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,11 +3,8 @@ using NaiveNASlib.Advanced, NaiveNASlib.Extend function assertlayer(l, Wexp, bexp) @test size(Wexp) == size(weights(l)) - if bexp isa Flux.Zeros - @test bias(l) isa Flux.Zeros - else - @test size(bexp) == size(bias(l)) - end + @test size(bexp) == size(bias(l)) + @test Wexp == weights(l) @test bexp == bias(l) end diff --git a/test/util.jl b/test/util.jl index 2d5b80f..c8e4a0f 100644 --- a/test/util.jl +++ b/test/util.jl @@ -72,4 +72,12 @@ @test_throws ArgumentError indim(BogusLayer()) @test_throws ArgumentError outdim(BogusLayer()) end + + @testset "ngroups" begin + import NaiveNASflux: ngroups + + @test ngroups(DepthwiseConv((2,), 3 => 9)) == ngroups(Conv((2,), 3 => 9; groups=3)) == ngroups(ConvTranspose((2,), 3 => 9; groups=3)) == 3 + @test ngroups(Conv((3,3), 10 => 30; groups=5)) == ngroups(ConvTranspose((3,3), 10 => 30; groups=5)) == 5 + @test ngroups(Conv((3,3), 10 => 30; groups=2)) == 
ngroups(ConvTranspose((3,3), 10 => 30; groups=2)) == 2 + end end diff --git a/test/vertex.jl b/test/vertex.jl index ec1db34..3e959b5 100644 --- a/test/vertex.jl +++ b/test/vertex.jl @@ -154,7 +154,7 @@ end # just to check that I have understood the wiring of the weight @testset "4 inputs times 2" begin inpt = inputvertex("in", 4, FluxConv{2}()) - dc = fluxvertex("dc", DepthwiseConv(reshape(Float32[10 10 10 10;20 20 20 20], 1, 1, 2, 4), Float32[0,0,0,0,1,1,1,1]), inpt) + dc = fluxvertex("dc", DepthwiseConv(reshape(Float32[10 10 10 10;20 20 20 20], 1, 1, 4, 2), Float32[0,0,0,0,1,1,1,1]), inpt) @test neuronutility(dc) == [20, 40, 20, 40, 21, 41, 21, 41] @test reshape(dc(fill(1f0, (1,1,4,1))), :) == [10, 20, 10, 20, 11, 21, 11, 21] @test Δnout!( dc => -4) @@ -168,7 +168,7 @@ end @testset "2 inputs times 3" begin inpt = inputvertex("in", 2, FluxConv{2}()) - dc = fluxvertex("dc", DepthwiseConv(reshape(Float32[10 10;20 20;30 30], 1, 1, 3, 2), Float32[0,0,1,1,2,2]), inpt) + dc = fluxvertex("dc", DepthwiseConv(reshape(Float32[10 10;20 20;30 30], 1, 1, 2, 3), Float32[0,0,1,1,2,2]), inpt) @test reshape(dc(fill(1f0, (1,1,2,1))), :) == [10, 20, 31, 11, 22, 32] @test Δnout!(dc => -2) @test lazyouts(dc) == [2,3,5,6] @@ -181,7 +181,7 @@ end @testset "1 input times 5" begin inpt = inputvertex("in", 1, FluxConv{2}()) - dc = fluxvertex("dc", DepthwiseConv(reshape(Float32.(10:10:50), 1, 1, 5, 1), Float32.(1:5)), inpt) + dc = fluxvertex("dc", DepthwiseConv(reshape(Float32.(10:10:50), 1, 1, 1, 5), Float32.(1:5)), inpt) @test reshape(dc(fill(1f0, (1,1,1,1))), :) == [11, 22, 33, 44, 55] @test Δnout!(dc=>-2) @test lazyouts(dc) == 3:5 @@ -194,7 +194,7 @@ end @testset "3 inputs times 7" begin inpt = inputvertex("in", 3, FluxConv{2}()) - dc = fluxvertex("dc", DepthwiseConv(reshape(repeat(Float32.(10:10:70), 3), 1,1,7,3), Float32.(1:21)), inpt) + dc = fluxvertex("dc", DepthwiseConv(reshape(repeat(Float32.(10:10:70), 3), 1,1,3,7), Float32.(1:21)), inpt) @test reshape(dc(fill(10f0, (1,1,3,1))), :) == repeat(100:100:700, 3) .+ (1:21) @test Δnout!(dc => -9) do v v == dc || return 1 @@ -270,9 +270,9 @@ end # Test that we actually succeeded in making a valid model y1 = dc1(ones(Float32, 3,3, nout(inpt), 2)) - @test size(y1, outdim(dc1)) == nout(dc1) + @test size(y1)[end-1] == nout(dc1) y2 = dc2(y1) - @test size(y2, outdim(dc2)) == nout(dc2) + @test size(y2)[end-1] == nout(dc2) end @testset "DepthwiseConv groupsize 3 into groupsize 5" begin @@ -307,11 +307,11 @@ end # Test that we actually succeeded in making a valid model y1 = dc1(ones(Float32,5,5, nout(inpt), 2)) - @test size(y1, outdim(dc1)) == nout(dc1) + @test size(y1)[end-1] == nout(dc1) y2 = dc2(y1) - @test size(y2, outdim(dc2)) == nout(dc2) + @test size(y2)[end-1] == nout(dc2) y3 = dc3(y2) - @test size(y3, outdim(dc3)) == nout(dc3) + @test size(y3)[end-1] == nout(dc3) end @testset "Depthwise conv change input size from Conv" begin From 39de482dae2917819fd9eaa62cbb7e0d34160ffd Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Sun, 10 Apr 2022 15:06:40 +0200 Subject: [PATCH 2/4] Fix doctest errors Rename DepthwiseConv strategies to GroupedConv strategies Flux.Diagonal => Flux.Scale in tests --- src/constraints.jl | 68 ++++++++++++++++++++++------------------------ src/vertex.jl | 6 ++-- test/mutable.jl | 4 +-- test/util.jl | 4 +-- test/vertex.jl | 16 +++++------ 5 files changed, 48 insertions(+), 50 deletions(-) diff --git a/src/constraints.jl b/src/constraints.jl index ab161f3..e3211d8 100644 --- a/src/constraints.jl +++ b/src/constraints.jl @@ -1,74 +1,72 @@ """ - 
DepthwiseConvAllowNinChangeStrategy(newoutputsmax::Integer, multipliersmax::Integer, base, [fallback]) - DepthwiseConvAllowNinChangeStrategy(allowed_new_outgroups::AbstractVector{<:Integer}, allowed_multipliers::AbstractVector{<:Integer}, base, [fallback]) + GroupedConvAllowNinChangeStrategy(newoutputsmax::Integer, multipliersmax::Integer, base, [fallback]) + GroupedConvAllowNinChangeStrategy(allowed_new_outgroups::AbstractVector{<:Integer}, allowed_multipliers::AbstractVector{<:Integer}, base, [fallback]) -`DecoratingJuMPΔSizeStrategy` which allows both nin and nout of `DepthwiseConv` layers to change independently. +`DecoratingJuMPΔSizeStrategy` which allows both nin and nout of grouped `Conv` layers (i.e `Conv` with `groups` != 1) to change independently. -Might cause optimization to take very long time so use with care! Use [`DepthwiseConvSimpleΔSizeStrategy`](@ref) -if `DepthwiseConvAllowNinChangeStrategy` takes too long. +Might cause optimization to take very long time so use with care! Use [`GroupedConvSimpleΔSizeStrategy`](@ref) +if `GroupedConvAllowNinChangeStrategy` takes too long. The elements of `allowed_new_outgroups` determine how many extra elements in the output dimension of the weight -shall be tried for each existing output element. For example, for a `DepthwiseConv((k1,k2), nin=>nout))` there -are `nout / nin` elements in the output dimension. With `allowed_new_outgroups = 0:3` it is allowed to insert -0, 1, 2 or 3 new elements in the output dimension between each already existing element (so with `nout / nin` -elements the maximum increase is `3 * nout / nin`). +shall be tried for each existing output element. For example, for a `Conv((k1,k2), nin=>nout; groups=nin))` one +must insert integer multiples of `nout / nin` elements at the time. With `nin/nout = k` and `allowed_new_outgroups = 0:3` it is allowed to insert 0, `k`, `2k` or `3k` new elements in the output dimension between each already existing element. The elements of `allowed_multipliers` determine the total number of allowed output elements, i.e the allowed ratios of `nout / nin`. If `fallback` is not provided, it will be derived from `base`. """ -struct DepthwiseConvAllowNinChangeStrategy{S,F} <: DecoratingJuMPΔSizeStrategy +struct GroupedConvAllowNinChangeStrategy{S,F} <: DecoratingJuMPΔSizeStrategy allowed_new_outgroups::Vector{Int} allowed_multipliers::Vector{Int} base::S fallback::F end -DepthwiseConvAllowNinChangeStrategy(newoutputsmax::Integer, multipliersmax::Integer,base,fb...) = DepthwiseConvAllowNinChangeStrategy(0:newoutputsmax, 1:multipliersmax, base, fb...) +GroupedConvAllowNinChangeStrategy(newoutputsmax::Integer, multipliersmax::Integer,base,fb...) = GroupedConvAllowNinChangeStrategy(0:newoutputsmax, 1:multipliersmax, base, fb...) 
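# -- Illustration only, not part of the diff -----------------------------------------
# A rough usage sketch of the renamed strategy, mirroring the updated tests later in
# this patch series. Assumes Flux 0.13 and the renames in this commit; `inpt` and `dc`
# are throwaway examples, with FluxConv{2}() used the same way as in the tests.
using NaiveNASflux, Flux
import NaiveNASflux: GroupedConvAllowNinChangeStrategy
import NaiveNASlib: ΔNout

inpt = inputvertex("in", 2, FluxConv{2}())
dc   = fluxvertex("dc", DepthwiseConv((1, 1), 2 => 6), inpt)

# Allow one extra output group per existing one and nout/nin multipliers up to 4,
# while asking for nout(dc) to grow by 2 (so nout goes 6 -> 8 and nin stays 2).
strat = GroupedConvAllowNinChangeStrategy([1], [4], ΔNout(dc => 2))
Δsize!(strat, dc)
# ------------------------------------------------------------------------------------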
-function DepthwiseConvAllowNinChangeStrategy( +function GroupedConvAllowNinChangeStrategy( allowed_new_outgroups::AbstractVector{<:Integer}, allowed_multipliers::AbstractVector{<:Integer}, - base, fb= recurse_fallback(s -> DepthwiseConvAllowNinChangeStrategy(allowed_new_outgroups, allowed_multipliers, s), base)) - return DepthwiseConvAllowNinChangeStrategy(collect(Int, allowed_new_outgroups), collect(Int, allowed_multipliers), base, fb) + base, fb= recurse_fallback(s -> GroupedConvAllowNinChangeStrategy(allowed_new_outgroups, allowed_multipliers, s), base)) + return GroupedConvAllowNinChangeStrategy(collect(Int, allowed_new_outgroups), collect(Int, allowed_multipliers), base, fb) end -NaiveNASlib.base(s::DepthwiseConvAllowNinChangeStrategy) = s.base -NaiveNASlib.fallback(s::DepthwiseConvAllowNinChangeStrategy) = s.fallback +NaiveNASlib.base(s::GroupedConvAllowNinChangeStrategy) = s.base +NaiveNASlib.fallback(s::GroupedConvAllowNinChangeStrategy) = s.fallback -NaiveNASlib.add_participants!(s::DepthwiseConvAllowNinChangeStrategy, vs=AbstractVertex[]) = NaiveNASlib.add_participants!(base(s), vs) +NaiveNASlib.add_participants!(s::GroupedConvAllowNinChangeStrategy, vs=AbstractVertex[]) = NaiveNASlib.add_participants!(base(s), vs) """ - DepthwiseConvSimpleΔSizeStrategy(base, [fallback]) + GroupedConvSimpleΔSizeStrategy(base, [fallback]) -`DecoratingJuMPΔSizeStrategy` which only allows nout of `DepthwiseConv` layers to change. +`DecoratingJuMPΔSizeStrategy` which only allows nout of grouped `Conv` layers (i.e `Conv` with `groups` != 1) to change. -Use if [`DepthwiseConvAllowNinChangeStrategy`](@ref) takes too long to solve. +Use if [`GroupedConvAllowNinChangeStrategy`](@ref) takes too long to solve. The elements of `allowed_multipliers` determine the total number of allowed output elements, i.e the allowed -ratios of `nout / nin`. +ratios of `nout / nin` (where `nin` is fixed). If `fallback` is not provided, it will be derived from `base`. """ -struct DepthwiseConvSimpleΔSizeStrategy{S, F} <: DecoratingJuMPΔSizeStrategy +struct GroupedConvSimpleΔSizeStrategy{S, F} <: DecoratingJuMPΔSizeStrategy allowed_multipliers::Vector{Int} base::S fallback::F end -DepthwiseConvSimpleΔSizeStrategy(maxms::Integer, base, fb...) = DepthwiseConvSimpleΔSizeStrategy(1:maxms, base, fb...) -function DepthwiseConvSimpleΔSizeStrategy(ms::AbstractVector{<:Integer}, base, fb=recurse_fallback(s -> DepthwiseConvSimpleΔSizeStrategy(ms, s), base)) - return DepthwiseConvSimpleΔSizeStrategy(collect(Int, ms), base, fb) +GroupedConvSimpleΔSizeStrategy(maxms::Integer, base, fb...) = GroupedConvSimpleΔSizeStrategy(1:maxms, base, fb...) 
+function GroupedConvSimpleΔSizeStrategy(ms::AbstractVector{<:Integer}, base, fb=recurse_fallback(s -> GroupedConvSimpleΔSizeStrategy(ms, s), base)) + return GroupedConvSimpleΔSizeStrategy(collect(Int, ms), base, fb) end -NaiveNASlib.base(s::DepthwiseConvSimpleΔSizeStrategy) = s.base -NaiveNASlib.fallback(s::DepthwiseConvSimpleΔSizeStrategy) = s.fallback +NaiveNASlib.base(s::GroupedConvSimpleΔSizeStrategy) = s.base +NaiveNASlib.fallback(s::GroupedConvSimpleΔSizeStrategy) = s.fallback -NaiveNASlib.add_participants!(s::DepthwiseConvSimpleΔSizeStrategy, vs=AbstractVertex[]) = NaiveNASlib.add_participants!(base(s), vs) +NaiveNASlib.add_participants!(s::GroupedConvSimpleΔSizeStrategy, vs=AbstractVertex[]) = NaiveNASlib.add_participants!(base(s), vs) recurse_fallback(f, s::AbstractJuMPΔSizeStrategy) = wrap_fallback(f, NaiveNASlib.fallback(s)) @@ -115,8 +113,8 @@ function NaiveNASlib.compconstraint!(::NaiveNASlib.ScalarSize, s::AbstractJuMPΔ end end -allowed_multipliers(s::DepthwiseConvAllowNinChangeStrategy) = s.allowed_multipliers -allowed_multipliers(s::DepthwiseConvSimpleΔSizeStrategy) = s.allowed_multipliers +allowed_multipliers(s::GroupedConvAllowNinChangeStrategy) = s.allowed_multipliers +allowed_multipliers(s::GroupedConvSimpleΔSizeStrategy) = s.allowed_multipliers allowed_multipliers(::AbstractJuMPΔSizeStrategy) = 1:10 @@ -131,10 +129,10 @@ function NaiveNASlib.compconstraint!(case::NaiveNASlib.NeuronIndices, s::Abstrac ngroups(v) == 1 && return 0 return log2(nout(v)) # Very roughly determined... end - return NaiveNASlib.compconstraint!(case, DepthwiseConvSimpleΔSizeStrategy(10, s, NaiveNASlib.DefaultJuMPΔSizeStrategy()), t, data) + return NaiveNASlib.compconstraint!(case, GroupedConvSimpleΔSizeStrategy(10, s, NaiveNASlib.DefaultJuMPΔSizeStrategy()), t, data) end # The number of allowed multipliers can probably be better tuned, perhaps based on current size. - return NaiveNASlib.compconstraint!(case, DepthwiseConvAllowNinChangeStrategy(10, 10, s, NaiveNASlib.DefaultJuMPΔSizeStrategy()), t, data) + return NaiveNASlib.compconstraint!(case, GroupedConvAllowNinChangeStrategy(10, 10, s, NaiveNASlib.DefaultJuMPΔSizeStrategy()), t, data) #= For benchmarking: using NaiveNASflux, Flux, NaiveNASlib.Advanced @@ -156,7 +154,7 @@ function NaiveNASlib.compconstraint!(case::NaiveNASlib.NeuronIndices, s::Abstrac =# end -function NaiveNASlib.compconstraint!(::NaiveNASlib.NeuronIndices, s::DepthwiseConvSimpleΔSizeStrategy, t::FluxConvolutional, data) +function NaiveNASlib.compconstraint!(::NaiveNASlib.NeuronIndices, s::GroupedConvSimpleΔSizeStrategy, t::FluxConvolutional, data) model = data.model v = data.vertex select = data.outselectvars[v] @@ -186,7 +184,7 @@ function NaiveNASlib.compconstraint!(::NaiveNASlib.NeuronIndices, s::DepthwiseCo NaiveNASlib.compconstraint!(NaiveNASlib.ScalarSize(), s, t, data, allowed_multipliers(s)) end -function NaiveNASlib.compconstraint!(case::NaiveNASlib.NeuronIndices, s::DepthwiseConvAllowNinChangeStrategy, t::FluxConvolutional, data) +function NaiveNASlib.compconstraint!(case::NaiveNASlib.NeuronIndices, s::GroupedConvAllowNinChangeStrategy, t::FluxConvolutional, data) model = data.model v = data.vertex select = data.outselectvars[v] @@ -206,7 +204,7 @@ function NaiveNASlib.compconstraint!(case::NaiveNASlib.NeuronIndices, s::Depthwi # ins = filter(vin -> vin in keys(data.noutdict), inputs(v)) # If inputs to v are not part of problem we have to keep nin(v) fixed! 
- isempty(ins) && return NaiveNASlib.compconstraint!(case, DepthwiseConvSimpleΔSizeStrategy(allowed_multipliers(s), base(s)), t, data) + isempty(ins) && return NaiveNASlib.compconstraint!(case, GroupedConvSimpleΔSizeStrategy(allowed_multipliers(s), base(s)), t, data) # TODO: Check if input is immutable and do simple strat then too? inselect = data.outselectvars[ins[]] ininsert = data.outinsertvars[ins[]] diff --git a/src/vertex.jl b/src/vertex.jl index 380294e..4ba39a5 100644 --- a/src/vertex.jl +++ b/src/vertex.jl @@ -204,7 +204,7 @@ Return the computation wrapped inside `v` and inside any mutable wrappers. julia> using NaiveNASflux, Flux julia> layer(fluxvertex(Dense(2,3), inputvertex("in", 2))) -Dense(2, 3) # 9 parameters +Dense(2 => 3) # 9 parameters ``` """ layer(v::AbstractVertex) = layer(base(v)) @@ -235,12 +235,12 @@ This typically means create a new layer with the given values and set the wrappe julia> v = fluxvertex(Dense(3, 4, relu), inputvertex("in", 3)); julia> layer(v) -Dense(3, 4, relu) # 16 parameters +Dense(3 => 4, relu) # 16 parameters julia> NaiveNASflux.setlayer!(v, (;σ=tanh)); julia> layer(v) -Dense(3, 4, tanh) # 16 parameters +Dense(3 => 4, tanh) # 16 parameters ``` """ function setlayer!(x, propval) end diff --git a/test/mutable.jl b/test/mutable.jl index abb618c..5fd2581 100644 --- a/test/mutable.jl +++ b/test/mutable.jl @@ -198,8 +198,8 @@ end end - @testset "Diagonal MutableLayer" begin - m = MutableLayer(Flux.Diagonal(4)) + @testset "Scale MutableLayer" begin + m = MutableLayer(Flux.Scale(4)) @test nin(m) == [nout(m)] == [4] diff --git a/test/util.jl b/test/util.jl index c8e4a0f..adfdfcb 100644 --- a/test/util.jl +++ b/test/util.jl @@ -35,7 +35,7 @@ @test nin(CrossCor((1,2,3), 4=>5)) == [4] @test nout(CrossCor((1,2,3), 4=>5)) == 5 - @test nin(Flux.Diagonal(3)) == [nout(Flux.Diagonal(3))] == [3] + @test nin(Flux.Scale(3)) == [nout(Flux.Scale(3))] == [3] @test nin(LayerNorm(3)) == [nout(LayerNorm(3))] == [3] @test nin(BatchNorm(3)) == [nout(BatchNorm(3))] == [3] @@ -61,7 +61,7 @@ @test actdim(DepthwiseConv((1,2), 3=>6)) == 3 @test actdim(CrossCor((1,2), 3=>6)) == 3 - @test actdim(Flux.Diagonal(1)) == indim(Flux.Diagonal(2)) == outdim(Flux.Diagonal(3)) == 1 + @test actdim(Flux.Scale(1)) == indim(Flux.Scale(2)) == outdim(Flux.Scale(3)) == 1 @test actdim(GenericFluxRecurrent()) == 1 @test actdim(RNN(3,4)) == 1 diff --git a/test/vertex.jl b/test/vertex.jl index 3e959b5..e66af9a 100644 --- a/test/vertex.jl +++ b/test/vertex.jl @@ -212,35 +212,35 @@ end @test reshape(dc(fill(10f0, (1,1,3,1))), :) == [101,303,404,505,0, 0, 108,310,411,512,0 ,0, 115,317,418,519,0, 0] end - @testset "DepthwiseConvAllowNinChangeStrategy" begin - import NaiveNASflux: DepthwiseConvAllowNinChangeStrategy + @testset "GroupedConvAllowNinChangeStrategy" begin + import NaiveNASflux: GroupedConvAllowNinChangeStrategy import NaiveNASlib: ΔNout inpt = inputvertex("in", 2, FluxConv{2}()) dc = fluxvertex("dc", DepthwiseConv((1,1), nout(inpt) => 3*nout(inpt)), inpt) # Get output multiplier == 4 (nout = 4 * nin) by adding one more outgroup (4 = 3 + 1) - okstrat = DepthwiseConvAllowNinChangeStrategy([1], [4], ΔNout(dc => 2)) + okstrat = GroupedConvAllowNinChangeStrategy([1], [4], ΔNout(dc => 2)) @test Δsize!(okstrat, dc) @test nout(dc) == 8 @test nin(dc) == [2] - failstrat = DepthwiseConvAllowNinChangeStrategy([10], [0], ΔNout(dc => 2)) + failstrat = GroupedConvAllowNinChangeStrategy([10], [0], ΔNout(dc => 2)) @test @test_logs (:warn, r"Could not change nout of dc") match_mode=:any Δsize!(failstrat, dc) == 
false end - @testset "DepthwiseConvSimpleΔSizeStrategy" begin - using NaiveNASflux: DepthwiseConvSimpleΔSizeStrategy + @testset "GroupedConvSimpleΔSizeStrategy" begin + using NaiveNASflux: GroupedConvSimpleΔSizeStrategy using NaiveNASlib: ΔNout inpt = inputvertex("in", 2, FluxConv{2}()) dc = fluxvertex("dc", DepthwiseConv((1,1), nout(inpt) => 3*nout(inpt)), inpt) - okstrat = DepthwiseConvSimpleΔSizeStrategy(4, ΔNout(dc => 2)) + okstrat = GroupedConvSimpleΔSizeStrategy(4, ΔNout(dc => 2)) @test Δsize!(okstrat, dc) @test nout(dc) == 8 @test nin(dc) == [2] # We tested complete failure above, so lets make the relaxation work here - failstrat = DepthwiseConvSimpleΔSizeStrategy(5, ΔNout(dc => 3)) + failstrat = GroupedConvSimpleΔSizeStrategy(5, ΔNout(dc => 3)) @test_logs (:warn, r"Could not change nout of dc") Δsize!(failstrat, dc) @test nout(dc) == 10 @test nin(dc) == [2] From 0b85a28b46b316b62cd4220d3b7b19afe1600cd4 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Sun, 10 Apr 2022 15:49:07 +0200 Subject: [PATCH 3/4] Remove spaces in doctest --- src/vertex.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/vertex.jl b/src/vertex.jl index 4ba39a5..2450209 100644 --- a/src/vertex.jl +++ b/src/vertex.jl @@ -204,7 +204,7 @@ Return the computation wrapped inside `v` and inside any mutable wrappers. julia> using NaiveNASflux, Flux julia> layer(fluxvertex(Dense(2,3), inputvertex("in", 2))) -Dense(2 => 3) # 9 parameters +Dense(2 => 3) # 9 parameters ``` """ layer(v::AbstractVertex) = layer(base(v)) @@ -235,12 +235,12 @@ This typically means create a new layer with the given values and set the wrappe julia> v = fluxvertex(Dense(3, 4, relu), inputvertex("in", 3)); julia> layer(v) -Dense(3 => 4, relu) # 16 parameters +Dense(3 => 4, relu) # 16 parameters julia> NaiveNASflux.setlayer!(v, (;σ=tanh)); julia> layer(v) -Dense(3 => 4, tanh) # 16 parameters +Dense(3 => 4, tanh) # 16 parameters ``` """ function setlayer!(x, propval) end From f572a3d8123fcbb53619c43b04beaa843e916ac1 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Sun, 10 Apr 2022 16:37:54 +0200 Subject: [PATCH 4/4] Fix doctest --- src/mutable.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mutable.jl b/src/mutable.jl index 4789876..c00dd2a 100644 --- a/src/mutable.jl +++ b/src/mutable.jl @@ -241,7 +241,7 @@ julia> lazy(ones(Float32, 2, 5)) |> size (3, 5) julia> layer(lazy) -Dense(2, 3, relu) # 9 parameters +Dense(2 => 3, relu) # 9 parameters ``` """ mutable struct LazyMutable <: AbstractMutableComp
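# -- Illustration only, appended after the patch series ------------------------------
# A minimal sketch of the Flux 0.13 behavior these patches adapt to: DepthwiseConv
# is now just Conv with groups == nin, the weight is stored as
# (kernel..., nin ÷ groups, nout), and the pre-0.13 grouping used by the constraint
# and utility code is recovered by reshaping to (kernel..., nout ÷ groups, nin).
# Assumes Flux 0.13; all names are local to the example.
using Flux

dc = DepthwiseConv((1, 1), 3 => 9; bias=false)   # equivalent to Conv((1,1), 3 => 9; groups=3)
size(dc.weight)                                  # (1, 1, 1, 9): (kernel..., nin ÷ groups, nout)
dc.groups                                        # 3
dc.bias                                          # false: an absent bias is a plain Bool now

# Back to the pre-0.13 (kernel..., nout ÷ groups, nin) grouping, as done in
# neuronutility.jl and in the grouped-conv constraints:
weightgroups = reshape(dc.weight, 1, 1, 3, 3)
size(weightgroups)                               # (1, 1, 3, 3)

# The other mechanical rename in these patches: Flux.Diagonal is now Flux.Scale,
# with fields `scale` and `bias`.
s = Flux.Scale(4)
size(s.scale), size(s.bias)                      # ((4,), (4,))
# ------------------------------------------------------------------------------------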