Stefan found this in vgg_1d, pasted below. I think this is a bug, unless there is something logical I should do when an access is dispatched to multiple duplicates. The issue is on node x14503 accessing lb x14374_lb0:
import spatial.dsl._
import org.virtualized._
object vgg_1d extends SpatialApp {
// override val target = targets.AWS_F1
type T = FixPt[TRUE,_128,_128] // Signed // TODO: If quantized, change this later
// type T = FixPt[TRUE,_16,_16] // Signed // TODO: If quantized, change this later
val par_L0 = 3
val par_L1 = 3
val par_L3_o = 1
val par_L3_i = 1
val par_L5_o = 1
val par_L5_i = 1
@virtualize
def vgg_1d[T:Type:Num](
i0: Array[T],
c0: Array[T],
c1: Array[T],
c2: Array[T],
c3: Array[T],
c4: Array[T],
c5: Array[T],
c6: Array[T],
c7: Array[T]
) : Array[T] = {
val c0_DRAM = DRAM[T](16,3,3,3)
val i0_DRAM = DRAM[T](3,16,16)
val c1_DRAM = DRAM[T](16)
val tmp0_DRAM = DRAM[T](16,16,16)
val c2_DRAM = DRAM[T](16,16,3,3)
val c3_DRAM = DRAM[T](16)
val tmp1_DRAM = DRAM[T](16,8,16)
val c4_DRAM = DRAM[T](1024,1024)
val c5_DRAM = DRAM[T](1024)
val c6_DRAM = DRAM[T](1000,1024)
val c7_DRAM = DRAM[T](1000)
val tmp5_DRAM = DRAM[T](1008)
setMem(c0_DRAM, c0.reshape(16,3,3,3))
setMem(i0_DRAM, i0.reshape(3,16,16))
setMem(c1_DRAM, c1)
setMem(c2_DRAM, c2.reshape(16,16,3,3))
setMem(c3_DRAM, c3)
setMem(c4_DRAM, c4.reshape(1024,1024))
setMem(c5_DRAM, c5)
setMem(c6_DRAM, c6.reshape(1000,1024))
setMem(c7_DRAM, c7)
Accel {
// Sequential.Foreach { // TODO: Make Pipeline/MetaPipeline to pipeline input images for inference, for now since 1 image can ignore this line
// Conv2D
val c1_SRAM = SRAM[T](16)
c1_SRAM load c1_DRAM(0::16)
Sequential.Foreach(16 by 1) { outD_i => // out channels
val nr = 16
val nc = 16
val kr = 3
val kc = 3
val kr_ignore = 1
val kc_ignore = 1 // This should be called half_pad_minus_1
val d = 3
val tmp0_SRAM_conv = SRAM[T](nr, nc)
MemReduce(tmp0_SRAM_conv)(d by 1) { inD_i => // in channels
val lb0 = LineBuffer[T](kr, nc)
val c0_RF = RegFile[T](kr, kc)
c0_RF load c0_DRAM(outD_i, inD_i, 0::kr, 0::kc) // TODO: Can load a burst to SRAM first, since this is probably going to be many small loads
// val sr0 = RegFile[T](kr, kc)
val result = SRAM[T](nr, nc)
Foreach(0 until nr + kr_ignore) { r =>
val row_to_load_from = min(r.to[Int], nr.to[Int]-1)
lb0 load i0_DRAM(inD_i, row_to_load_from, 0::nc)
Foreach(0 until nc) { c =>
// val col_to_load_from = min(c.to[Int], nc.to[Int]-1)
// Foreach(0 until kr par par_L0){i => sr0(i, *) <<= lb0(i, col_to_load_from) }
val row_start = min((kr-1).to[Index], max(0.to[Index], (kr-1-r.to[Index] ).to[Index]) )
val row_end = min((kr ).to[Index], max(1.to[Index], (kr+nr-1-r.to[Index]).to[Index]) )
val col_start = max( 0.to[Index], kc_ignore -c.to[Index]).to[Index]
val col_end = min(kc.to[Index], kc_ignore+nc-c.to[Index]).to[Index]
// Note: Can make hardware above more efficient by calculating statically, e.g. below is for k=5
// val row_start = if (r == 2) (2) else if (r == 3) (1) else (0)
// val row_end = if (r == n + k_ignore - 1) (3) else if (r == n + k_ignore - 2) (4) else (k)
// val col_start = if (c == 2) (2) else if (c == 3) (1) else (0)
// val col_end = if (c == n + k_ignore - 1) (3) else if (c == n + k_ignore - 2) (4) else (k)
/*
val window = Reduce(Reg[T](0.to[T]))(row_start until row_end, col_start until col_end){ (i,j) =>
sr0(i,kc-1-j) * c0_RF(i,j)
}{_+_}
if (r >= kr_ignore && c >= kc_ignore) {
result(r.to[Index]-kr_ignore, c.to[Index]-kc_ignore) = window.value
}
*/
// Could 2x unroll above, but need muxes (since bounds are not const)
// TODO: Inline row_start and row_end etc. later and see if it improves
// /*
val prod00 = mux( (0 < row_start || 0 < col_start), 0.to[T], lb0(0, max(0.to[Index], min(15.to[Index], c.to[Index] - 1))) * c0_RF(0,0) )
val prod01 = mux( (0 < row_start ), 0.to[T], lb0(0, max(0.to[Index], min(15.to[Index], c.to[Index] ))) * c0_RF(0,1) )
val prod02 = mux( (0 < row_start || 3 > col_end ), 0.to[T], lb0(0, max(0.to[Index], min(15.to[Index], c.to[Index] + 1))) * c0_RF(0,2) )
val prod03 = mux( ( 0 < col_start), 0.to[T], lb0(1, max(0.to[Index], min(15.to[Index], c.to[Index] - 1))) * c0_RF(1,0) )
val prod04 = lb0(1, max(0.to[Index], min(15.to[Index], c.to[Index] ))) * c0_RF(1,1)
val prod05 = mux( ( 3 > col_end ), 0.to[T], lb0(1, max(0.to[Index], min(15.to[Index], c.to[Index] + 1))) * c0_RF(1,2) )
val prod06 = mux( (3 > row_end || 0 < col_start), 0.to[T], lb0(2, max(0.to[Index], min(15.to[Index], c.to[Index] - 1))) * c0_RF(2,0) )
val prod07 = mux( (3 > row_end ), 0.to[T], lb0(2, max(0.to[Index], min(15.to[Index], c.to[Index] ))) * c0_RF(2,1) )
val prod08 = mux( (3 > row_end || 3 > col_end ), 0.to[T], lb0(2, max(0.to[Index], min(15.to[Index], c.to[Index] + 1))) * c0_RF(2,2) )
val tree_level_0_00 = prod00 + prod01
val tree_level_0_01 = prod02 + prod03
val tree_level_0_02 = prod04 + prod05
val tree_level_0_03 = prod06 + prod07
val tree_level_0_04 = prod08
val tree_level_1_00 = tree_level_0_00 + tree_level_0_01
val tree_level_1_01 = tree_level_0_02 + tree_level_0_03
val tree_level_1_02 = tree_level_0_04
val tree_level_2_00 = tree_level_1_00 + tree_level_1_01
val tree_level_2_01 = tree_level_1_02
val window = tree_level_2_00 + tree_level_2_01
if (r >= kr_ignore) {
result(r.to[Index]-kr_ignore, c) = window
}
// */
}
}
result
}{_+_} // Reduce across in channels
// Fused BiasAdd
val tmp0_SRAM_bias = SRAM[T](16,16)
Foreach(16 by 1, 16 by 1) { (i,j) =>
tmp0_SRAM_bias(i, j) = max(0.to[T], tmp0_SRAM_conv(i,j) + c1_SRAM(outD_i))
}
tmp0_DRAM(outD_i, 0::16, 0::16) store tmp0_SRAM_bias
}
// Optimization: BiasAdd was merged into Conv2D above
// Optimization: ReLU was merged into Conv2D above
// Conv2D
val c3_SRAM = SRAM[T](16)
c3_SRAM load c3_DRAM(0::16)
Sequential.Foreach(16 by 1) { outD_i => // out channels
val nr = 16
val nc = 16
val kr = 3
val kc = 3
val kr_ignore = 1
val kc_ignore = 1
val d = 16
val tmp1_SRAM_conv = SRAM[T](nr, nc)
MemReduce(tmp1_SRAM_conv)(d by 1) { inD_i => // in channels
val lb1 = LineBuffer[T](kr, nc)
val c2_RF = RegFile[T](kr, kc)
c2_RF load c2_DRAM(outD_i, inD_i, 0::kr, 0::kc) // TODO: Can load a burst to SRAM first, since this is probably going to be many small loads
// val sr1 = RegFile[T](kr, kc)
val result = SRAM[T](nr, nc)
Foreach(0 until nr + kr_ignore) { r =>
val row_to_load_from = min(r.to[Int], nr.to[Int]-1)
lb1 load tmp0_DRAM(inD_i, row_to_load_from, 0::nc)
Foreach(0 until nc) { c =>
// val col_to_load_from = min(c.to[Int], nc.to[Int]-1)
// Foreach(0 until kr par par_L1){i => sr1(i, *) <<= lb1(i, col_to_load_from) }
val row_start = min((kr-1).to[Index], max(0.to[Index], (kr-1-r.to[Index] ).to[Index]) )
val row_end = min((kr ).to[Index], max(1.to[Index], (kr+nr-1-r.to[Index]).to[Index]) )
val col_start = max( 0.to[Index], kc_ignore -c.to[Index]).to[Index]
val col_end = min(kc.to[Index], kc_ignore+nc-c.to[Index]).to[Index]
// Note: Can make hardware above more efficient by calculating statically, e.g. below is for k=5
// val row_start = if (r == 2) (2) else if (r == 3) (1) else (0)
// val row_end = if (r == n + k_ignore - 1) (3) else if (r == n + k_ignore - 2) (4) else (k)
// val col_start = if (c == 2) (2) else if (c == 3) (1) else (0)
// val col_end = if (c == n + k_ignore - 1) (3) else if (c == n + k_ignore - 2) (4) else (k)
/*
val window = Reduce(Reg[T](0.to[T]))(row_start until row_end, col_start until col_end){ (i,j) =>
sr1(i,kc-1-j) * c2_RF(i,j)
}{_+_}
if (r >= kr_ignore && c >= kc_ignore) {
result(r.to[Index]-kr_ignore, c.to[Index]-kc_ignore) = window.value
}
*/
// /*
val prod00 = mux( (0 < row_start || 0 < col_start), 0.to[T], lb1(0, max(0.to[Index], min(15.to[Index], c.to[Index] - 1))) * c2_RF(0,0) )
val prod01 = mux( (0 < row_start ), 0.to[T], lb1(0, max(0.to[Index], min(15.to[Index], c.to[Index] ))) * c2_RF(0,1) )
val prod02 = mux( (0 < row_start || 3 > col_end ), 0.to[T], lb1(0, max(0.to[Index], min(15.to[Index], c.to[Index] + 1))) * c2_RF(0,2) )
val prod03 = mux( ( 0 < col_start), 0.to[T], lb1(1, max(0.to[Index], min(15.to[Index], c.to[Index] - 1))) * c2_RF(1,0) )
val prod04 = lb1(1, max(0.to[Index], min(15.to[Index], c.to[Index] ))) * c2_RF(1,1)
val prod05 = mux( ( 3 > col_end ), 0.to[T], lb1(1, max(0.to[Index], min(15.to[Index], c.to[Index] + 1))) * c2_RF(1,2) )
val prod06 = mux( (3 > row_end || 0 < col_start), 0.to[T], lb1(2, max(0.to[Index], min(15.to[Index], c.to[Index] - 1))) * c2_RF(2,0) )
val prod07 = mux( (3 > row_end ), 0.to[T], lb1(2, max(0.to[Index], min(15.to[Index], c.to[Index] ))) * c2_RF(2,1) )
val prod08 = mux( (3 > row_end || 3 > col_end ), 0.to[T], lb1(2, max(0.to[Index], min(15.to[Index], c.to[Index] + 1))) * c2_RF(2,2) )
val tree_level_0_00 = prod00 + prod01
val tree_level_0_01 = prod02 + prod03
val tree_level_0_02 = prod04 + prod05
val tree_level_0_03 = prod06 + prod07
val tree_level_0_04 = prod08
val tree_level_1_00 = tree_level_0_00 + tree_level_0_01
val tree_level_1_01 = tree_level_0_02 + tree_level_0_03
val tree_level_1_02 = tree_level_0_04
val tree_level_2_00 = tree_level_1_00 + tree_level_1_01
val tree_level_2_01 = tree_level_1_02
val window = tree_level_2_00 + tree_level_2_01
if (r >= kr_ignore) {
result(r.to[Index]-kr_ignore, c) = window
}
// */
}
}
result
}{_+_} // Reduce across in channels
// Fused BiasAdd
val tmp1_SRAM_pool = SRAM[T](8,16)
Foreach(8 by 1, 8 by 1) { (i,j) =>
val out = Reduce(Reg[T](0.to[T]))(2 by 1, 2 by 1) { (ii, jj) =>
max(0.to[T], tmp1_SRAM_conv(i*2 + ii, j*2 + jj) + c3_SRAM(outD_i))
} { (x,y) => max(x,y) }
tmp1_SRAM_pool(i, j) = out.value
}
tmp1_DRAM(outD_i, 0::8, 0::16) store tmp1_SRAM_pool
}
// Optimization: BiasAdd was merged into Conv2D above
// Optimization: ReLU was merged into Conv2D above
// Optimization: MaxPool was merged into Conv2D above
// Reshape
// TODO: Should fuse this with next op
val tmp2_SRAM = SRAM[T](8*8*16)
Foreach(16 by 1) { j =>
Foreach(8 by 1) { i =>
val row = SRAM[T](8)
row load tmp1_DRAM(j, i, 0::8)
Foreach(8 by 1) { k =>
tmp2_SRAM(k*16 + i*8*16 + j) = row(k)
}
}
}
// MatMul
val c5_SRAM = SRAM[T](1024)
c5_SRAM load c5_DRAM(0::1024)
val tmp3_SRAM = SRAM[T](1024)
Foreach(1024 by 1 par par_L3_o){out_i =>
val c4_row_SRAM = SRAM[T](1024)
c4_row_SRAM load c4_DRAM(out_i, 0::1024 par 16)
val prod = Reduce(Reg[T](0.to[T]))(1024 by 1 par 16){ in_i => tmp2_SRAM(in_i) * c4_row_SRAM(in_i) }{_+_}
tmp3_SRAM(out_i) = max(0.to[T], prod.value + c5_SRAM(out_i))
}
// Optimization: BiasAdd was merged into MatMul above
// Optimization: ReLU was merged into MatMul above
// Reshape
// Skipping reshape since tmp4 and tmp3 already 1d
// MatMul
val c7_SRAM = SRAM[T](1000)
c7_SRAM load c7_DRAM(0::1000)
val tmp5_SRAM = SRAM[T](1008)
Foreach(1000 by 1 par par_L5_o){out_i =>
val c6_row_SRAM = SRAM[T](1024)
c6_row_SRAM load c6_DRAM(out_i, 0::1024 par 16)
val prod = Reduce(Reg[T](0.to[T]))(1024 by 1 par 16){ in_i => tmp3_SRAM(in_i) * c6_row_SRAM(in_i) }{_+_}
tmp5_SRAM(out_i) = prod.value + c7_SRAM(out_i)
}
// Optimization: BiasAdd was merged into MatMul above
// Optimization: ReLU was merged into MatMul above
tmp5_DRAM(0::1008) store tmp5_SRAM
// } Sequential over all images
}
getMem(tmp5_DRAM)
}
@virtualize
def main() {
val i0 = loadCSV1D[T]("/home/shadjis/spatial-lang/csv_vgg_1/vgg_1_in_0.csv", "\n")
val c0 = loadCSV1D[T]("/home/shadjis/spatial-lang/csv_vgg_1/c0.csv", "\n") // conv1_1/conv1_1_filters
val c1 = loadCSV1D[T]("/home/shadjis/spatial-lang/csv_vgg_1/c1.csv", "\n") // conv1_1/conv1_1_biases
val c2 = loadCSV1D[T]("/home/shadjis/spatial-lang/csv_vgg_1/c2.csv", "\n") // conv1_2/conv1_2_filters
val c3 = loadCSV1D[T]("/home/shadjis/spatial-lang/csv_vgg_1/c3.csv", "\n") // conv1_2/conv1_2_biases
val c4 = loadCSV1D[T]("/home/shadjis/spatial-lang/csv_vgg_1/c4.csv", "\n") // fc6/fc6_weights
val c5 = loadCSV1D[T]("/home/shadjis/spatial-lang/csv_vgg_1/c5.csv", "\n") // fc6/fc6_biases
val c6 = loadCSV1D[T]("/home/shadjis/spatial-lang/csv_vgg_1/c6.csv", "\n") // fc8/fc8_weights
val c7 = loadCSV1D[T]("/home/shadjis/spatial-lang/csv_vgg_1/c7.csv", "\n") // fc8/fc8_biases
val i0_reshaped = i0.reshape(3,16,16)
val i0_reconstructed = (0::3, 0::16, 0::16){(i,j,k) =>
var x = 0.to[T]
if (i == 0) {
x = i0_reshaped(2,j,k)*255.0.to[T] - 103.939.to[T]
}
if (i == 1) {
x = i0_reshaped(1,j,k)*255.0.to[T] - 116.779.to[T]
}
if (i == 2) {
x = i0_reshaped(0,j,k)*255.0.to[T] - 123.68.to[T]
}
x
};
val i0_reconstructed_linear = Array.tabulate(768){i => i0_reconstructed(i/(16*16),(i%(16*16))/(16),i%16)};
val output = vgg_1d(i0_reconstructed_linear, c0, c1, c2, c3, c4, c5, c6, c7)
val output_no_extra = Array.tabulate(1000){i => output(i)}
printArray(output_no_extra, "output")
val gold = loadCSV1D[T]("/home/shadjis/spatial-lang/csv_vgg_1/vgg_1_check_out_0.csv", "\n")
printArray(gold, "gold")
// val margin = 0.0001.to[T] // Within 0.01% confidence
val margin = 0.000000001.to[T]
val cksum = gold.zip(output_no_extra){(a,b) => abs(a-b) < margin}.reduce{_&&_}
println("PASS: " + cksum)
}
}
Partially fixed on develop. The issue was that the compiler didn't know a LineBuffer is always banked by its rows, so it was trying to duplicate it.
However, this is an obvious case where we should be coalescing reads onto a single banked memory, but instead are creating duplicates (in this case, of the LineBuffer).
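For reference, here is a minimal standalone sketch of the kind of access pattern in question (hypothetical app name and sizes, not the vgg_1d repro): several reads of different rows of the same LineBuffer inside one inner pipe, which should be served by the single row-banked LineBuffer rather than by extra duplicates. I have not confirmed that this small case triggers the dispatch > 1 error on its own.
import spatial.dsl._
import org.virtualized._
object LbDuplicateSketch extends SpatialApp { // hypothetical name, illustration only
  type T = FixPt[TRUE,_16,_16]
  @virtualize
  def main() {
    val in  = DRAM[T](16,16)
    val out = DRAM[T](256)
    setMem(in, Array.tabulate(256){ i => 1.to[T] }.reshape(16,16)) // dummy data; values don't matter here
    Accel {
      val lb  = LineBuffer[T](3, 16) // 3 rows x 16 cols; banked by row
      val res = SRAM[T](256)
      Foreach(0 until 16) { r =>
        lb load in(r, 0::16)
        Foreach(0 until 16) { c =>
          // Three reads of the same LineBuffer in one inner pipe, one per row.
          // Expectation: all three are served by the single row-banked lb,
          // not by the compiler creating multiple duplicates of it.
          res(r*16 + c) = lb(0, c) + lb(1, c) + lb(2, c)
        }
      }
      out(0::256) store res
    }
    printArray(getMem(out), "out")
  }
}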
FYI, on the latest develop this still gives the following error in --synth (not --sim):
[bug] An exception was encountered while compiling:
[bug] This is an example where lb dispatch > 1. Please use as test case! (node x15945 on lb x15813)
java.lang.Exception: This is an example where lb dispatch > 1. Please use as test case! (node x15945 on lb x15813)
at spatial.codegen.chiselgen.ChiselGenUnrolled.emitNode(ChiselGenUnrolled.scala:404)
at spatial.codegen.chiselgen.ChiselGenUnrolled.emitNode$(ChiselGenUnrolled.scala:51)
at spatial.SpatialCompiler$$anon$3.spatial$codegen$chiselgen$ChiselGenVector$$super$emitNode(Spatial.scala:107)
at spatial.codegen.chiselgen.ChiselGenVector.emitNode(ChiselGenVector.scala:56)
at spatial.codegen.chiselgen.ChiselGenVector.emitNode$(ChiselGenVector.scala:33)
at spatial.SpatialCompiler$$anon$3.argon$codegen$chiselgen$ChiselGenArray$$super$emitNode(Spatial.scala:107)
at argon.codegen.chiselgen.ChiselGenArray.emitNode(ChiselGenArray.scala:26)
at argon.codegen.chiselgen.ChiselGenArray.emitNode$(ChiselGenArray.scala:18)
at spatial.SpatialCompiler$$anon$3.spatial$codegen$chiselgen$ChiselGenAlteraVideo$$super$emitNode(Spatial.scala:107)