Description
It seems that for imperfect loop nests, polymer-opt
can end up generating IR that invokes functions representing statements with parameters that may cause out-of-bounds memory accesses. For example, when invoking polymer-opt -reg2mem -extract-scop-stmt -pluto-opt kernel.cgeist.mlir -o kernel.polymer.mlir --allow-unregistered-dialect
on the following IR saved as kernel.mlir
:
func.func @kernel(%arg0: memref<153xi64>, %arg1: memref<153x3x7xi64>, %arg2: memref<7xi64>) {
affine.for %arg3 = 0 to 153 {
%0 = affine.load %arg0[%arg3] : memref<153xi64>
affine.for %arg4 = 0 to 3 {
affine.for %arg5 = 0 to 7 {
%1 = affine.load %arg1[%arg3, %arg4, %arg5] : memref<153x3x7xi64>
%2 = arith.muli %1, %0 : i64
%3 = affine.load %arg2[%arg5] : memref<7xi64>
%4 = arith.subi %3, %2 : i64
affine.store %4, %arg2[%arg5] : memref<7xi64>
}
}
}
return
}
polymer-opt
generates:
map = affine_map<(d0) -> (d0 * 2)>
#map1 = affine_map<(d0) -> (10, d0 * 2 + 3)>
#map2 = affine_map<(d0, d1) -> (d0 * 6, d1 * 3 - 1)>
#map3 = affine_map<(d0, d1) -> (29, d0 * 6 + 7, d1 * 3 + 4)>
#map4 = affine_map<(d0, d1) -> (d0 * 2 + d1 * 10)>
#map5 = affine_map<(d0, d1) -> (d0 * 32 + 32, d0 * 2 + d1 * 10 + 3)>
#map6 = affine_map<(d0) -> (d0 * 32)>
#map7 = affine_map<(d0, d1) -> (d0 * 6 + d1 * 30 + 7)>
#map8 = affine_map<(d0, d1, d2) -> (d0 * -6 - d1 * 30 + d2)>
#map9 = affine_map<(d0, d1) -> (d0 + d1 * 5)>
#map10 = affine_map<(d0, d1, d2) -> (d0 * -2 - d1 * 10 + d2)>
#map11 = affine_map<(d0, d1) -> (d0 * 32, d1 * 96 - 6)>
#map12 = affine_map<(d0, d1) -> (d0 * 96 + 1, d1 * 32 + 32)>
#map13 = affine_map<(d0, d1) -> (d0 * -96 + d1 + 6)>
#map14 = affine_map<(d0) -> (d0 * 16 - 1)>
#map15 = affine_map<(d0, d1) -> ((d0 * 16) ceildiv 3, d1 * 16)>
#map16 = affine_map<(d0, d1, d2) -> (153, (d0 * 16) floordiv 3 + 6, d1 * 32 + 32, d2 * 16 + 16)>
#map17 = affine_map<(d0, d1) -> (d0 * 32 + 32, d1 * 2 + 3)>
#map18 = affine_map<(d0) -> (d0 * 6)>
#map19 = affine_map<(d0, d1) -> (d0 * 32 + 32, d1 * 6 + 7)>
#map20 = affine_map<(d0, d1) -> (d0 * -6 + d1)>
#map21 = affine_map<(d0, d1) -> (d0 * -2 + d1)>
#set = affine_set<(d0, d1) : ((d1 - 1) floordiv 3 - d0 >= 0)>
#set1 = affine_set<(d0, d1, d2) : ((d1 - 1) floordiv 2 - d0 >= 0, d1 - d2 ceildiv 3 >= 0)>
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi32>>, #dlti.dl_entry<i8, dense<8> : vector<2xi32>>, #dlti.dl_entry<i1, dense<8> : vector<2xi32
>>, #dlti.dl_entry<i64, dense<64> : vector<2xi32>>, #dlti.dl_entry<f80, dense<128> : vector<2xi32>>, #dlti.dl_entry<f64, dense<64> : vector<2xi32>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32>
: vector<4xi32>>, #dlti.dl_entry<f128, dense<128> : vector<2xi32>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi32>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi32>>, #dlt
i.dl_entry<i32, dense<32> : vector<2xi32>>, #dlti.dl_entry<i16, dense<16> : vector<2xi32>>, #dlti.dl_entry<f16, dense<16> : vector<2xi32>>, #dlti.dl_entry<"dlti.stack_alignment", 128 : i32>,
#dlti.dl_entry<"dlti.endianness", "little">>, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu", "
polygeist.target-cpu" = "x86-64", "polygeist.target-features" = "+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87", "polygeist.tune-cpu" = "generic"} {
func.func private @S0(%arg0: memref<1xi64>, %arg1: memref<153xi64>, %arg2: index) attributes {scop.stmt} {
%0 = affine.load %arg1[symbol(%arg2)] : memref<153xi64>
affine.store %0, %arg0[0] : memref<1xi64>
return
}
func.func private @S1(%arg0: memref<7xi64>, %arg1: index, %arg2: memref<1xi64>, %arg3: memref<153x3x7xi64>, %arg4: index, %arg5: index) attributes {scop.stmt} {
%0 = affine.load %arg0[symbol(%arg1)] : memref<7xi64>
%1 = affine.load %arg3[symbol(%arg4), symbol(%arg5), symbol(%arg1)] : memref<153x3x7xi64>
%2 = affine.load %arg2[0] : memref<1xi64>
%3 = arith.muli %1, %2 : i64
%4 = arith.subi %0, %3 : i64
affine.store %4, %arg0[symbol(%arg1)] : memref<7xi64>
return
}
func.func @kernel(%arg0: memref<153xi64>, %arg1: memref<153x3x7xi64>, %arg2: memref<7xi64>) attributes {llvm.linkage = #llvm.linkage<external>} {
%c2 = arith.constant 2 : index
%alloca = memref.alloca() {scop.scratchpad} : memref<1xi64>
affine.for %arg3 = 0 to 5 {
affine.for %arg4 = #map(%arg3) to min #map1(%arg3) {
affine.for %arg5 = max #map2(%arg3, %arg4) to min #map3(%arg3, %arg4) {
affine.if #set(%arg4, %arg5) {
affine.for %arg6 = #map4(%arg4, %arg5) to min #map5(%arg4, %arg5) {
affine.for %arg7 = #map6(%arg5) to #map7(%arg4, %arg5) {
%0 = affine.apply #map8(%arg4, %arg5, %arg7)
%1 = affine.apply #map9(%arg4, %arg5)
%2 = affine.apply #map10(%arg4, %arg6, %arg5)
func.call @S1(%arg2, %0, %alloca, %arg1, %1, %2) : (memref<7xi64>, index, memref<1xi64>, memref<153x3x7xi64>, index, index) -> ()
}
}
}
affine.if #set1(%arg3, %arg4, %arg5) {
affine.for %arg6 = max #map11(%arg5, %arg4) to min #map12(%arg4, %arg5) {
%0 = affine.apply #map13(%arg4, %arg6)
%1 = affine.apply #map14(%arg4)
func.call @S1(%arg2, %0, %alloca, %arg1, %1, %c2) : (memref<7xi64>, index, memref<1xi64>, memref<153x3x7xi64>, index, index) -> ()
}
}
affine.for %arg6 = max #map15(%arg5, %arg4) to min #map16(%arg5, %arg3, %arg4) {
func.call @S0(%alloca, %arg0, %arg6) : (memref<1xi64>, memref<153xi64>, index) -> ()
affine.for %arg7 = #map(%arg6) to min #map17(%arg4, %arg6) {
affine.for %arg8 = #map18(%arg6) to min #map19(%arg5, %arg6) {
%0 = affine.apply #map20(%arg6, %arg8)
%1 = affine.apply #map21(%arg7, %arg6)
func.call @S1(%arg2, %0, %alloca, %arg1, %arg6, %1) : (memref<7xi64>, index, memref<1xi64>, memref<153x3x7xi64>, index, index) -> ()
}
}
}
}
}
}
return
}
}
The 448th invocation of S1
by this IR is with %arg1 = 2
, %arg4 = 21
, and %arg5 = 418
, which causes %arg3
to be indexed with a negative value, ultimately resulting in a segmentation fault.
This behavior can be reproduced with the code from this repository with a minimal non-working example:
$ git clone https://github.com/andidr/Polygeist-mnwe
$ cd Polygeist-mnwe
$ make mnwe.polymer
cgeist kernel.c --function=kernel -S -I. --memref-fullrank --raise-scf-to-affine -o kernel.cgeist.mlir
polymer-opt kernel.cgeist.mlir -reg2mem -extract-scop-stmt -pluto-opt -o kernel.polymer.mlir --allow-unregistered-dialect >/dev/null 2>&1
mlir-opt \
-lower-affine \
-convert-scf-to-cf \
-cse \
-canonicalize \
-convert-func-to-llvm="use-bare-ptr-memref-call-conv" \
--finalize-memref-to-llvm \
-canonicalize \
-o kernel.polymer.low.mlir kernel.polymer.mlir
mlir-translate -mlir-to-llvmir -o kernel.polymer.ll kernel.polymer.low.mlir
llc -opaque-pointers -o kernel.polymer.S kernel.polymer.ll
as -o kernel.polymer.o kernel.polymer.S
gcc -O3 -I. main.c kernel.polymer.o -o mnwe.polymer
$ ./mnwe.polymer
Segmentation fault
Compiling the kernel with plain cgeist
and without any polymer optimizations yields the expected results:
$ make mnwe.cgeist
cgeist -c kernel.c --function=kernel -I. --memref-fullrank -o kernel.cgeist.o -O3
gcc -O3 -I. -o mnwe.cgeist main.c kernel.cgeist.o
$ ./mnwe.cgeist
res: 8
which are the same as the results produced by a binary compiled using a plain C compiler:
$ make mnwe.O3
gcc -O3 -I. -c kernel.c -o kernel.O3.o
gcc -O3 -I. main.c kernel.O3.o -o mnwe.O3
$ ./mnwe.O3
res: 8
I experienced similar issues with imperfect loop nests directly with pluto
in the past and this may be related. Could somebody confirm that this is indeed a bug in pluto
? Any suggestions for fixes or workarounds are welcome, too.
Thanks!