Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Minor codegen improvements #225

Merged
merged 5 commits into from
May 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 54 additions & 34 deletions ptx/src/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use std::ffi::CStr;
use std::fmt::Display;
use std::io::Write;
use std::ptr::null_mut;
use std::{convert, iter, mem, ptr};
use std::{iter, mem, ptr};
use zluda_llvm::core::*;
use zluda_llvm::prelude::*;
use zluda_llvm::zluda::*;
Expand Down Expand Up @@ -157,7 +157,7 @@ impl NamedIdGenerator {
if let Some(id) = id {
self.register_result(id, func)
} else {
func(b"\0".as_ptr() as _)
func(LLVM_UNNAMED)
}
}

Expand Down Expand Up @@ -505,10 +505,12 @@ fn emit_function_variable(
) -> Result<(), TranslateError> {
let builder = ctx.builder.get();
let llvm_type = get_llvm_type(ctx, &variable.type_)?;
let addr_space = get_llvm_address_space(&ctx.constants, variable.state_space)?;
let value = ctx.names.register_result(variable.name, |name| unsafe {
LLVMZludaBuildAlloca(builder, llvm_type, addr_space, name)
});
let value = emit_alloca(
ctx,
llvm_type,
get_llvm_address_space(&ctx.constants, variable.state_space)?,
Some(variable.name),
);
match variable.initializer {
None => {}
Some(init) => {
Expand All @@ -531,12 +533,27 @@ fn emit_method<'a, 'input>(
let llvm_method = emit_method_declaration(ctx, &method)?;
emit_linkage_for_method(&method, is_kernel, llvm_method);
emit_tuning(ctx, llvm_method, &method.tuning);
for statement in method.body.iter().flat_map(convert::identity) {
let statements = match method.body {
Some(statements) => statements,
None => return Ok(()),
};
// Initial BB that holds all the variable declarations
let bb_with_variables =
unsafe { LLVMAppendBasicBlockInContext(ctx.context.get(), llvm_method, LLVM_UNNAMED) };
// Rest of the code
let starting_bb =
unsafe { LLVMAppendBasicBlockInContext(ctx.context.get(), llvm_method, LLVM_UNNAMED) };
unsafe { LLVMPositionBuilderAtEnd(ctx.builder.get(), starting_bb) };
for statement in statements.iter() {
register_basic_blocks(ctx, llvm_method, statement);
}
for statement in method.body.into_iter().flatten() {
for statement in statements.into_iter() {
emit_statement(ctx, is_kernel, statement)?;
}
// happens if there is a post-ret trailing label
terminate_current_block_if_needed(ctx, None);
unsafe { LLVMPositionBuilderAtEnd(ctx.builder.get(), bb_with_variables) };
unsafe { LLVMBuildBr(ctx.builder.get(), starting_bb) };
Ok(())
}

Expand Down Expand Up @@ -604,7 +621,6 @@ fn emit_statement(
is_kernel: bool,
statement: crate::translate::ExpandedStatement,
) -> Result<(), TranslateError> {
start_synthetic_basic_block_if_needed(ctx, &statement);
Ok(match statement {
crate::translate::Statement::Label(label) => emit_label(ctx, label)?,
crate::translate::Statement::Variable(var) => emit_function_variable(ctx, var)?,
Expand Down Expand Up @@ -749,27 +765,6 @@ fn emit_ret_value(
Ok(())
}

/// Opens a fresh, unnamed basic block when the builder's current block has
/// already been terminated and the upcoming `statement` is not a label.
///
/// Does nothing when the builder has no insertion block, when the current
/// block has no terminator yet, or when `statement` is a `Statement::Label`.
fn start_synthetic_basic_block_if_needed(
ctx: &mut EmitContext,
statement: &crate::translate::ExpandedStatement,
) {
let current_block = unsafe { LLVMGetInsertBlock(ctx.builder.get()) };
// Builder is not positioned in any block: nothing to split.
if current_block == ptr::null_mut() {
return;
}
let terminator = unsafe { LLVMGetBasicBlockTerminator(current_block) };
// Current block is still open (no terminator), so emission can continue in it.
if terminator == ptr::null_mut() {
return;
}
// Labels are skipped here; `emit_label` repositions the builder onto the
// label's own block, so no synthetic block is needed for them.
if let crate::translate::Statement::Label(..) = statement {
return;
}
// Create an unnamed block, place it right after the current insertion
// block and continue emitting at its end.
let new_block =
unsafe { LLVMCreateBasicBlockInContext(ctx.context.get(), b"\0".as_ptr() as _) };
unsafe { LLVMInsertExistingBasicBlockAfterInsertBlock(ctx.builder.get(), new_block) };
unsafe { LLVMPositionBuilderAtEnd(ctx.builder.get(), new_block) };
}

fn emit_ptr_access(
ctx: &mut EmitContext,
ptr_access: &crate::translate::PtrAccess<crate::translate::ExpandedArgParams>,
Expand Down Expand Up @@ -1073,14 +1068,36 @@ fn emit_value_copy(
) -> Result<(), TranslateError> {
let builder = ctx.builder.get();
let type_ = get_llvm_type(ctx, type_)?;
let temp_value = unsafe { LLVMBuildAlloca(builder, type_, LLVM_UNNAMED) };
let temp_value = emit_alloca(ctx, type_, ctx.constants.private_space, None);
unsafe { LLVMBuildStore(builder, src, temp_value) };
ctx.names.register_result(dst, |dst| unsafe {
LLVMBuildLoad2(builder, type_, temp_value, dst)
});
Ok(())
}

// From "Performance Tips for Frontend Authors" (https://llvm.org/docs/Frontend/PerformanceTips.html):
// "The SROA (Scalar Replacement Of Aggregates) and Mem2Reg passes only attempt to eliminate alloca
// instructions that are in the entry basic block. Given SSA is the canonical form expected by much
// of the optimizer; if allocas can not be eliminated by Mem2Reg or SROA, the optimizer is likely to
// be less effective than it could be."
/// Emits an `alloca` of `type_` in address space `addr_space` into the first
/// basic block of the function the builder is currently emitting into, then
/// moves the builder back to the end of the block it was in before the call.
///
/// `name` is forwarded to `register_result_option`; presumably `Some(id)`
/// registers the resulting value under that id and `None` yields an unnamed
/// value — verify against `NamedIdGenerator`.
///
/// Returns the value produced by `LLVMZludaBuildAlloca`.
fn emit_alloca(
ctx: &mut EmitContext,
type_: LLVMTypeRef,
addr_space: u32,
name: Option<Id>,
) -> LLVMValueRef {
let builder = ctx.builder.get();
// Remember where emission was happening so it can be resumed afterwards.
// NOTE(review): assumes the builder is already positioned inside a block of
// a function; a null insert block would be passed to
// `LLVMGetBasicBlockParent` below — confirm all callers guarantee this.
let current_bb = unsafe { LLVMGetInsertBlock(builder) };
// The first basic block of the enclosing function receives the alloca.
let variables_bb = unsafe { LLVMGetFirstBasicBlock(LLVMGetBasicBlockParent(current_bb)) };
// NOTE(review): this appends at the END of the first block, so that block
// must not have a terminator yet when this runs — confirm against the
// function-emission code that terminates it.
unsafe { LLVMPositionBuilderAtEnd(builder, variables_bb) };
let result = ctx.names.register_result_option(name, |name| unsafe {
LLVMZludaBuildAlloca(builder, type_, addr_space, name)
});
// Resume at the end of the block that was current on entry.
unsafe { LLVMPositionBuilderAtEnd(builder, current_bb) };
result
}

fn emit_instruction(
ctx: &mut EmitContext,
is_kernel: bool,
Expand Down Expand Up @@ -3494,12 +3511,12 @@ fn emit_store_var(

fn emit_label(ctx: &mut EmitContext, label: Id) -> Result<(), TranslateError> {
let new_block = unsafe { LLVMValueAsBasicBlock(ctx.names.value(label)?) };
terminate_current_block_if_needed(ctx, new_block);
terminate_current_block_if_needed(ctx, Some(new_block));
unsafe { LLVMPositionBuilderAtEnd(ctx.builder.get(), new_block) };
Ok(())
}

fn terminate_current_block_if_needed(ctx: &mut EmitContext, new_block: LLVMBasicBlockRef) {
fn terminate_current_block_if_needed(ctx: &mut EmitContext, new_block: Option<LLVMBasicBlockRef>) {
let current_block = unsafe { LLVMGetInsertBlock(ctx.builder.get()) };
if current_block == ptr::null_mut() {
return;
Expand All @@ -3508,7 +3525,10 @@ fn terminate_current_block_if_needed(ctx: &mut EmitContext, new_block: LLVMBasic
if terminator != ptr::null_mut() {
return;
}
unsafe { LLVMBuildBr(ctx.builder.get(), new_block) };
match new_block {
Some(new_block) => unsafe { LLVMBuildBr(ctx.builder.get(), new_block) },
None => unsafe { LLVMBuildUnreachable(ctx.builder.get()) },
};
}

fn emit_method_declaration<'input>(
Expand Down
14 changes: 8 additions & 6 deletions ptx/src/test/spirv_run/abs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3
target triple = "amdgcn-amd-amdhsa"

define protected amdgpu_kernel void @abs(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 {
"37":
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
br label %1

1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"27", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"28", align 8
Expand All @@ -19,8 +21,8 @@ define protected amdgpu_kernel void @abs(ptr addrspace(4) byref(i64) %"27", ptr
store i32 %"29", ptr addrspace(5) %"6", align 4
%"14" = load i64, ptr addrspace(5) %"4", align 8
%"31" = inttoptr i64 %"14" to ptr
%"39" = getelementptr inbounds i8, ptr %"31", i64 4
%"32" = load i32, ptr %"39", align 4
%"38" = getelementptr inbounds i8, ptr %"31", i64 4
%"32" = load i32, ptr %"38", align 4
store i32 %"32", ptr addrspace(5) %"7", align 4
%"16" = load i32, ptr addrspace(5) %"6", align 4
%"15" = call i32 @llvm.abs.i32(i32 %"16", i1 false)
Expand All @@ -35,8 +37,8 @@ define protected amdgpu_kernel void @abs(ptr addrspace(4) byref(i64) %"27", ptr
%"21" = load i64, ptr addrspace(5) %"5", align 8
%"22" = load i32, ptr addrspace(5) %"7", align 4
%"35" = inttoptr i64 %"21" to ptr
%"41" = getelementptr inbounds i8, ptr %"35", i64 4
store i32 %"22", ptr %"41", align 4
%"40" = getelementptr inbounds i8, ptr %"35", i64 4
store i32 %"22", ptr %"40", align 4
ret void
}

Expand Down
6 changes: 4 additions & 2 deletions ptx/src/test/spirv_run/activemask.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@ target triple = "amdgcn-amd-amdhsa"
declare i32 @__zluda_ptx_impl__activemask() #0

define protected amdgpu_kernel void @activemask(ptr addrspace(4) byref(i64) %"11", ptr addrspace(4) byref(i64) %"12") #1 {
"15":
%"6" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"6", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i32, align 4, addrspace(5)
br label %1

1: ; preds = %0
store i1 false, ptr addrspace(5) %"6", align 1
%"7" = load i64, ptr addrspace(4) %"12", align 8
store i64 %"7", ptr addrspace(5) %"4", align 8
%"8" = call i32 @__zluda_ptx_impl__activemask()
Expand Down
6 changes: 4 additions & 2 deletions ptx/src/test/spirv_run/add.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3
target triple = "amdgcn-amd-amdhsa"

define protected amdgpu_kernel void @add(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
"22":
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
br label %1

1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"19", align 8
Expand Down
6 changes: 4 additions & 2 deletions ptx/src/test/spirv_run/add_global.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@ target triple = "amdgcn-amd-amdhsa"
@PI = protected addrspace(1) externally_initialized global float 0x400921FB60000000, align 4

define protected amdgpu_kernel void @add_global(ptr addrspace(4) byref(i64) %"20", ptr addrspace(4) byref(i64) %"21") #0 {
"24":
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca float, align 4, addrspace(5)
%"8" = alloca float, align 4, addrspace(5)
br label %1

1: ; preds = %0
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = load i64, ptr addrspace(4) %"20", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"11" = load i64, ptr addrspace(4) %"21", align 8
Expand Down
6 changes: 4 additions & 2 deletions ptx/src/test/spirv_run/add_non_coherent.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3
target triple = "amdgcn-amd-amdhsa"

define protected amdgpu_kernel void @add_non_coherent(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
"22":
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
br label %1

1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"19", align 8
Expand Down
26 changes: 14 additions & 12 deletions ptx/src/test/spirv_run/add_param_ptr.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,32 +2,34 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3
target triple = "amdgcn-amd-amdhsa"

define protected amdgpu_kernel void @add_param_ptr(ptr addrspace(4) byref(i64) %"26", ptr addrspace(4) byref(i64) %"27") #0 {
"38":
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
%1 = alloca i64, align 8, addrspace(5)
%2 = alloca i64, align 8, addrspace(5)
br label %3

3: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"31" = ptrtoint ptr addrspace(4) %"26" to i64
%0 = alloca i64, align 8, addrspace(5)
store i64 %"31", ptr addrspace(5) %0, align 8
%"30" = load i64, ptr addrspace(5) %0, align 8
store i64 %"31", ptr addrspace(5) %1, align 8
%"30" = load i64, ptr addrspace(5) %1, align 8
store i64 %"30", ptr addrspace(5) %"4", align 8
%"33" = ptrtoint ptr addrspace(4) %"27" to i64
%1 = alloca i64, align 8, addrspace(5)
store i64 %"33", ptr addrspace(5) %1, align 8
%"32" = load i64, ptr addrspace(5) %1, align 8
store i64 %"33", ptr addrspace(5) %2, align 8
%"32" = load i64, ptr addrspace(5) %2, align 8
store i64 %"32", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"34" = inttoptr i64 %"12" to ptr addrspace(4)
%"40" = getelementptr inbounds i8, ptr addrspace(4) %"34", i64 0
%"11" = load i64, ptr addrspace(4) %"40", align 8
%"39" = getelementptr inbounds i8, ptr addrspace(4) %"34", i64 0
%"11" = load i64, ptr addrspace(4) %"39", align 8
store i64 %"11", ptr addrspace(5) %"4", align 8
%"14" = load i64, ptr addrspace(5) %"5", align 8
%"35" = inttoptr i64 %"14" to ptr addrspace(4)
%"42" = getelementptr inbounds i8, ptr addrspace(4) %"35", i64 0
%"13" = load i64, ptr addrspace(4) %"42", align 8
%"41" = getelementptr inbounds i8, ptr addrspace(4) %"35", i64 0
%"13" = load i64, ptr addrspace(4) %"41", align 8
store i64 %"13", ptr addrspace(5) %"5", align 8
%"16" = load i64, ptr addrspace(5) %"4", align 8
%"36" = inttoptr i64 %"16" to ptr
Expand Down
6 changes: 4 additions & 2 deletions ptx/src/test/spirv_run/add_tuning.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3
target triple = "amdgcn-amd-amdhsa"

define protected amdgpu_kernel void @add_tuning(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
"22":
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
br label %1

1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"19", align 8
Expand Down
Loading