Thanks for using Compiler Explorer
Sponsors
Jakt
C++
Ada
Algol68
Analysis
Android Java
Android Kotlin
Assembly
C
C3
Carbon
C with Coccinelle
C++ with Coccinelle
C++ (Circle)
CIRCT
Clean
Clojure
CMake
CMakeScript
COBOL
C++ for OpenCL
MLIR
Cppx
Cppx-Blue
Cppx-Gold
Cpp2-cppfront
Crystal
C#
CUDA C++
D
Dart
Elixir
Erlang
Fortran
F#
GLSL
Go
Haskell
HLSL
Hook
Hylo
IL
ispc
Java
Julia
Kotlin
LLVM IR
LLVM MIR
Modula-2
Mojo
Nim
Numba
Nix
Objective-C
Objective-C++
OCaml
Odin
OpenCL C
Pascal
Pony
PTX
Python
Racket
Raku
Ruby
Rust
Sail
Snowball
Scala
Slang
Solidity
Spice
SPIR-V
Swift
LLVM TableGen
Toit
Triton
TypeScript Native
V
Vala
Visual Basic
Vyper
WASM
Yul (Solidity IR)
Zig
Javascript
GIMPLE
Ygen
sway
llvm source #1
Output
Compile to binary object
Link to binary
Execute the code
Intel asm syntax
Demangle identifiers
Verbose demangling
Filters
Unused labels
Library functions
Directives
Comments
Horizontal whitespace
Debug intrinsics
Compiler
clang (assertions trunk)
clang (trunk)
clang 10.0.0
clang 10.0.1
clang 11.0.0
clang 11.0.1
clang 12.0.0
clang 12.0.1
clang 13.0.0
clang 14.0.0
clang 15.0.0
clang 16.0.0
clang 17.0.1
clang 18.1.0
clang 19.1.0
clang 20.1.0
clang 21.1.0
clang 4.0.1
clang 5.0.0
clang 6.0.0
clang 7.0.0
clang 8.0.0
clang 9.0.0
hexagon-clang 16.0.5
llc (assertions trunk)
llc (trunk)
llc 10.0.0
llc 10.0.1
llc 11.0.0
llc 11.0.1
llc 12.0.0
llc 12.0.1
llc 13.0.0
llc 14.0.0
llc 15.0.0
llc 16.0.0
llc 17.0.1
llc 18.1.0
llc 19.1.0
llc 20.1.0
llc 21.1.0
llc 3.2
llc 3.3
llc 3.9.1
llc 4.0.0
llc 4.0.1
llc 5.0.0
llc 6.0.0
llc 7.0.0
llc 8.0.0
llc 9.0.0
opt (assertions trunk)
opt (trunk)
opt 10.0.0
opt 10.0.1
opt 11.0.0
opt 11.0.1
opt 12.0.0
opt 12.0.1
opt 13.0.0
opt 14.0.0
opt 15.0.0
opt 16.0.0
opt 17.0.1
opt 18.1.0
opt 19.1.0
opt 20.1.0
opt 21.1.0
opt 3.2
opt 3.3
opt 3.9.1
opt 4.0.0
opt 4.0.1
opt 5.0.0
opt 6.0.0
opt 7.0.0
opt 8.0.0
opt 9.0.0
Options
Source code
; LLVM IR module for the amdgcn-amd-amdhsa target containing two kernels that
; reduce the leading elements of a float buffer into element 0 of that buffer:
;   @kernel_sum                          - alloca/load/store form
;   @kernel_sum_with_spirv_opts_allowed  - SSA (phi-based) form
source_filename = "/app/example.cpp"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
target triple = "amdgcn-amd-amdhsa"

; kernel_sum(partial_sums, Nparticles)
;   %partial_sums.coerce : addrspace(1) pointer to the float buffer.
;   %Nparticles          : i32 used to derive the number of elements summed.
; Every local lives in an addrspace(5) alloca that is accessed through an
; addrspacecast to a generic pointer. The kernel computes
;   num_blocks = (Nparticles + 256 - 1) / 256      (signed division)
; adds partial_sums[0 .. num_blocks) to a float accumulator that starts at 0.0,
; and stores the accumulator to partial_sums[0].
; NOTE(review): the llvm.lifetime.* intrinsics are called without `declare`
; lines and with a single pointer operand - confirm the consuming LLVM version
; accepts that form.
define amdgpu_kernel void @kernel_sum(ptr addrspace(1) noundef %partial_sums.coerce, i32 noundef %Nparticles) #0 {
entry:
  %partial_sums = alloca ptr, align 8, addrspace(5)
  %partial_sums.addr = alloca ptr, align 8, addrspace(5)
  %Nparticles.addr = alloca i32, align 4, addrspace(5)
  %x = alloca i32, align 4, addrspace(5)
  %sum = alloca float, align 4, addrspace(5)
  %num_blocks = alloca i32, align 4, addrspace(5)
  %partial_sums.ascast = addrspacecast ptr addrspace(5) %partial_sums to ptr
  %partial_sums.addr.ascast = addrspacecast ptr addrspace(5) %partial_sums.addr to ptr
  %Nparticles.addr.ascast = addrspacecast ptr addrspace(5) %Nparticles.addr to ptr
  %x.ascast = addrspacecast ptr addrspace(5) %x to ptr
  %sum.ascast = addrspacecast ptr addrspace(5) %sum to ptr
  %num_blocks.ascast = addrspacecast ptr addrspace(5) %num_blocks to ptr
  ; Spill the addrspace(1) argument and reload it as a generic pointer.
  store ptr addrspace(1) %partial_sums.coerce, ptr %partial_sums.ascast, align 8
  %partial_sums1 = load ptr, ptr %partial_sums.ascast, align 8
  store ptr %partial_sums1, ptr %partial_sums.addr.ascast, align 8
  store i32 %Nparticles, ptr %Nparticles.addr.ascast, align 4
  call void @llvm.lifetime.start.p5(ptr addrspace(5) %x)
  call void @llvm.lifetime.start.p5(ptr addrspace(5) %sum)
  store float 0.000000e+00, ptr %sum.ascast, align 4
  call void @llvm.lifetime.start.p5(ptr addrspace(5) %num_blocks)
  ; num_blocks = (Nparticles + 256 - 1) / 256
  %0 = load i32, ptr %Nparticles.addr.ascast, align 4
  %add = add nsw i32 %0, 256
  %sub = sub nsw i32 %add, 1
  %div = sdiv i32 %sub, 256
  store i32 %div, ptr %num_blocks.ascast, align 4
  ; x = 0
  store i32 0, ptr %x.ascast, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  ; Continue while x < num_blocks (signed compare).
  %1 = load i32, ptr %x.ascast, align 4
  %2 = load i32, ptr %num_blocks.ascast, align 4
  %cmp = icmp slt i32 %1, %2
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  ; sum += partial_sums[x]
  %3 = load ptr, ptr %partial_sums.addr.ascast, align 8
  %4 = load i32, ptr %x.ascast, align 4
  %idxprom = sext i32 %4 to i64
  %arrayidx = getelementptr inbounds float, ptr %3, i64 %idxprom
  %5 = load float, ptr %arrayidx, align 4
  %6 = load float, ptr %sum.ascast, align 4
  %add2 = fadd contract float %6, %5
  store float %add2, ptr %sum.ascast, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body
  ; x += 1
  %7 = load i32, ptr %x.ascast, align 4
  %inc = add nsw i32 %7, 1
  store i32 %inc, ptr %x.ascast, align 4
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ; partial_sums[0] = sum
  %8 = load float, ptr %sum.ascast, align 4
  %9 = load ptr, ptr %partial_sums.addr.ascast, align 8
  %arrayidx3 = getelementptr inbounds float, ptr %9, i64 0
  store float %8, ptr %arrayidx3, align 4
  call void @llvm.lifetime.end.p5(ptr addrspace(5) %num_blocks)
  call void @llvm.lifetime.end.p5(ptr addrspace(5) %sum)
  call void @llvm.lifetime.end.p5(ptr addrspace(5) %x)
  ret void
}

; kernel_sum_with_spirv_opts_allowed(partial_sums, Nparticles)
;   Takes the same parameters as @kernel_sum but keeps the accumulator and the
;   induction variable in phi nodes instead of allocas.
;   - When Nparticles <= 0 the loop is bypassed and 0.0 is stored to element 0.
;   - Otherwise the loop runs (Nparticles + 255) >> 8 times, adding one float
;     per iteration, and the total is stored to element 0.
;   The addrspace(1) argument is converted to a generic pointer through a
;   ptrtoint/inttoptr pair rather than an addrspacecast.
define hidden spir_kernel void @kernel_sum_with_spirv_opts_allowed(ptr addrspace(1) noundef %partial_sums.coerce, i32 noundef %Nparticles) #0 {
entry:
  %0 = ptrtoint ptr addrspace(1) %partial_sums.coerce to i64
  %1 = inttoptr i64 %0 to ptr
  ; Guard: skip the loop unless Nparticles > 0.
  %cmp8 = icmp sgt i32 %Nparticles, 0
  br i1 %cmp8, label %for.body.preheader, label %for.end

for.body.preheader:
  ; Trip count = (Nparticles + 255) >> 8, widened to i64.
  %sub = add nuw nsw i32 %Nparticles, 255
  %div1112 = lshr i32 %sub, 8
  %wide.trip.count = zext nneg i32 %div1112 to i64
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  %sum.09 = phi float [ 0.000000e+00, %for.body.preheader ], [ %add2, %for.body ]
  %arrayidx = getelementptr inbounds nuw float, ptr %1, i64 %indvars.iv
  %2 = load float, ptr %arrayidx, align 4
  %add2 = fadd contract float %sum.09, %2
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  ; Exit once the incremented index equals the trip count.
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ; 0.0 when arriving straight from %entry, otherwise the final running sum.
  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2, %for.body ]
  store float %sum.0.lcssa, ptr %1, align 4
  ret void
}

; Attribute group shared by both kernels.
attributes #0 = { mustprogress nofree norecurse nosync nounwind memory(argmem: readwrite) "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }

; NOTE(review): no instruction in this listing carries a !dbg/!tbaa/!llvm.loop
; attachment and no named metadata (e.g. !llvm.module.flags, !llvm.dbg.cu)
; references the nodes below - confirm whether those references were dropped
; when the module was edited.
!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 22.0.0git (https://github.com/llvm/llvm-project.git e665cf397686e881cfb0728d896d2f375c0aead5)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
!1 = !DIFile(filename: "/app/example.cpp", directory: "/app", checksumkind: CSK_MD5, checksum: "ef522d5ba4f4e8751ebcefa37a16d95a")
!2 = !{i32 1, !"amdhsa_code_object_version", i32 600}
!3 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
!4 = !{i32 7, !"Dwarf Version", i32 5}
!5 = !{i32 2, !"Debug Info Version", i32 3}
!6 = !{i32 1, !"wchar_size", i32 4}
!7 = !{i32 8, !"PIC Level", i32 2}
!8 = !{i32 7, !"frame-pointer", i32 2}
!9 = !{i32 0, i32 0}
!10 = !{!"clang version 22.0.0git (https://github.com/llvm/llvm-project.git e665cf397686e881cfb0728d896d2f375c0aead5)"}
!11 = distinct !DISubprogram(name: "kernel_sum", linkageName: "kernel_sum(float*, int)", scope: !12, file: !12, line: 4, type: !13, scopeLine: 5, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !19)
!12 = !DIFile(filename: "example.cpp", directory: "/app", checksumkind: CSK_MD5, checksum: "ef522d5ba4f4e8751ebcefa37a16d95a")
!13 = !DISubroutineType(cc: DW_CC_LLVM_SpirFunction, types: !14)
!14 = !{null, !15, !17}
!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64, dwarfAddressSpace: 4)
!16 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
!17 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !18)
!18 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
!19 = !{!20, !21, !22, !23, !24}
!20 = !DILocalVariable(name: "partial_sums", arg: 1, scope: !11, file: !12, line: 4, type: !15)
!21 = !DILocalVariable(name: "Nparticles", arg: 2, scope: !11, file: !12, line: 4, type: !17)
!22 = !DILocalVariable(name: "x", scope: !11, file: !12, line: 6, type: !18)
!23 = !DILocalVariable(name: "sum", scope: !11, file: !12, line: 7, type: !16)
!24 = !DILocalVariable(name: "num_blocks", scope: !11, file: !12, line: 8, type: !18)
!25 = !{i32 1024, i32 1, i32 1}
!26 = !DILocation(line: 0, scope: !11)
!27 = !DILocation(line: 9, column: 17, scope: !28)
!28 = distinct !DILexicalBlock(scope: !29, file: !12, line: 9, column: 3)
!29 = distinct !DILexicalBlock(scope: !11, file: !12, line: 9, column: 3)
!30 = !DILocation(line: 9, column: 3, scope: !29)
!31 = !DILocation(line: 8, column: 45, scope: !11)
!32 = !DILocation(line: 8, column: 50, scope: !11)
!33 = !DILocation(line: 10, column: 12, scope: !34)
!34 = distinct !DILexicalBlock(scope: !28, file: !12, line: 9, column: 36)
!35 = !{!36, !36, i64 0}
!36 = !{!"float", !37, i64 0}
!37 = !{!"omnipotent char", !38, i64 0}
!38 = !{!"Simple C++ TBAA"}
!39 = !DILocation(line: 10, column: 9, scope: !34)
!40 = !DILocation(line: 9, column: 32, scope: !28)
!41 = distinct !{!41, !30, !42, !43}
!42 = !DILocation(line: 11, column: 3, scope: !29)
!43 = !{!"llvm.loop.mustprogress"}
!44 = !DILocation(line: 12, column: 19, scope: !11)
!45 = !DILocation(line: 13, column: 1, scope: !11)
Become a Patron
Sponsor on GitHub
Donate via PayPal
Compiler Explorer Shop
Source on GitHub
Mailing list
Installed libraries
Wiki
Report an issue
How it works
Contact the author
CE on Mastodon
CE on Bluesky
Statistics
Changelog
Version tree