14 missing floating point folds

My student Zhengyang (@liuz) has written a superoptimizer for LLVM IR, and lately he has turned it towards codes that make heavy use of floating point. It comes up with plenty of interesting optimizations that LLVM currently misses; below is a small collection of them that seem pretty easy to understand and also pretty clearly desirable. All of these were found when compiling OpenBLAS, but they seem like they would be generically useful to anyone who cares about FP performance.

; Compiler Explorer
define float @src(float %0) {
entry:
%1 = fmul float %0, 0.000000e+00
%2 = fmul float %1, 3.000000e+00
ret float %2
}

define float @tgt(float %0) {
entry:
%1 = fmul float %0, 0.000000e+00
ret float %1
}

;==============================cut==============================
; Compiler Explorer
define <2 x double> @src(double %0, double %1, i1 %2, i1 %3, i1 %4) {
if.end:
%5 = fadd double %1, -1.0
%6 = insertelement <2 x double> poison, double %0, i64 0
%7 = insertelement <2 x double> %6, double %5, i64 1
%8 = fadd <2 x double> %7, <double 4.0, double poison>
ret <2 x double> %8
}

define <2 x double> @tgt(double %0, double %1, i1 %2, i1 %3, i1 %4) {
%6 = insertelement <2 x double> poison, double %0, i64 0
%7 = fadd <2 x double> %6, <double 4.0, double poison>
ret <2 x double> %7
}

;==============================cut==============================
; Compiler Explorer
define i1 @src(float %0, float %1) {
if.then:
%2 = fsub float %0, %1
%3 = fcmp ogt float %2, 0.000000e+00
ret i1 %3
}

define i1 @tgt(float %0, float %1) {
if.then:
%olt = fcmp olt float %1, %0
ret i1 %olt
}

;==============================cut==============================
; Compiler Explorer
define i1 @src(double %0, double %1, double %2, double %3, double %4, double %5, i1 %6, i1 %7, i8 %8) {
entry:
%t1 = fcmp uno double %0, 0.000000e+00
%t2 = fcmp uno double %1, 0.000000e+00
%t3 = select i1 %t1, i1 true, i1 %t2
ret i1 %t3
}

define i1 @tgt(double %0, double %1, double %2, double %3, double %4, double %5, i1 %6, i1 %7, i8 %8) {
entry:
%uno = fcmp uno double %0, %1
ret i1 %uno
}

; ==============================cut==============================
; this one is weird
; Compiler Explorer

declare double @llvm.floor.f64(double)

define i1 @src(double %0, i1 %1) {
%2 = fmul double %0, 7.812500e-03
%3 = call double @llvm.floor.f64(double %2)
%4 = fcmp ogt double %3, 0.000000e+00
ret i1 %4

}

define i1 @tgt(double %0, i1 %1) {
%oge = fcmp oge double %0, 1.280000e+02
ret i1 %oge
}

; ==============================cut==============================
; Compiler Explorer
define i1 @src(float %0) {
if.end192:
%1 = fcmp oge float %0, 0.000000e+00
%2 = fneg float %0
%3 = select i1 %1, float %0, float %2
%4 = fcmp oeq float %3, 0.000000e+00
ret i1 %4
}

define i1 @tgt(float %0) {
if.end192:
%oeq = fcmp oeq float %0, 0.000000e+00
ret i1 %oeq
}

; ==============================cut==============================
; Compiler Explorer
define float @src(i1 %0) {
if.else285:
%t2 = select i1 %0, float -1.000000e+00, float 1.000000e+00
%t3 = fneg float %t2
ret float %t3
}

define float @tgt(i1 %0) {
if.else285:
%sel = select i1 %0, float 1.000000e+00, float -1.000000e+00
ret float %sel
}

; ==============================cut==============================
; Compiler Explorer
define i1 @src(float %0, float %1, i1 %2, i1 %3, i1 %4) {
%t5 = fcmp ult float %0, 0.000000e+00
%t6 = select i1 %t5, float -1.000000e+00, float 1.000000e+00
%t7 = fcmp ult float %1, 0.000000e+00
%t8 = select i1 %t7, float -1.000000e+00, float 1.000000e+00
%t9 = fcmp une float %t6, %t8
ret i1 %t9
}

define i1 @tgt(float %0, float %1, i1 %2, i1 %3, i1 %4) {
%t5 = fcmp ult float %0, 0.000000e+00
%t6 = fcmp ult float %1, 0.000000e+00
%xor = xor i1 %t5, %t6
ret i1 %xor
}

; ==============================cut==============================
; Compiler Explorer

define float @src(float %0, float %1, i1 %c1, i1 %c2) {
if.else:
%4 = fneg float %1
%5 = select i1 %c1, float %1, float %4
%6 = select i1 %c1, float %4, float %1
%7 = select i1 %c2, float %6, float %5
%8 = fneg float %7
ret float %8

sink: ; No predecessors!
unreachable
}

define float @tgt(float %0, float %1, i1 %c1, i1 %c2) {
if.else:
%4 = fneg float %1
%5 = select i1 %c1, float %1, float %4
%6 = select i1 %c1, float %4, float %1
%sel = select i1 %c2, float %5, float %6
ret float %sel

sink: ; No predecessors!
unreachable
}

; ==============================cut==============================
; Compiler Explorer
define i1 @src(float %0) {
if.end35:
%1 = fmul float %0, 2.000000e+00
%2 = fcmp oge float %1, 0.000000e+00
ret i1 %2
}

define i1 @tgt(float %0) {
if.end35:
%oge = fcmp oge float %0, 0.000000e+00
ret i1 %oge
}

; ==============================cut==============================
; Compiler Explorer
define i1 @src(double %0) {
if.end155:
%1 = fptrunc double %0 to float
%2 = fcmp oge float %1, 1.000000e+02
ret i1 %2
}
define i1 @tgt(double %0) {
if.end155:
%oge = fcmp oge double %0, 0x4058FFFFF0000000
ret i1 %oge
}

; ==============================cut==============================
; Compiler Explorer
define i1 @src(float %0) {
%t1 = fmul float %0, 0x3FF0CCCCC0000000
%t2 = fcmp olt float %t1, 0x3FE20418A0000000
ret i1 %t2
}

define i1 @tgt(float %0) {
%ole = fcmp ole float %0, 0x3FE12878E0000000
ret i1 %ole
}

;==============================cut==============================
; https://alive2.llvm.org/ce/z/bFW9vZ

define i64 @src(float %0) {
if.end27:
%1 = fptosi float %0 to i32
%2 = sext i32 %1 to i64
ret i64 %2

sink: ; No predecessors!
unreachable
}

define i64 @tgt(float %0) {
if.end27:
%1 = fptosi float %0 to i64
ret i64 %1

sink: ; No predecessors!
unreachable
}

;==============================cut==============================
; Compiler Explorer
define i32 @src(float %0) {
if.then83:
%1 = tail call float @llvm.fabs.f32(float %0)
%2 = bitcast float %0 to i32
%3 = bitcast float %1 to i32
%4 = xor i32 %3, %2
ret i32 %4
}

declare float @llvm.fabs.f32(float)

define i32 @tgt(float %0) {
if.then83:
%1 = bitcast float %0 to i32
%and = and i32 %1, -2147483648
ret i32 %and
}


Nice!

I’m confused by this one (the fptosi + sext fold):

What are the unreachables for? And for the optimization itself, I would expect it to go wrong if %0 is a value like 2^40 (~ 1.1e12) that fits exactly in i64 but not in i32.

This one is probably covered by the existing issue “Reassociatable fmul by constants unnecessarily requires nsz” (llvm/llvm-project #64967).

I also recently noticed that there’s probably room in LLVM to simplify min/max chains with redundancies better, e.g. min(max(min(x, 10.0), 0.0), 8.0) which at least mathematically can be simplified to min(max(x, 0.0), 8.0).

(For a real implementation on maxnum/minnum/maximum/minimum, there are subtleties around NaNs that I would have to page back in from my notes, but there are cases we currently miss.)
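To make the pattern concrete, here is a rough, hypothetical sketch of that chain in IR using the minnum/maxnum intrinsics (function names and constants are mine, just for illustration). It is not a verified fold: as noted, NaN handling gets in the way, since for a NaN input @src below returns 8.0 while @tgt returns 0.0 (minnum/maxnum return their non-NaN operand), so a straightforward rewrite would need extra conditions or fast-math flags.

declare float @llvm.minnum.f32(float, float)
declare float @llvm.maxnum.f32(float, float)

; min(max(min(x, 10.0), 0.0), 8.0)
define float @src(float %x) {
  %inner = call float @llvm.minnum.f32(float %x, float 1.000000e+01)
  %clamp = call float @llvm.maxnum.f32(float %inner, float 0.000000e+00)
  %outer = call float @llvm.minnum.f32(float %clamp, float 8.000000e+00)
  ret float %outer
}

; min(max(x, 0.0), 8.0): equal for non-NaN x, but not for a NaN input
define float @tgt(float %x) {
  %clamp = call float @llvm.maxnum.f32(float %x, float 0.000000e+00)
  %outer = call float @llvm.minnum.f32(float %clamp, float 8.000000e+00)
  ret float %outer
}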

I believe the unreachables are an artifact of Zhengyang’s extraction strategy; it would have been better if he’d cleaned them up.

Regarding the optimization… yeah, it doesn’t look right. Minotaur uses Alive2 as a verification backend (and I just checked that Alive verifies this optimization), but Alive and Z3 have bugs too. We’ll dig into this and figure out what’s up.

What are the unreachables for? And for the optimization itself, I would expect it to go wrong if %0 is a value like 2^40 (~ 1.1e12) that fits exactly in i64 but not in i32.

fptosi says that any value out of range of the target type is a poison value, so the fact that the result changes for 0x1p40 isn’t an issue.
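To make that concrete, here is a hypothetical snippet (not from the original post) with %0 fixed to 2^40:

define i64 @example() {
  ; 2^40 is out of range for i32, so this fptosi yields poison
  %1 = fptosi float 0x4270000000000000 to i32
  ; sext of poison is poison, so any i64 result is a valid refinement,
  ; including the one produced by the direct fptosi-to-i64 version
  %2 = sext i32 %1 to i64
  ret i64 %2
}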

I was just coming back here to say this, thanks Joshua. It’s always poison/undef. So yes, this optimization is correct.

Here’s one that reduces the cost from 9 uops to 6 on a modern x86-64.

https://alive2.llvm.org/ce/z/WYxQ7W

declare float @llvm.fabs.f32(float)

define i1 @src(float %0) {
  %2 = tail call float @llvm.fabs.f32(float %0)
  %3 = fpext float %2 to double
  %4 = fcmp ole double %3, 0x3E7000000102F4FD
  ret i1 %4
}

define i1 @tgt(float %0) {
  %2 = tail call float @llvm.fabs.f32(float %0)
  %3 = bitcast float %2 to i32
  %4 = icmp ult i32 %3, 864026625
  ret i1 %4
}

Good point. (I’m still sceptical that this fold is actually beneficial: the sext is likely a single fast integer instruction, whereas converting float to i64 instead of i32 could be significantly more expensive.)

Could someone please raise a github issue for each of these folds? It would be nice to have a place to discuss and/or implement each of them individually.


Each of these is validated by compiling to asm and then using llvm-mca to check whether the optimized version yields fewer uops on a couple of x86-64 targets. I think @liuz is using one reasonably modern AMD and one reasonably modern Intel to do this. It would probably be a good idea to check an AArch64 implementation and, if the llvm-mca support is good enough, also a RISC-V one. Zhengyang, are your scripts set up to do that?

Zhengyang, can you create issues for these and then add a comment here linking to all of them?


llvm-mca can definitely have some issues in certain cases, especially on newer microarchitectures where the scheduling models might just be copies of old ones (or not completely updated). @RKSimon will know what microarchitectures have good scheduling models/which ones don’t. Validating against other simulators like uiCA would probably be good as well.

We have infrastructure for microbenchmarking assembly snippets in llvm-exegesis too (including arbitrary memory accesses). llvm-exegesis benchmarks of these assembly snippets would probably be the gold standard in terms of timing data. I can probably help with that if issues crop up.

Yeah… we’re using llvm-mca because it’s what we have, not because it’s perfect. I’d love to hear views about where we can mostly trust its output.

Realistically, most of the folds above seem obviously a good idea, but for sure when there’s any question about whether something is profitable we should measure.

llvm-mca is fine for straight-line code - it’s the scheduler models that let it down :wink:

I’d stay away from the Skylake, IceLake and Zen4 models - the Jaguar, Silvermont, SandyBridge and Haswell models are good, as is the more recent AlderLake model (as long as you’re running on big cores!).

Also, PLEASE raise issues for problems you encounter with any x86 scheduler models!
