
Any advice on helping LLVM eliminate address calculations in a loop?

I’m using LLVM through Julia as a front end.
I’m having problems with suboptimal asm generation:

L1536:
        prefetcht0      -256(%rax,%r10)
        leaq    (%r12,%r14,8), %rcx
        leaq    (%rcx,%r15), %r9
        leaq    (%r9,%r15), %rdx
        vbroadcastsd    (%r12,%r14,8), %zmm28
        vmovups -256(%rax), %zmm30
        vbroadcastsd    (%rcx,%r11,8), %zmm29
        vbroadcastsd    (%r9,%r11,8), %zmm26
        vbroadcastsd    (%rdx,%r11,8), %zmm25
        addq    %r15, %rdx
        vbroadcastsd    (%rdx,%r11,8), %zmm27
        movq    %rbx, 160(%rsp)
        movq    %rsi, 152(%rsp)
        movq    %rdi, 144(%rsp)
        vfmadd231pd     %zmm30, %zmm28, %zmm24 # zmm24 = (zmm28 * zmm30) + zmm24
        vfmadd231pd     %zmm30, %zmm29, %zmm23 # zmm23 = (zmm29 * zmm30) + zmm23
        vfmadd231pd     %zmm30, %zmm26, %zmm22 # zmm22 = (zmm26 * zmm30) + zmm22
        vfmadd231pd     %zmm30, %zmm25, %zmm21 # zmm21 = (zmm25 * zmm30) + zmm21
        vfmadd231pd     %zmm30, %zmm27, %zmm20 # zmm20 = (zmm27 * zmm30) + zmm20
        prefetcht0      -192(%rax,%r10)
        vmovups -192(%rax), %zmm30
        vfmadd231pd     %zmm30, %zmm28, %zmm19 # zmm19 = (zmm28 * zmm30) + zmm19
        vfmadd231pd     %zmm30, %zmm29, %zmm18 # zmm18 = (zmm29 * zmm30) + zmm18
        vfmadd231pd     %zmm30, %zmm26, %zmm17 # zmm17 = (zmm26 * zmm30) + zmm17
        vfmadd231pd     %zmm30, %zmm25, %zmm16 # zmm16 = (zmm25 * zmm30) + zmm16
        vfmadd231pd     %zmm30, %zmm27, %zmm15 # zmm15 = (zmm27 * zmm30) + zmm15
        prefetcht0      -128(%rax,%r10)
        vmovups -128(%rax), %zmm30
        vfmadd231pd     %zmm30, %zmm28, %zmm14 # zmm14 = (zmm28 * zmm30) + zmm14
        vfmadd231pd     %zmm30, %zmm29, %zmm13 # zmm13 = (zmm29 * zmm30) + zmm13
        vfmadd231pd     %zmm30, %zmm26, %zmm12 # zmm12 = (zmm26 * zmm30) + zmm12
        vfmadd231pd     %zmm30, %zmm25, %zmm11 # zmm11 = (zmm25 * zmm30) + zmm11
        vfmadd231pd     %zmm30, %zmm27, %zmm10 # zmm10 = (zmm27 * zmm30) + zmm10
        prefetcht0      -64(%rax,%r10)
        vmovups -64(%rax), %zmm30
        vfmadd231pd     %zmm30, %zmm28, %zmm9 # zmm9 = (zmm28 * zmm30) + zmm9
        vfmadd231pd     %zmm30, %zmm29, %zmm8 # zmm8 = (zmm29 * zmm30) + zmm8
        vfmadd231pd     %zmm30, %zmm26, %zmm7 # zmm7 = (zmm26 * zmm30) + zmm7
        vfmadd231pd     %zmm30, %zmm25, %zmm6 # zmm6 = (zmm25 * zmm30) + zmm6
        vfmadd231pd     %zmm30, %zmm27, %zmm5 # zmm5 = (zmm27 * zmm30) + zmm5
        prefetcht0      (%rax,%r10)
        vmovupd (%rax), %zmm30
        vfmadd231pd     %zmm30, %zmm28, %zmm4 # zmm4 = (zmm28 * zmm30) + zmm4
        vfmadd231pd     %zmm30, %zmm29, %zmm3 # zmm3 = (zmm29 * zmm30) + zmm3
        vfmadd231pd     %zmm30, %zmm26, %zmm2 # zmm2 = (zmm26 * zmm30) + zmm2
        vfmadd231pd     %zmm30, %zmm25, %zmm1 # zmm1 = (zmm25 * zmm30) + zmm1
        vfmadd231pd     %zmm30, %zmm27, %zmm0 # zmm0 = (zmm27 * zmm30) + zmm0
        incq    %r14
        addq    %r8, %rax
        cmpq    %r14, %r11
        jne     L1536

Note all the leaq, addq, and movq instructions.
All the memory addresses here are with respect to only two pointers.

Currently, the code I am generating calculates offsets using integer arithmetic (e.g., that an element is 153 doubles past the base), then applies that offset via a getelementptr with the appropriate element type on the base pointer, and finally bitcasts the resulting pointer to a vector pointer.
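
In IR terms, the pattern in the hot loop is roughly this (a minimal sketch with illustrative names, not my actual generated IR):

  %offset = add nsw i64 %iter, 153                                  ; e.g. 153 doubles past the base
  %elemptr = getelementptr inbounds double, double* %base, i64 %offset
  %vecptr = bitcast double* %elemptr to <8 x double>*
  %vec = load <8 x double>, <8 x double>* %vecptr, align 8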

Is there some sort of pattern I can present to LLVM that would let it more easily eliminate these instructions in the hot loop, or represent them internally in terms of the offsets, so that it generates better code?

I tweaked things. The asm is a little better:

L720:
        leaq    (,%rbx,8), %rcx
        addq    %rbp, %rcx
        prefetcht0      -256(%r14,%r8)
        vbroadcastsd    -8(%rbp,%rbx,8), %zmm30
        vmovups -256(%r14), %zmm29
        vmovups -192(%r14), %zmm28
        vmovups -128(%r14), %zmm27
        vmovups -64(%r14), %zmm26
        vmovupd (%r14), %zmm25
        prefetcht0      -192(%r14,%r8)
        prefetcht0      -128(%r14,%r8)
        prefetcht0      -64(%r14,%r8)
        prefetcht0      (%r14,%r8)
        vfmadd231pd     %zmm29, %zmm30, %zmm0 # zmm0 = (zmm30 * zmm29) + zmm0
        vfmadd231pd     %zmm28, %zmm30, %zmm1 # zmm1 = (zmm30 * zmm28) + zmm1
        vfmadd231pd     %zmm27, %zmm30, %zmm2 # zmm2 = (zmm30 * zmm27) + zmm2
        vfmadd231pd     %zmm26, %zmm30, %zmm3 # zmm3 = (zmm30 * zmm26) + zmm3
        vfmadd231pd     %zmm25, %zmm30, %zmm4 # zmm4 = (zmm30 * zmm25) + zmm4
        leaq    -8(%r11,%rcx), %rdx
        vbroadcastsd    -8(%rcx,%rsi,8), %zmm30
        vfmadd231pd     %zmm29, %zmm30, %zmm5 # zmm5 = (zmm30 * zmm29) + zmm5
        vfmadd231pd     %zmm28, %zmm30, %zmm6 # zmm6 = (zmm30 * zmm28) + zmm6
        vfmadd231pd     %zmm27, %zmm30, %zmm7 # zmm7 = (zmm30 * zmm27) + zmm7
        vfmadd231pd     %zmm26, %zmm30, %zmm8 # zmm8 = (zmm30 * zmm26) + zmm8
        vfmadd231pd     %zmm25, %zmm30, %zmm9 # zmm9 = (zmm30 * zmm25) + zmm9
        vbroadcastsd    (%rdx,%rsi,8), %zmm30
        addq    %r11, %rdx
        vfmadd231pd     %zmm29, %zmm30, %zmm10 # zmm10 = (zmm30 * zmm29) + zmm10
        vfmadd231pd     %zmm28, %zmm30, %zmm11 # zmm11 = (zmm30 * zmm28) + zmm11
        vfmadd231pd     %zmm27, %zmm30, %zmm12 # zmm12 = (zmm30 * zmm27) + zmm12
        vfmadd231pd     %zmm26, %zmm30, %zmm13 # zmm13 = (zmm30 * zmm26) + zmm13
        vfmadd231pd     %zmm25, %zmm30, %zmm14 # zmm14 = (zmm30 * zmm25) + zmm14
        vbroadcastsd    (%rdx,%rsi,8), %zmm30
        addq    %r11, %rdx
        vfmadd231pd     %zmm29, %zmm30, %zmm15 # zmm15 = (zmm30 * zmm29) + zmm15
        vfmadd231pd     %zmm28, %zmm30, %zmm16 # zmm16 = (zmm30 * zmm28) + zmm16
        vfmadd231pd     %zmm27, %zmm30, %zmm17 # zmm17 = (zmm30 * zmm27) + zmm17
        vfmadd231pd     %zmm26, %zmm30, %zmm18 # zmm18 = (zmm30 * zmm26) + zmm18
        vfmadd231pd     %zmm25, %zmm30, %zmm19 # zmm19 = (zmm30 * zmm25) + zmm19
        vbroadcastsd    (%rdx,%rsi,8), %zmm30
        vfmadd231pd     %zmm29, %zmm30, %zmm20 # zmm20 = (zmm30 * zmm29) + zmm20
        vfmadd231pd     %zmm28, %zmm30, %zmm21 # zmm21 = (zmm30 * zmm28) + zmm21
        vfmadd231pd     %zmm27, %zmm30, %zmm22 # zmm22 = (zmm30 * zmm27) + zmm22
        vfmadd231pd     %zmm26, %zmm30, %zmm23 # zmm23 = (zmm30 * zmm26) + zmm23
        vfmadd231pd     %zmm25, %zmm30, %zmm24 # zmm24 = (zmm30 * zmm25) + zmm24
        addq    %r15, %r14
        cmpq    %rdi, %rbx
        leaq    1(%rbx), %rbx
        jb      L720

The associated optimized LLVM IR:

L111:                                             ; preds = %L111, %L111.preheader
  %value_phi2 = phi <8 x double> [ %res.i2399, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi4 = phi i64 [ %res.i2708, %L111 ], [ 0, %L111.preheader ]
  %value_phi5 = phi <8 x double> [ %res.i2400, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi7 = phi <8 x double> [ %res.i2401, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi9 = phi <8 x double> [ %res.i2402, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi11 = phi <8 x double> [ %res.i2403, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi13 = phi <8 x double> [ %res.i2543, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi15 = phi <8 x double> [ %res.i2544, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi17 = phi <8 x double> [ %res.i2545, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi19 = phi <8 x double> [ %res.i2546, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi21 = phi <8 x double> [ %res.i2547, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi23 = phi <8 x double> [ %res.i2563, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi25 = phi <8 x double> [ %res.i2564, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi27 = phi <8 x double> [ %res.i2565, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi29 = phi <8 x double> [ %res.i2566, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi31 = phi <8 x double> [ %res.i2567, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi33 = phi <8 x double> [ %res.i2582, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi35 = phi <8 x double> [ %res.i2583, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi37 = phi <8 x double> [ %res.i2584, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi39 = phi <8 x double> [ %res.i2585, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi41 = phi <8 x double> [ %res.i2586, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi43 = phi <8 x double> [ %res.i2703, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi45 = phi <8 x double> [ %res.i2704, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi47 = phi <8 x double> [ %res.i2705, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi49 = phi <8 x double> [ %res.i2706, %L111 ], [ zeroinitializer, %L111.preheader ]
  %value_phi51 = phi <8 x double> [ %res.i2707, %L111 ], [ zeroinitializer, %L111.preheader ]
  %res.i1789 = add nsw i64 %value_phi4, %res.i1787
  %offsetptr.i1792 = getelementptr inbounds double, double* %ptr.i1791, i64 %res.i1789
  %res.i1803 = load double, double* %offsetptr.i1792, align 8
  %res.i1811 = mul nsw i64 %value_phi4, %.sroa.2.0.copyload
  %res.i1813 = add nsw i64 %res.i1811, %value_phi15290
  %offsetptr.i1816 = getelementptr inbounds double, double* %ptr.i1815, i64 %res.i1813
  %ptr.i1826 = bitcast double* %offsetptr.i1816 to <8 x double>*
  %res.i1827 = load <8 x double>, <8 x double>* %ptr.i1826, align 8
  %offsetptr.i1930 = getelementptr inbounds double, double* %offsetptr.i1816, i64 8
  %ptr.i1931 = bitcast double* %offsetptr.i1930 to <8 x double>*
  %res.i1932 = load <8 x double>, <8 x double>* %ptr.i1931, align 8
  %offsetptr.i1948 = getelementptr inbounds double, double* %offsetptr.i1816, i64 16
  %ptr.i1949 = bitcast double* %offsetptr.i1948 to <8 x double>*
  %res.i1950 = load <8 x double>, <8 x double>* %ptr.i1949, align 8
  %offsetptr.i2164 = getelementptr inbounds double, double* %offsetptr.i1816, i64 24
  %ptr.i2165 = bitcast double* %offsetptr.i2164 to <8 x double>*
  %res.i2166 = load <8 x double>, <8 x double>* %ptr.i2165, align 8
  %offsetptr.i2177 = getelementptr inbounds double, double* %offsetptr.i1816, i64 32
  %ptr.i2178 = bitcast double* %offsetptr.i2177 to <8 x double>*
  %res.i2179 = load <8 x double>, <8 x double>* %ptr.i2178, align 8
  %offsetptr.i2185 = getelementptr inbounds double, double* %offsetptr.i1816, i64 %res.i2183
  %addr.i2194 = bitcast double* %offsetptr.i2185 to i8*
  call void @llvm.prefetch.p0i8(i8* %addr.i2194, i32 0, i32 3, i32 1)
  %offsetptr.i2200 = getelementptr inbounds double, double* %offsetptr.i1816, i64 %res.i2198
  %addr.i2203 = bitcast double* %offsetptr.i2200 to i8*
  call void @llvm.prefetch.p0i8(i8* %addr.i2203, i32 0, i32 3, i32 1)
  %offsetptr.i2215 = getelementptr inbounds double, double* %offsetptr.i1816, i64 %res.i2213
  %addr.i2218 = bitcast double* %offsetptr.i2215 to i8*
  call void @llvm.prefetch.p0i8(i8* %addr.i2218, i32 0, i32 3, i32 1)
  %offsetptr.i2370 = getelementptr inbounds double, double* %offsetptr.i1816, i64 %res.i2368
  %addr.i2373 = bitcast double* %offsetptr.i2370 to i8*
  call void @llvm.prefetch.p0i8(i8* %addr.i2373, i32 0, i32 3, i32 1)
  %offsetptr.i2386 = getelementptr inbounds double, double* %offsetptr.i1816, i64 %res.i2384
  %addr.i2389 = bitcast double* %offsetptr.i2386 to i8*
  call void @llvm.prefetch.p0i8(i8* %addr.i2389, i32 0, i32 3, i32 1)
  %ie.i2396 = insertelement <8 x double> undef, double %res.i1803, i32 0
  %v.i2397 = shufflevector <8 x double> %ie.i2396, <8 x double> undef, <8 x i32> zeroinitializer
  %res.i2399 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi2, <8 x double> %v.i2397, <8 x double> %res.i1827) #8
  %res.i2400 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi5, <8 x double> %v.i2397, <8 x double> %res.i1932) #8
  %res.i2401 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi7, <8 x double> %v.i2397, <8 x double> %res.i1950) #8
  %res.i2402 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi9, <8 x double> %v.i2397, <8 x double> %res.i2166) #8
  %res.i2403 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi11, <8 x double> %v.i2397, <8 x double> %res.i2179) #8
  %ptr.i2406 = getelementptr inbounds double, double* %offsetptr.i1792, i64 %.sroa.5.16.copyload
  %res.i2407 = load double, double* %ptr.i2406, align 8
  %ie.i2409 = insertelement <8 x double> undef, double %res.i2407, i32 0
  %v.i2410 = shufflevector <8 x double> %ie.i2409, <8 x double> undef, <8 x i32> zeroinitializer
  %res.i2543 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi13, <8 x double> %v.i2410, <8 x double> %res.i1827) #8
  %res.i2544 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi15, <8 x double> %v.i2410, <8 x double> %res.i1932) #8
  %res.i2545 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi17, <8 x double> %v.i2410, <8 x double> %res.i1950) #8
  %res.i2546 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi19, <8 x double> %v.i2410, <8 x double> %res.i2166) #8
  %res.i2547 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi21, <8 x double> %v.i2410, <8 x double> %res.i2179) #8
  %ptr.i2551 = getelementptr inbounds double, double* %offsetptr.i1792, i64 %res.i2548
  %res.i2552 = load double, double* %ptr.i2551, align 8
  %ie.i2554 = insertelement <8 x double> undef, double %res.i2552, i32 0
  %v.i2555 = shufflevector <8 x double> %ie.i2554, <8 x double> undef, <8 x i32> zeroinitializer
  %res.i2563 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi23, <8 x double> %v.i2555, <8 x double> %res.i1827) #8
  %res.i2564 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi25, <8 x double> %v.i2555, <8 x double> %res.i1932) #8
  %res.i2565 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi27, <8 x double> %v.i2555, <8 x double> %res.i1950) #8
  %res.i2566 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi29, <8 x double> %v.i2555, <8 x double> %res.i2166) #8
  %res.i2567 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi31, <8 x double> %v.i2555, <8 x double> %res.i2179) #8
  %ptr.i2571 = getelementptr inbounds double, double* %offsetptr.i1792, i64 %res.i2568
  %res.i2572 = load double, double* %ptr.i2571, align 8
  %ie.i2574 = insertelement <8 x double> undef, double %res.i2572, i32 0
  %v.i2575 = shufflevector <8 x double> %ie.i2574, <8 x double> undef, <8 x i32> zeroinitializer
  %res.i2582 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi33, <8 x double> %v.i2575, <8 x double> %res.i1827) #8
  %res.i2583 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi35, <8 x double> %v.i2575, <8 x double> %res.i1932) #8
  %res.i2584 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi37, <8 x double> %v.i2575, <8 x double> %res.i1950) #8
  %res.i2585 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi39, <8 x double> %v.i2575, <8 x double> %res.i2166) #8
  %res.i2586 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi41, <8 x double> %v.i2575, <8 x double> %res.i2179) #8
  %ptr.i2590 = getelementptr inbounds double, double* %offsetptr.i1792, i64 %res.i2587
  %res.i2591 = load double, double* %ptr.i2590, align 8
  %ie.i2593 = insertelement <8 x double> undef, double %res.i2591, i32 0
  %v.i2594 = shufflevector <8 x double> %ie.i2593, <8 x double> undef, <8 x i32> zeroinitializer
  %res.i2703 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi43, <8 x double> %v.i2594, <8 x double> %res.i1827) #8
  %res.i2704 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi45, <8 x double> %v.i2594, <8 x double> %res.i1932) #8
  %res.i2705 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi47, <8 x double> %v.i2594, <8 x double> %res.i1950) #8
  %res.i2706 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi49, <8 x double> %v.i2594, <8 x double> %res.i2166) #8
  %res.i2707 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi51, <8 x double> %v.i2594, <8 x double> %res.i2179) #8
  %res.i2708 = add nuw nsw i64 %value_phi4, 1
  %24 = icmp ult i64 %res.i2708, %15
  %actual.i2711 = call i1 @llvm.expect.i1(i1 %24, i1 true)
  br i1 %actual.i2711, label %L111, label %L414

There are still three unnecessary addq and two unnecessary leaq instructions.

It should look a little more like this:

L2688:
        prefetcht0      2570(%r12)
        vbroadcastsd    (%r10), %zmm25
        vmovupd (%r12), %zmm26
        vmovupd 64(%r12), %zmm27
        vmovupd 128(%r12), %zmm28
        vmovupd 192(%r12), %zmm29
        vfmadd231pd     %zmm26, %zmm25, %zmm0 # zmm0 = (zmm25 * zmm26) + zmm0
        vfmadd231pd     %zmm27, %zmm25, %zmm1 # zmm1 = (zmm25 * zmm27) + zmm1
        vfmadd231pd     %zmm28, %zmm25, %zmm2 # zmm2 = (zmm25 * zmm28) + zmm2
        vfmadd231pd     %zmm29, %zmm25, %zmm3 # zmm3 = (zmm25 * zmm29) + zmm3
        vmovupd 256(%r12), %zmm30
        vbroadcastsd    (%r10,%rsi,8), %zmm31
        vfmadd231pd     %zmm30, %zmm25, %zmm4 # zmm4 = (zmm25 * zmm30) + zmm4
        vfmadd231pd     %zmm26, %zmm31, %zmm5 # zmm5 = (zmm31 * zmm26) + zmm5
        vfmadd231pd     %zmm27, %zmm31, %zmm6 # zmm6 = (zmm31 * zmm27) + zmm6
        vfmadd231pd     %zmm28, %zmm31, %zmm7 # zmm7 = (zmm31 * zmm28) + zmm7
        vfmadd231pd     %zmm29, %zmm31, %zmm8 # zmm8 = (zmm31 * zmm29) + zmm8
        vfmadd231pd     %zmm30, %zmm31, %zmm9 # zmm9 = (zmm31 * zmm30) + zmm9
        vbroadcastsd    (%r10,%rdi), %zmm25
        vfmadd231pd     %zmm26, %zmm25, %zmm10 # zmm10 = (zmm25 * zmm26) + zmm10
        vfmadd231pd     %zmm27, %zmm25, %zmm11 # zmm11 = (zmm25 * zmm27) + zmm11
        vfmadd231pd     %zmm28, %zmm25, %zmm12 # zmm12 = (zmm25 * zmm28) + zmm12
        vfmadd231pd     %zmm29, %zmm25, %zmm13 # zmm13 = (zmm25 * zmm29) + zmm13
        vfmadd231pd     %zmm30, %zmm25, %zmm14 # zmm14 = (zmm25 * zmm30) + zmm14
        vbroadcastsd    (%r10,%rbx), %zmm25
        vfmadd231pd     %zmm26, %zmm25, %zmm15 # zmm15 = (zmm25 * zmm26) + zmm15
        vfmadd231pd     %zmm27, %zmm25, %zmm16 # zmm16 = (zmm25 * zmm27) + zmm16
        vfmadd231pd     %zmm28, %zmm25, %zmm17 # zmm17 = (zmm25 * zmm28) + zmm17
        vfmadd231pd     %zmm29, %zmm25, %zmm18 # zmm18 = (zmm25 * zmm29) + zmm18
        vfmadd231pd     %zmm30, %zmm25, %zmm19 # zmm19 = (zmm25 * zmm30) + zmm19
        vbroadcastsd    (%r10,%rcx), %zmm25
        vfmadd231pd     %zmm26, %zmm25, %zmm20 # zmm20 = (zmm25 * zmm26) + zmm20
        vfmadd231pd     %zmm27, %zmm25, %zmm21 # zmm21 = (zmm25 * zmm27) + zmm21
        vfmadd231pd     %zmm28, %zmm25, %zmm22 # zmm22 = (zmm25 * zmm28) + zmm22
        vfmadd231pd     %zmm29, %zmm25, %zmm23 # zmm23 = (zmm25 * zmm29) + zmm23
        vfmadd231pd     %zmm30, %zmm25, %zmm24 # zmm24 = (zmm25 * zmm30) + zmm24
        addq    $320, %r12              # imm = 0x140
        addq    $8, %r10
        decq    %rdx
        jne     L2688
        jmp     L2952

With associated optimized LLVM IR:

L1007:                                            ; preds = %L1007, %L945
  %value_phi92 = phi i64 [ %200, %L1007 ], [ %value_phi38, %L945 ]
  %value_phi93 = phi i64 [ %202, %L1007 ], [ %value_phi39, %L945 ]
  %value_phi94 = phi <8 x double> [ %res.i1084, %L1007 ], [ %value_phi40, %L945 ]
  %value_phi95 = phi <8 x double> [ %res.i1085, %L1007 ], [ %value_phi42, %L945 ]
  %value_phi96 = phi <8 x double> [ %res.i1086, %L1007 ], [ %value_phi44, %L945 ]
  %value_phi97 = phi <8 x double> [ %res.i1087, %L1007 ], [ %value_phi46, %L945 ]
  %value_phi98 = phi <8 x double> [ %res.i1088, %L1007 ], [ %value_phi48, %L945 ]
  %value_phi99 = phi <8 x double> [ %res.i1107, %L1007 ], [ %value_phi50, %L945 ]
  %value_phi100 = phi <8 x double> [ %res.i1108, %L1007 ], [ %value_phi52, %L945 ]
  %value_phi101 = phi <8 x double> [ %res.i1109, %L1007 ], [ %value_phi54, %L945 ]
  %value_phi102 = phi <8 x double> [ %res.i1110, %L1007 ], [ %value_phi56, %L945 ]
  %value_phi103 = phi <8 x double> [ %res.i1111, %L1007 ], [ %value_phi58, %L945 ]
  %value_phi104 = phi <8 x double> [ %res.i1118, %L1007 ], [ %value_phi60, %L945 ]
  %value_phi105 = phi <8 x double> [ %res.i1119, %L1007 ], [ %value_phi62, %L945 ]
  %value_phi106 = phi <8 x double> [ %res.i1120, %L1007 ], [ %value_phi64, %L945 ]
  %value_phi107 = phi <8 x double> [ %res.i1121, %L1007 ], [ %value_phi66, %L945 ]
  %value_phi108 = phi <8 x double> [ %res.i1122, %L1007 ], [ %value_phi68, %L945 ]
  %value_phi109 = phi <8 x double> [ %res.i1131, %L1007 ], [ %value_phi70, %L945 ]
  %value_phi110 = phi <8 x double> [ %res.i1132, %L1007 ], [ %value_phi72, %L945 ]
  %value_phi111 = phi <8 x double> [ %res.i1133, %L1007 ], [ %value_phi74, %L945 ]
  %value_phi112 = phi <8 x double> [ %res.i1134, %L1007 ], [ %value_phi76, %L945 ]
  %value_phi113 = phi <8 x double> [ %res.i1135, %L1007 ], [ %value_phi78, %L945 ]
  %value_phi114 = phi <8 x double> [ %res.i1142, %L1007 ], [ %value_phi80, %L945 ]
  %value_phi115 = phi <8 x double> [ %res.i1143, %L1007 ], [ %value_phi82, %L945 ]
  %value_phi116 = phi <8 x double> [ %res.i1144, %L1007 ], [ %value_phi84, %L945 ]
  %value_phi117 = phi <8 x double> [ %res.i1145, %L1007 ], [ %value_phi86, %L945 ]
  %value_phi118 = phi <8 x double> [ %res.i1146, %L1007 ], [ %value_phi88, %L945 ]
  %value_phi119 = phi i64 [ %204, %L1007 ], [ 1, %L945 ]
  %188 = inttoptr i64 %value_phi92 to i8*
  %189 = getelementptr i8, i8* %188, i64 2570
  call void @llvm.prefetch.p0i8(i8* %189, i32 0, i32 3, i32 1)
  %ptr.i1061 = inttoptr i64 %value_phi92 to <8 x double>*
  %unmaskedload1911 = load <8 x double>, <8 x double>* %ptr.i1061, align 8
  %190 = getelementptr i8, i8* %188, i64 64
  %ptr.i1064 = bitcast i8* %190 to <8 x double>*
  %unmaskedload1912 = load <8 x double>, <8 x double>* %ptr.i1064, align 8
  %191 = getelementptr i8, i8* %188, i64 128
  %ptr.i1067 = bitcast i8* %191 to <8 x double>*
  %unmaskedload1913 = load <8 x double>, <8 x double>* %ptr.i1067, align 8
  %192 = getelementptr i8, i8* %188, i64 192
  %ptr.i1072 = bitcast i8* %192 to <8 x double>*
  %unmaskedload1914 = load <8 x double>, <8 x double>* %ptr.i1072, align 8
  %193 = getelementptr i8, i8* %188, i64 256
  %ptr.i1075 = bitcast i8* %193 to <8 x double>*
  %unmaskedload1915 = load <8 x double>, <8 x double>* %ptr.i1075, align 8
  %ptr.i1078 = inttoptr i64 %value_phi93 to double*
  %res.i1079 = load double, double* %ptr.i1078, align 8
  %ie.i1081 = insertelement <8 x double> undef, double %res.i1079, i32 0
  %v.i1082 = shufflevector <8 x double> %ie.i1081, <8 x double> undef, <8 x i32> zeroinitializer
  %res.i1084 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi94, <8 x double> %v.i1082, <8 x double> %unmaskedload1911) #8
  %res.i1085 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi95, <8 x double> %v.i1082, <8 x double> %unmaskedload1912) #8
  %res.i1086 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi96, <8 x double> %v.i1082, <8 x double> %unmaskedload1913) #8
  %res.i1087 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi97, <8 x double> %v.i1082, <8 x double> %unmaskedload1914) #8
  %res.i1088 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi98, <8 x double> %v.i1082, <8 x double> %unmaskedload1915) #8
  %194 = inttoptr i64 %value_phi93 to i8*
  %195 = getelementptr i8, i8* %194, i64 %73
  %ptr.i1089 = bitcast i8* %195 to double*
  %res.i1090 = load double, double* %ptr.i1089, align 8
  %ie.i1104 = insertelement <8 x double> undef, double %res.i1090, i32 0
  %v.i1105 = shufflevector <8 x double> %ie.i1104, <8 x double> undef, <8 x i32> zeroinitializer
  %res.i1107 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi99, <8 x double> %v.i1105, <8 x double> %unmaskedload1911) #8
  %res.i1108 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi100, <8 x double> %v.i1105, <8 x double> %unmaskedload1912) #8
  %res.i1109 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi101, <8 x double> %v.i1105, <8 x double> %unmaskedload1913) #8
  %res.i1110 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi102, <8 x double> %v.i1105, <8 x double> %unmaskedload1914) #8
  %res.i1111 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi103, <8 x double> %v.i1105, <8 x double> %unmaskedload1915) #8
  %196 = getelementptr i8, i8* %194, i64 %75
  %ptr.i1112 = bitcast i8* %196 to double*
  %res.i1113 = load double, double* %ptr.i1112, align 8
  %ie.i1115 = insertelement <8 x double> undef, double %res.i1113, i32 0
  %v.i1116 = shufflevector <8 x double> %ie.i1115, <8 x double> undef, <8 x i32> zeroinitializer
  %res.i1118 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi104, <8 x double> %v.i1116, <8 x double> %unmaskedload1911) #8
  %res.i1119 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi105, <8 x double> %v.i1116, <8 x double> %unmaskedload1912) #8
  %res.i1120 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi106, <8 x double> %v.i1116, <8 x double> %unmaskedload1913) #8
  %res.i1121 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi107, <8 x double> %v.i1116, <8 x double> %unmaskedload1914) #8
  %res.i1122 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi108, <8 x double> %v.i1116, <8 x double> %unmaskedload1915) #8
  %197 = getelementptr i8, i8* %194, i64 %76
  %ptr.i1123 = bitcast i8* %197 to double*
  %res.i1124 = load double, double* %ptr.i1123, align 8
  %ie.i1126 = insertelement <8 x double> undef, double %res.i1124, i32 0
  %v.i1127 = shufflevector <8 x double> %ie.i1126, <8 x double> undef, <8 x i32> zeroinitializer
  %res.i1131 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi109, <8 x double> %v.i1127, <8 x double> %unmaskedload1911) #8
  %res.i1132 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi110, <8 x double> %v.i1127, <8 x double> %unmaskedload1912) #8
  %res.i1133 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi111, <8 x double> %v.i1127, <8 x double> %unmaskedload1913) #8
  %res.i1134 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi112, <8 x double> %v.i1127, <8 x double> %unmaskedload1914) #8
  %res.i1135 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi113, <8 x double> %v.i1127, <8 x double> %unmaskedload1915) #8
  %198 = getelementptr i8, i8* %194, i64 %77
  %ptr.i1136 = bitcast i8* %198 to double*
  %res.i1137 = load double, double* %ptr.i1136, align 8
  %ie.i1139 = insertelement <8 x double> undef, double %res.i1137, i32 0
  %v.i1140 = shufflevector <8 x double> %ie.i1139, <8 x double> undef, <8 x i32> zeroinitializer
  %res.i1142 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi114, <8 x double> %v.i1140, <8 x double> %unmaskedload1911) #8
  %res.i1143 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi115, <8 x double> %v.i1140, <8 x double> %unmaskedload1912) #8
  %res.i1144 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi116, <8 x double> %v.i1140, <8 x double> %unmaskedload1913) #8
  %res.i1145 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi117, <8 x double> %v.i1140, <8 x double> %unmaskedload1914) #8
  %res.i1146 = call <8 x double> asm "vfmadd231pd $3, $2, $1", "=v,0,v,v"(<8 x double> %value_phi118, <8 x double> %v.i1140, <8 x double> %unmaskedload1915) #8
  %199 = getelementptr i8, i8* %188, i64 320
  %200 = ptrtoint i8* %199 to i64
  %201 = getelementptr i8, i8* %194, i64 8
  %202 = ptrtoint i8* %201 to i64
  %203 = icmp eq i64 %value_phi119, %65
  %204 = add nuw i64 %value_phi119, 1
  br i1 %203, label %L1194, label %L1007

The main difference I see is that the good version increments pointers between iterations:

  %value_phi92 = phi i64 [ %200, %L1007 ], [ %value_phi38, %L945 ]
  %value_phi93 = phi i64 [ %202, %L1007 ], [ %value_phi39, %L945 ]
; ...
  %199 = getelementptr i8, i8* %188, i64 320
  %200 = ptrtoint i8* %199 to i64
  %201 = getelementptr i8, i8* %194, i64 8
  %202 = ptrtoint i8* %201 to i64
  %203 = icmp eq i64 %value_phi119, %65
  %204 = add nuw i64 %value_phi119, 1
  br i1 %203, label %L1194, label %L1007

The bad version, on the other hand, increments a loop counter and uses it to calculate the pointers:

  %value_phi4 = phi i64 [ %res.i2708, %L111 ], [ 0, %L111.preheader ]
; ...
  %res.i1789 = add nsw i64 %value_phi4, %res.i1787
  %offsetptr.i1792 = getelementptr inbounds double, double* %ptr.i1791, i64 %res.i1789
  %res.i1803 = load double, double* %offsetptr.i1792, align 8
  %res.i1811 = mul nsw i64 %value_phi4, %.sroa.2.0.copyload
  %res.i1813 = add nsw i64 %res.i1811, %value_phi15290
  %offsetptr.i1816 = getelementptr inbounds double, double* %ptr.i1815, i64 %res.i1813
; ...
  %res.i2708 = add nuw nsw i64 %value_phi4, 1

Is there anything I can do to help LLVM trigger this optimization?

The “bad” LLVM IR is what my Julia library LoopVectorization.jl produces for matrix multiplication, while the “good” IR comes from kernels in a dedicated library.

I would like to know how to fix my code-gen. Any guidance on that front would be greatly appreciated.
I’m also happy to provide more information, e.g. all of the LLVM IR (optimized or unoptimized), or any other information that may help.

I’ve been throwing mud at a wall recently.
For example, I’ve spent the better part of today converting all my libraries to use 0-based indexing instead of 1-based indexing, like a cargo-cultist, on the grounds that:

  1. The code generating the good IR was 0-based.
  2. Most languages targeting LLVM are 0-based, so perhaps the optimizer has trouble with 1-based indexing.

Result: an extra addq/leaq, and the resulting code performs about 3% worse in benchmarks.

But I suspect this is just me overfitting noise and flailing cluelessly, due to a lack of any understanding or intuition about why LLVM does what it does.


For a bizarre example of a performance regression caused by slightly changing the code presented to LLVM: I hit a 25% regression. The hot-loop asm before the change was:

L1440:
        leaq    (%rbx,%rsi,8), %rdi
        vmovupd (%rdx,%rdi), %zmm6
        addq    %rdx, %rdi
        vmovupd (%rdx,%rdi), %zmm7
        addq    %rdx, %rdi
        vbroadcastsd    (%r12,%rsi,8), %zmm8
        vbroadcastsd    (%rax,%rsi,8), %zmm9
        vfmadd231pd     (%rbx,%rsi,8), %zmm8, %zmm4 # zmm4 = (zmm8 * mem) + zmm4
        vfmadd231pd     %zmm6, %zmm9, %zmm4 # zmm4 = (zmm9 * zmm6) + zmm4
        vbroadcastsd    (%r8,%rsi,8), %zmm10
        vfmadd231pd     %zmm7, %zmm10, %zmm4 # zmm4 = (zmm10 * zmm7) + zmm4
        vmovupd (%rdx,%rdi), %zmm11
        addq    %rdx, %rdi
        vfmadd231pd     %zmm6, %zmm8, %zmm5 # zmm5 = (zmm8 * zmm6) + zmm5
        vfmadd231pd     %zmm7, %zmm9, %zmm5 # zmm5 = (zmm9 * zmm7) + zmm5
        vfmadd231pd     %zmm11, %zmm10, %zmm5 # zmm5 = (zmm10 * zmm11) + zmm5
        vmovupd (%rdx,%rdi), %zmm6
        addq    %rdx, %rdi
        vfmadd231pd     %zmm7, %zmm8, %zmm3 # zmm3 = (zmm8 * zmm7) + zmm3
        vfmadd231pd     %zmm11, %zmm9, %zmm3 # zmm3 = (zmm9 * zmm11) + zmm3
        vfmadd231pd     %zmm6, %zmm10, %zmm3 # zmm3 = (zmm10 * zmm6) + zmm3
        vmovupd (%rdx,%rdi), %zmm7
        addq    %rdx, %rdi
        vfmadd231pd     %zmm11, %zmm8, %zmm2 # zmm2 = (zmm8 * zmm11) + zmm2
        vfmadd231pd     %zmm6, %zmm9, %zmm2 # zmm2 = (zmm9 * zmm6) + zmm2
        vfmadd231pd     %zmm7, %zmm10, %zmm2 # zmm2 = (zmm10 * zmm7) + zmm2
        vmovupd (%rdx,%rdi), %zmm11
        addq    %rdx, %rdi
        vfmadd231pd     %zmm6, %zmm8, %zmm1 # zmm1 = (zmm8 * zmm6) + zmm1
        vfmadd231pd     %zmm7, %zmm9, %zmm1 # zmm1 = (zmm9 * zmm7) + zmm1
        vfmadd231pd     %zmm11, %zmm10, %zmm1 # zmm1 = (zmm10 * zmm11) + zmm1
        vfmadd231pd     %zmm7, %zmm8, %zmm0 # zmm0 = (zmm8 * zmm7) + zmm0
        vfmadd231pd     %zmm11, %zmm9, %zmm0 # zmm0 = (zmm9 * zmm11) + zmm0
        vfmadd231pd     (%rdx,%rdi), %zmm10, %zmm0 # zmm0 = (zmm10 * mem) + zmm0
        incq    %rsi
        cmpq    %rsi, %rcx
        jne     L1440

and the regressed hot loop:

L1824:
        leaq    (%r12,%rdi), %rbx
        vmovupd (%r15,%rdi,8), %zmm9
        vmovupd (%rax,%rdi,8), %zmm10
        vbroadcastsd    (%rcx,%rdi,8), %zmm11
        vfmadd231pd     (%rbp,%rdi,8), %zmm11, %zmm5 # zmm5 = (zmm11 * mem) + zmm5
        vbroadcastsd    (%rsi,%rdi,8), %zmm12
        vfmadd231pd     %zmm9, %zmm12, %zmm5 # zmm5 = (zmm12 * zmm9) + zmm5
        vbroadcastsd    (%r14,%rdi,8), %zmm13
        vfmadd231pd     %zmm10, %zmm13, %zmm5 # zmm5 = (zmm13 * zmm10) + zmm5
        vmovupd (%rdx,%rdi,8), %zmm14
        vfmadd231pd     %zmm9, %zmm11, %zmm7 # zmm7 = (zmm11 * zmm9) + zmm7
        vfmadd231pd     %zmm10, %zmm12, %zmm7 # zmm7 = (zmm12 * zmm10) + zmm7
        vfmadd231pd     %zmm14, %zmm13, %zmm7 # zmm7 = (zmm13 * zmm14) + zmm7
        vpbroadcastq    %rbx, %ymm9
        vpaddq  %ymm8, %ymm9, %ymm9
        vmovq   %xmm9, %rbx
        vmovupd (%r13,%rbx,8), %zmm15
        vfmadd231pd     %zmm10, %zmm11, %zmm6 # zmm6 = (zmm11 * zmm10) + zmm6
        vfmadd231pd     %zmm14, %zmm12, %zmm6 # zmm6 = (zmm12 * zmm14) + zmm6
        vpextrq $1, %xmm9, %rbx
        vfmadd231pd     %zmm15, %zmm13, %zmm6 # zmm6 = (zmm13 * zmm15) + zmm6
        vmovupd (%r13,%rbx,8), %zmm10
        vfmadd231pd     %zmm14, %zmm11, %zmm4 # zmm4 = (zmm11 * zmm14) + zmm4
        vfmadd231pd     %zmm15, %zmm12, %zmm4 # zmm4 = (zmm12 * zmm15) + zmm4
        vfmadd231pd     %zmm10, %zmm13, %zmm4 # zmm4 = (zmm13 * zmm10) + zmm4
        vextracti128    $1, %ymm9, %xmm0
        vmovq   %xmm0, %rbx
        vmovupd (%r13,%rbx,8), %zmm9
        vfmadd231pd     %zmm15, %zmm11, %zmm3 # zmm3 = (zmm11 * zmm15) + zmm3
        vfmadd231pd     %zmm10, %zmm12, %zmm3 # zmm3 = (zmm12 * zmm10) + zmm3
        vpextrq $1, %xmm0, %rbx
        vfmadd231pd     %zmm9, %zmm13, %zmm3 # zmm3 = (zmm13 * zmm9) + zmm3
        vfmadd231pd     %zmm10, %zmm11, %zmm2 # zmm2 = (zmm11 * zmm10) + zmm2
        vfmadd231pd     %zmm9, %zmm12, %zmm2 # zmm2 = (zmm12 * zmm9) + zmm2
        vfmadd231pd     (%r13,%rbx,8), %zmm13, %zmm2 # zmm2 = (zmm13 * mem) + zmm2
        incq    %rdi
        cmpq    %rdi, %r10
        jne     L1824

In the regressed version, LLVM decided to try to calculate the pointer offsets using SIMD instructions instead of repeated addqs:

  vpaddq  %ymm8, %ymm9, %ymm9
  vmovq   %xmm9, %rbx
  vmovupd (%r13,%rbx,8), %zmm15
  vpextrq $1, %xmm9, %rbx
  vmovupd (%r13,%rbx,8), %zmm10

I made a lot of tweaks, and here is the assembly I am now getting with LLVM 10 (versus that shown earlier in this post):

L1024:
        prefetcht0      byte ptr [r11 + r14]
        vbroadcastsd    zmm30, qword ptr [r8]
        vmovups zmm29, zmmword ptr [r11]
        vmovups zmm28, zmmword ptr [r11 + 64]
        vmovupd zmm27, zmmword ptr [r11 + 128]
        prefetcht0      byte ptr [r11 + r14 + 64]
        prefetcht0      byte ptr [r11 + r14 + 128]
        vfmadd231pd     zmm26, zmm30, zmm29 # zmm26 = (zmm30 * zmm29) + zmm26
        vfmadd231pd     zmm23, zmm30, zmm28 # zmm23 = (zmm30 * zmm28) + zmm23
        vbroadcastsd    zmm31, qword ptr [r8 + rdi]
        vfmadd231pd     zmm17, zmm30, zmm27 # zmm17 = (zmm30 * zmm27) + zmm17
        vfmadd231pd     zmm25, zmm31, zmm29 # zmm25 = (zmm31 * zmm29) + zmm25
        vfmadd231pd     zmm21, zmm31, zmm28 # zmm21 = (zmm31 * zmm28) + zmm21
        vfmadd231pd     zmm14, zmm31, zmm27 # zmm14 = (zmm31 * zmm27) + zmm14
        vbroadcastsd    zmm30, qword ptr [r8 + 2*rdi]
        vfmadd231pd     zmm24, zmm30, zmm29 # zmm24 = (zmm30 * zmm29) + zmm24
        vfmadd231pd     zmm19, zmm30, zmm28 # zmm19 = (zmm30 * zmm28) + zmm19
        vfmadd231pd     zmm11, zmm30, zmm27 # zmm11 = (zmm30 * zmm27) + zmm11
        vbroadcastsd    zmm30, qword ptr [r8 + r9]
        vfmadd231pd     zmm22, zmm30, zmm29 # zmm22 = (zmm30 * zmm29) + zmm22
        vfmadd231pd     zmm16, zmm30, zmm28 # zmm16 = (zmm30 * zmm28) + zmm16
        vfmadd231pd     zmm8, zmm30, zmm27 # zmm8 = (zmm30 * zmm27) + zmm8
        vbroadcastsd    zmm30, qword ptr [r8 + 4*rdi]
        vfmadd231pd     zmm20, zmm30, zmm29 # zmm20 = (zmm30 * zmm29) + zmm20
        vfmadd231pd     zmm13, zmm30, zmm28 # zmm13 = (zmm30 * zmm28) + zmm13
        vfmadd231pd     zmm6, zmm30, zmm27 # zmm6 = (zmm30 * zmm27) + zmm6
        vbroadcastsd    zmm30, qword ptr [r8 + r15]
        vfmadd231pd     zmm18, zmm30, zmm29 # zmm18 = (zmm30 * zmm29) + zmm18
        vfmadd231pd     zmm10, zmm30, zmm28 # zmm10 = (zmm30 * zmm28) + zmm10
        vfmadd231pd     zmm4, zmm30, zmm27 # zmm4 = (zmm30 * zmm27) + zmm4
        vbroadcastsd    zmm30, qword ptr [r8 + r12]
        vfmadd231pd     zmm15, zmm30, zmm29 # zmm15 = (zmm30 * zmm29) + zmm15
        vfmadd231pd     zmm7, zmm30, zmm28 # zmm7 = (zmm30 * zmm28) + zmm7
        vbroadcastsd    zmm31, qword ptr [r8 + rbp]
        vfmadd231pd     zmm2, zmm30, zmm27 # zmm2 = (zmm30 * zmm27) + zmm2
        vfmadd231pd     zmm12, zmm31, zmm29 # zmm12 = (zmm31 * zmm29) + zmm12
        vfmadd231pd     zmm5, zmm31, zmm28 # zmm5 = (zmm31 * zmm28) + zmm5
        vfmadd231pd     zmm1, zmm31, zmm27 # zmm1 = (zmm31 * zmm27) + zmm1
        vbroadcastsd    zmm30, qword ptr [r8 + 8*rdi]
        vfmadd231pd     zmm9, zmm30, zmm29 # zmm9 = (zmm30 * zmm29) + zmm9
        vfmadd231pd     zmm3, zmm30, zmm28 # zmm3 = (zmm30 * zmm28) + zmm3
        vfmadd231pd     zmm0, zmm30, zmm27 # zmm0 = (zmm30 * zmm27) + zmm0
        add     r11, r10
        add     r8, 8
        cmp     r11, rdx
        jbe     L1024

What I found was that, rather than relying on LLVM to optimize integer calculations, I had to produce code as close as possible to the assembly I wanted.

This means:

  • Instead of incrementing some loop counter/offset, increment the pointers themselves in the loops, and compare against an end pointer to decide when to break out.
  • The exception is the hottest part of the loop, where I write all accesses as offset calculations from the base pointer, corresponding to how they should be addressed. To do this:
    • Premultiply all strides by the size of the element type. This encourages accesses like r8 + r12.
    • We don’t have infinite integer registers, so I inttoptr/ptrtoint/bitcast to pointers with 1-, 2-, 4-, or 8-byte element types as necessary when I want to take 1, 2, 4, or 8 strides (see the sketch after this list). This yields the sequence r8, r8 + rdi, r8 + 2*rdi, r8 + 4*rdi, and r8 + 8*rdi.
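
A minimal sketch of that last trick (illustrative names; it assumes %base is the address as an i64 and %stride holds a stride already premultiplied into bytes):

  %p8  = inttoptr i64 %base to i8*
  %p16 = inttoptr i64 %base to i16*
  %p32 = inttoptr i64 %base to i32*
  %p64 = inttoptr i64 %base to i64*
  %a1 = getelementptr inbounds i8,  i8*  %p8,  i64 %stride   ; base + 1*stride -> (r8 + rdi)
  %a2 = getelementptr inbounds i16, i16* %p16, i64 %stride   ; base + 2*stride -> (r8 + 2*rdi)
  %a4 = getelementptr inbounds i32, i32* %p32, i64 %stride   ; base + 4*stride -> (r8 + 4*rdi)
  %a8 = getelementptr inbounds i64, i64* %p64, i64 %stride   ; base + 8*stride -> (r8 + 8*rdi)
  %pd2 = bitcast i16* %a2 to double*
  %x2  = load double, double* %pd2, align 8                  ; e.g. the broadcast source at two strides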

As a future improvement, I could save another integer register by applying the last trick to more than just the base stride, e.g., keep r8 + r9 (as it does now) and then use r8 + 2*r9 instead of r8 + r12.
I did not explicitly define r9 = 3*rdi, r15 = 5*rdi, r12 = 6*rdi, etc.; LLVM assigned those values to registers on its own.
It would be really neat if LLVM could figure out on its own that, instead of assigning 6*rdi to r12, it should address using 2*r9.
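
For example (a hypothetical sketch; %stride3 is assumed to already hold 3*stride in bytes, i.e. the value that ends up in r9), emitting a getelementptr on a 2-byte element type yields base + 6*stride, which the backend can encode as r8 + 2*r9 without spending another register on 6*stride:

  %p16 = inttoptr i64 %base to i16*
  %a6  = getelementptr inbounds i16, i16* %p16, i64 %stride3   ; base + 2*(3*stride) = base + 6*stride
  %pd6 = bitcast i16* %a6 to double*
  %x6  = load double, double* %pd6, align 8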