Unoptimized address-generation code for RISC-V vector intrinsics

Hi,

I observed some unoptimized RISC-V code related to address generation when I used RISC-V vector intrinsics together with the std::thread library. The following example demonstrates the issue:

#include <cstdlib>                                                              
#include <thread>                                                               
#include <vector>                                                               
#include <riscv_vector.h>                                                       
                                                                                
// Reproducer: element-wise vector add (dest = src0 + src1) written with RISC-V
// vector intrinsics, with the index range split across std::thread workers.
//
// Usage: vvadd-thread <size>
//   argv[1] is parsed with atoi and used as the number of int elements.
// Returns 0 on completion, or EXIT_FAILURE when no size argument is given.
int main(int argc, char* argv[]) {
  // argv[1] is dereferenced below, so bail out when it was not supplied.
  if (argc < 2)
    return EXIT_FAILURE;

  size_t size = atoi(argv[1]);

  // initialize data set
  int* src0 = new int[size];
  int* src1 = new int[size];
  int* dest = new int[size];
  // ref receives the expected sums; this reproducer fills it but never
  // compares it against dest.
  int* ref  = new int[size];

  for (size_t i = 0; i < size; ++i) {
    src0[i] = i;
    src1[i] = size - i;
    ref[i]  = src0[i] + src1[i];
  }

  std::vector<std::thread> threads;

  // Worker: adds src0[begin, end) and src1[begin, end) into dest[begin, end).
  // [=] captures the src0/src1/dest pointers by value, so the loop body reads
  // them through the closure object. The lambda is intentionally kept exactly
  // as reported, since its generated code is the subject of this example.
  auto vvadd = [=](size_t begin, size_t end)
                      {
                        size_t vlen = 16;

                        vint32m1_t vs0;
                        vint32m1_t vs1;
                        vint32m1_t vd;

                        // Each iteration requests the remaining end - i
                        // elements; i then advances by the value that
                        // vsetvl_e32m1 returned.
                        for (size_t i = begin; i < end; i += vlen) {
                          vlen = vsetvl_e32m1(end - i);
                          vs0 = vle32_v_i32m1(src0 + i, vlen);
                          vs1 = vle32_v_i32m1(src1 + i, vlen);
                          vd = vadd_vv_i32m1(vs0, vs1, vlen);
                          vse32_v_i32m1(dest + i, vd, vlen);
                        }
                      };

  size_t nthreads = 4;

  // Thread i handles the index range [i*size/nthreads, (i+1)*size/nthreads).
  for (size_t i = 0; i < nthreads; ++i) {
    threads.emplace_back(std::thread(vvadd,
                                     i * size / nthreads,
                                     (i + 1) * size / nthreads));
  }

  // Wait for every worker before releasing the buffers they use.
  for (auto& t : threads)
    t.join();

  delete[] src0;
  delete[] src1;
  delete[] dest;
  delete[] ref;  // was leaked: allocated above but never freed

  return 0;
}

This is how I compiled the code with clang-13:

clang++ -Wall -O3 -std=c++17 -menable-experimental-extensions --target=riscv64-unknown-linux -march=rv64gcv0p10 vvadd-thread.cc -lpthread -o vvadd-thread

These are the corresponding instructions generated by LLVM:

0000000000010ed2 <_ZNSt6thread11_State_implINS_8_InvokerISt5tupleIJZ4mainE3$_0mmEEEEE6_M_runEv>:
   10ed2: 690c                  ld  a1,16(a0)                                   
   10ed4: 6510                  ld  a2,8(a0)                                    
   10ed6: 02c5f963            bgeu  a1,a2,10f08 <_ZNSt6thread11_State_implINS_8_InvokerISt5tupleIJZ4mainE3$_0mmEEEEE6_M_runEv+0x36>
   10eda: 40b606b3            sub a3,a2,a1                                      
   10ede: 0506f6d7            vsetvli a3,a3,e32,m1,ta,mu                        
   10ee2: 6d18                  ld  a4,24(a0)                                      
   10ee4: 00259793            slli  a5,a1,0x2                                   
   10ee8: 973e                  add a4,a4,a5                                    
   10eea: 02076c87            vle32.v v25,(a4)                                     
   10eee: 7118                  ld  a4,32(a0)                                   
   10ef0: 973e                  add a4,a4,a5                                    
   10ef2: 02076d07            vle32.v v26,(a4)                                  
   10ef6: 7518                  ld  a4,40(a0)                                   
   10ef8: 039d0cd7            vadd.vv v25,v25,v26                               
   10efc: 973e                  add a4,a4,a5                                    
   10efe: 95b6                  add a1,a1,a3                                    
   10f00: 02076ca7            vse32.v v25,(a4)                                  
   10f04: fcc5ebe3            bltu  a1,a2,10eda <_ZNSt6thread11_State_implINS_8_InvokerISt5tupleIJZ4mainE3$_0mmEEEEE6_M_runEv+0x8>
   10f08: 8082                  ret

Let’s consider instructions inside the for loop in the lambda:

   10eda: 40b606b3            sub a3,a2,a1                                      
   10ede: 0506f6d7            vsetvli a3,a3,e32,m1,ta,mu                        
   10ee2: 6d18                  ld  a4,24(a0)         <-- generating address for src0 + i                                
   10ee4: 00259793            slli  a5,a1,0x2                                   
   10ee8: 973e                  add a4,a4,a5                                    
   10eea: 02076c87            vle32.v v25,(a4)                                     
   10eee: 7118                  ld  a4,32(a0)         <-- generating address for src1 + i                           
   10ef0: 973e                  add a4,a4,a5                                    
   10ef2: 02076d07            vle32.v v26,(a4)                                  
   10ef6: 7518                  ld  a4,40(a0)         <-- generating address for dest + i                         
   10ef8: 039d0cd7            vadd.vv v25,v25,v26                               
   10efc: 973e                  add a4,a4,a5                                    
   10efe: 95b6                  add a1,a1,a3                                    
   10f00: 02076ca7            vse32.v v25,(a4)                                  
   10f04: fcc5ebe3            bltu  a1,a2,10eda

The annotated instructions are redundant inside the loop, since the base addresses never change across iterations. For some reason, the compiler reloads the base addresses of the three arrays on every iteration.

I’m able to make the compiler generate the optimal instruction sequence by creating local variables for those pointers inside the lambda:

  // Workaround variant of the vvadd worker: same computation as the original
  // lambda, but the captured pointers are first copied into local variables.
  auto vvadd = [=](size_t begin, size_t end)
                      {
                        size_t vlen = 16;

                        vint32m1_t vs0;
                        vint32m1_t vs1;
                        vint32m1_t vd;

                        // Local copies of the by-value captured pointers; the
                        // loop below indexes these instead of the captures.
                        int* _src0 = src0;
                        int* _src1 = src1;
                        int* _dest = dest;

                        // Each iteration requests the remaining end - i
                        // elements; i then advances by the value that
                        // vsetvl_e32m1 returned.
                        for (size_t i = begin; i < end; i += vlen) {
                          vlen = vsetvl_e32m1(end - i);
                          vs0 = vle32_v_i32m1(_src0 + i, vlen);
                          vs1 = vle32_v_i32m1(_src1 + i, vlen);
                          vd = vadd_vv_i32m1(vs0, vs1, vlen);
                          vse32_v_i32m1(_dest + i, vd, vlen);
                        }
                      };

This is the optimal code sequence:

   10ee4: 40b607b3            sub a5,a2,a1                                      
   10ee8: 0507f7d7            vsetvli a5,a5,e32,m1,ta,mu                        
   10eec: 00259693            slli  a3,a1,0x2                                   
   10ef0: 00d80733            add a4,a6,a3                                      
   10ef4: 02076c87            vle32.v v25,(a4)                                  
   10ef8: 00d88733            add a4,a7,a3                                      
   10efc: 02076d07            vle32.v v26,(a4)                                  
   10f00: 039d0cd7            vadd.vv v25,v25,v26                               
   10f04: 96aa                  add a3,a3,a0                                    
   10f06: 95be                  add a1,a1,a5                                    
   10f08: 0206eca7            vse32.v v25,(a3)                                  
   10f0c: fcc5ece3            bltu  a1,a2,10ee4 <_ZNSt6thread11_State_implINS_8_InvokerISt5tupleIJZ4mainE3$_0mmEEEEE6_M_runEv+0x12>

It is worth noting that when the original lambda (without the local pointer copies) is called directly rather than through std::thread, the compiler does not generate those redundant instructions.

Is this a known issue? Any insight would be appreciated.

Thanks,

Tuan