Computing the sparse matrix vector product using block-based kernels without zero padding on processors with AVX-512 instructions

View article
PeerJ Computer Science

Main article text

 

Introduction

  • we study block based SpMV without padding with AVX-512,

  • we describe an efficient implementation targeting the next generation of Intel’s HPC architecture,

  • we provide a record-based strategy to choose the most appropriate kernel (using polynomial interpolation in sequential, and linear regression for shared-memory parallel approaches),

  • the paper introduces the SPC5 package that includes the source code related to the present study (https://gitlab.mpcdf.mpg.de/bbramas/spc5).

Background

Vectorization

Sparse Matrix Vector Product (SpMV)

Matrix permutation/reordering

Design of Block-based SpMV without Padding

Block-based storage without zero padding

SpMV kernels for β(rc) storages

 
_______________________________________________________________________________________________________ 
ALGORITHM 1: SpMV for a matrix mat in format β(r,c). The lines in blue 
∙ are to compute in scalar and have to be replaced by the line in green ∙ to have 
the vectorized equivalent. 
________________________________________________________________________________________________________ 
     Input: x : vector to multiply with the matrix. mat : a matrix in the block format β(r,c). 
                r, c : the size of the blocks. 
     Output: y : the result of the product. 
  1  function spmv(x, mat, r, c, y) 
  2    // Index to access the array’s values 
  3    idxVal ← 0 
  4    for idxRow ← 0 to mat.numberOfRows-1 inc by r do 
  5         sum[r] ← init_scalar_array(r, 0) 
  6         sum[r] ← init_simd_array(r, 0) 
  7         for idxBlock ← mat.block_rowptr[idxRow/r] to mat.block_rowptr[idxRow/r+1]-1 
do 
  8              idxCol ← mat.block_colidx[idxBlock] 
  9              for idxRowBlock ← 0 to r do 
 10                    valMask ← mat.block_masks[idxBlock × r + idxRowBlock] 
 11                    // The next loop can be vectorized with vexpand 
 12                    for k ← 0 to c do 
 13                       if bit_shift(1 , k) BIT_AND valMask then 
 14                             sum[idxRowBlock] += x[idxCol+k] * mat.values[idxVal] 
 15                             idxVal += 1 
 16                       end 
 17                    end 
 18                   // To replace the k-loop 
 19                  sum[idxRowBlock] += simd_load(x[idxCol]) * 
simd_vexpand(mat.values[idxVal], valMask) 
 20              end 
 21           end 
 22           for idxRowBlock ← 0 to r do 
 23                 y[ridxRowBlock] += sum[r] 
 24                 y[ridxRowBlock] += simd_hsum(sum[r]) 
 25           end 
 26    end 
 _______________________________________________________________________________________________________    

Relation between matrix shape and number of blocks

Optimized kernel implementation

 
____________________________________________________________________________________ 
ALGORITHM 2: Scalar SpMV for a matrix mat in format β(1,V EC_SIZE) 
with test (vectorized in the second idxBlock loop). 
___________________________________________________________________________________ 
     Input: x : vector to multiply with the matrix. mat : a matrix in the block 
                format. 
     Output: y : the result of the product. 
  1  function spmv_scalar_1_VECSIZE(x, mat, y) 
  2      // Index to access to the values array 
  3      idxVal ← 0 
  4      for idxRow ← 0 to mat.numberOfRows-1 inc by 1 do 
  5           sum_scalar ← 0 
  6           sum_vec ← simd_set(0) 
  7           // Loop for mask equal to 1 
  8           for idxBlock ← mat.block_rowptr[idxRow] to 
mat.block_rowptr[idxRow]-1 do 
  9               idxCol ← mat.block_colidx[idxBlock] 
 10               valMask ← mat.block_masks[idxBlock] 
 11               if valMask not equal to 1 then 
 12                    goto loop-not-1 
 13               end 
 14               label loop-for-1: 
 15               sum_scalar += x[idxCol] * mat.values[idxVal] 
 16               idxVal += 1 
 17           end 
 18           goto end-of-loop 
 19           for idxBlock ← mat.block_rowptr[idxRow] to 
mat.block_rowptr[idxRow]-1 do 
 20                idxCol ← mat.block_colidx[idxBlock] 
 21                valMask ← mat.block_masks[idxBlock] 
 22                if valMask equal to 1 then 
 23                      goto loop-for-1 
 24                end 
 25                label loop-not-1: 
 26                vec_sum += simd_load(x[idxCol]) * 
simd_vexpand(mat.values[idxVal], valMask) 
 27                idxVal += pop_count(valMask) 
 28           end 
 29           label end-of-loop: 
 30           y[idxRowBlock+idxRow] += sum_scalar + simd_hsum(sum_vec); 
 31      end 
 ______________________________________________________________________________________    
 
 
1 extern  ”C ”  void  core_SPC5_1rVc_Spmv_asm_double (const  long  int nbRows ,  const int ∗ rowsSizes , 
2                                                     const  unsigned  char ∗ headers , const  double ∗ values , 
3                                                     const  double ∗ x , double ∗ y ) ; 
4 
5 //  ( nbRows  rdi  ,  rowsSizes  rsi  ,  headers  rdx  ,  values  rcx ,  x  r8 ,  y  r9  ) 
6 //  save  some  registers 
7 ( commented out ) . . . 
8 xorq %r12 , %r12 ; //  valIdx  =  0 
9 //  if  no  rows  in  the  matrix ,  jump  to  end 
10 test %rdi , %rdi ; 
11 jz   compute_Spmv512_avx_asm_double_out_exp ; 
12    xorq %r10 , %r10 ; //  idxRow / r10  =  0 
13    compute_Spmv512_avx_asm_double_loop_exp : 
14 
15    movslq 4(% rsi ,% r10 ,4) , %r11 ; //  rowsSizes [ idxRow +1] 
16    subl 0(% rsi ,% r10 ,4) , %r11d ; //  nbBlocks  =  rowsSizes [ idxRow +1]− rowsSizes [ idxRow ] 
17    //  if  no  blocks  for  this  row ,  jump  to  next  interval 
18    jz compute_Spmv512_avx_asm_double_outrow_exp ; 
19 
20    vpxorq %zmm0 ,%zmm0 ,%zmm0 ; //  %zmm0  sum  =  0 
21    compute_Spmv512_avx_asm_double_inrow_exp : 
22            movslq 0(% rdx ) , %r13 ; //  colIdx  =  ∗ rdx  or  ∗ headers 
23            movzbl 4(% rdx ) , %r14d ; //  mask  =  ∗( rdx +4)  or  ∗( headers +4) 
24            kmovw %r14d , %k1 ; //  mask 
25            vexpandpd (%rcx ,%r12 ,8) , %zmm1 { %k1 }{ z }; //  values  ( only  some of  them ) 
26            vfmadd231pd (%r8 ,%r13 ,8) , %zmm1 , %zmm0 ; //  mul  add  to  sum 
27 
28            popcntw %r14w , %r14w ; //  count  the  number  of  bits  in  the  mask 
29            addq %r14 , %r12 ; //  valIdx  +=  number  of  bits ( mask ) 
30            addq $5 , %rdx ; //  headers  +=  1  ∗  int  +  1  mask 
31            subq $1 , %r11 ; //  nbBlocks  −=1,  if  equal  zero  go  to  end  of  interval 
32     jnz compute_Spmv512_avx_asm_double_inrow_exp ; 
33 
34     compute_Spmv512_avx_asm_double_inrow_exp_stop : 
35     //  Horizontal  sum  from  ymm0  to  xmm0 
36     ( commented out ) . . . 
37     //  add  to  y ,  r9 [ r10 ]  =>  y [ idxRow ] 
38     vaddsd (%r9 ,%r10 ,8) , %xmm0 , %xmm0 ; 
39     vmovsd %xmm0 , (%r9 ,%r10 ,8) ; 
40     compute_Spmv512_avx_asm_double_outrow_exp : 
41 
42     addq $1 , %r10 ; //  idxRow  +=  1 
43     cmp %rdi , %r10 ; //  idxRow  ==  nbRows 
44     jne compute_Spmv512_avx_asm_double_loop_exp ; //  if  nbBlocks  !=  0  go  to beginning 
45 compute_Spmv512_avx_asm_double_out_exp :  
_______________________________________________________________________________________ 
 
                              Code 1: β(1,8) kernel in assembly language.    

Parallelization

Performance Analysis

Hardware/software

Test matrices

Sequential SpMV performance

Parallel SpMV performance

Performance Prediction and Optimal Kernel Selection

Performance polynomial interpolation (sequential)

Parallel performance estimation

Conclusions

Additional Information and Declarations

Competing Interests

Berenger Bramas is a researcher at MPCDF. Pavel Kus is a researcher at MPCDF.

Author Contributions

Bérenger Bramas and Pavel Kus conceived and designed the experiments, performed the experiments, analyzed the data, contributed reagents/materials/analysis tools, prepared figures and/or tables, performed the computation work, approved the final draft.

Data Availability

The following information was supplied regarding data availability:

GitHub: https://gitlab.mpcdf.mpg.de/bbramas/spc5

Funding

The authors received no funding for this work.

12 Citations 2,798 Views 690 Downloads

Your institution may have Open Access funds available for qualifying authors. See if you qualify

Publish for free

Comment on Articles or Preprints and we'll waive your author fee
Learn more

Five new journals in Chemistry

Free to publish • Peer-reviewed • From PeerJ
Find out more