c - Calculate 4d vectors average with SSE -


i try speed calculation of average of 4d vectors placed in array. here code:

#include <sys/time.h> #include <sys/param.h> #include <stdlib.h> #include <stdio.h> #include <string.h> #include <xmmintrin.h>  typedef float dot[4]; #define n 1000000  double gettime () {     struct timeval tv;     gettimeofday (&tv, 0);     return (double)tv.tv_sec + (0.000001 * (double)tv.tv_usec); }  void calc_avg1 (dot res, const dot array[], int n) {     int i,j;     memset (res, 0, sizeof (dot));     (i = 0; < n; i++)     {         (j = 0; j<4; j++) res[j] += array[i][j];     }     (j = 0; j<4; j++) res[j] /= n; }  void calc_avg2 (dot res, const dot array[], int n) {     int i;     __v4sf r = _mm_set1_ps (0.0);     (i=0; i<n; i++) r += _mm_load_ps (array[i]);     r /= _mm_set1_ps ((float)n);     _mm_store_ps (res, r); }  int main () {     void *space = malloc (n*sizeof(dot)+15);     dot *array = (dot*)(((unsigned long)space+15) & ~(unsigned long)15);     dot avg __attribute__((aligned(16)));     int i;     double time;      (i = 0; < n; i++)     {         array[i][0] = 1.0*random();         array[i][1] = 1.0*random();         array[i][2] = 1.0*random();     }     time = gettime();     calc_avg1 (avg, array, n);     time = gettime() - time;     printf ("%f\n%f %f %f\n", time, avg[0], avg[1], avg[2]);      time = gettime();     calc_avg2 (avg, array, n);     time = gettime() - time;     printf ("%f\n%f %f %f\n", time, avg[0], avg[1], avg[2]);     return 0; } 

so can see calc_avg1 uses naive loops 0 4 , calc_avg2 replaces them sse instructions. compile code clang 3.4:

cc -o2 -o test test.c 

here disassemble of calc_avgx functions:

0000000000400860 <calc_avg1>:   400860:   55                      push   %rbp   400861:   48 89 e5                mov    %rsp,%rbp   400864:   85 d2                   test   %edx,%edx   400866:   0f 57 c0                xorps  %xmm0,%xmm0   400869:   0f 11 07                movups %xmm0,(%rdi)   40086c:   7e 42                   jle    4008b0 <calc_avg1+0x50>   40086e:   48 83 c6 0c             add    $0xc,%rsi   400872:   0f 57 c0                xorps  %xmm0,%xmm0   400875:   89 d0                   mov    %edx,%eax   400877:   0f 57 c9                xorps  %xmm1,%xmm1   40087a:   0f 57 d2                xorps  %xmm2,%xmm2   40087d:   0f 57 db                xorps  %xmm3,%xmm3   400880:   f3 0f 58 5e f4          addss  -0xc(%rsi),%xmm3   400885:   f3 0f 11 1f             movss  %xmm3,(%rdi)   400889:   f3 0f 58 56 f8          addss  -0x8(%rsi),%xmm2   40088e:   f3 0f 11 57 04          movss  %xmm2,0x4(%rdi)   400893:   f3 0f 58 4e fc          addss  -0x4(%rsi),%xmm1   400898:   f3 0f 11 4f 08          movss  %xmm1,0x8(%rdi)   40089d:   f3 0f 58 06             addss  (%rsi),%xmm0   4008a1:   f3 0f 11 47 0c          movss  %xmm0,0xc(%rdi)   4008a6:   48 83 c6 10             add    $0x10,%rsi   4008aa:   ff c8                   dec    %eax   4008ac:   75 d2                   jne    400880 <calc_avg1+0x20>   4008ae:   eb 0c                   jmp    4008bc <calc_avg1+0x5c>   4008b0:   0f 57 c0                xorps  %xmm0,%xmm0   4008b3:   0f 57 c9                xorps  %xmm1,%xmm1   4008b6:   0f 57 d2                xorps  %xmm2,%xmm2   4008b9:   0f 57 db                xorps  %xmm3,%xmm3   4008bc:   f3 0f 2a e2             cvtsi2ss %edx,%xmm4   4008c0:   f3 0f 5e dc             divss  %xmm4,%xmm3   4008c4:   f3 0f 11 1f             movss  %xmm3,(%rdi)   4008c8:   f3 0f 5e d4             divss  %xmm4,%xmm2   4008cc:   f3 0f 11 57 04          movss  %xmm2,0x4(%rdi)   4008d1:   f3 0f 5e cc             divss  %xmm4,%xmm1   4008d5:   f3 0f 11 4f 08          movss  %xmm1,0x8(%rdi)   4008da:   f3 0f 5e c4             divss  %xmm4,%xmm0   4008de:   f3 0f 11 47 0c          movss  %xmm0,0xc(%rdi)   4008e3:   5d                      pop    %rbp   4008e4:   c3                      retq      4008e5:   66 66 2e 0f 1f 84 00    nopw   %cs:0x0(%rax,%rax,1)   4008ec:   00 00 00 00   00000000004008f0 <calc_avg2>:   4008f0:   55                      push   %rbp   4008f1:   48 89 e5                mov    %rsp,%rbp   4008f4:   85 d2                   test   %edx,%edx   4008f6:   0f 57 c0                xorps  %xmm0,%xmm0   4008f9:   7e 10                   jle    40090b <calc_avg2+0x1b>   4008fb:   89 d0                   mov    %edx,%eax   4008fd:   0f 1f 00                nopl   (%rax)   400900:   0f 58 06                addps  (%rsi),%xmm0   400903:   48 83 c6 10             add    $0x10,%rsi   400907:   ff c8                   dec    %eax   400909:   75 f5                   jne    400900 <calc_avg2+0x10>   40090b:   66 0f 6e ca             movd   %edx,%xmm1   40090f:   66 0f 70 c9 00          pshufd $0x0,%xmm1,%xmm1   400914:   0f 5b c9                cvtdq2ps %xmm1,%xmm1   400917:   0f 5e c1                divps  %xmm1,%xmm0   40091a:   0f 29 07                movaps %xmm0,(%rdi)   40091d:   5d                      pop    %rbp   40091e:   c3                      retq      40091f:   90                      nop     

and here result:

> ./test 0.004287 1073864320.000000 1074018048.000000 1073044224.000000 0.003661 1073864320.000000 1074018048.000000 1073044224.000000 

so sse version 1.17 times faster. when try seemingly same job, calculate average of single precision scalars in array (as described here sse reduction of float vector example), sse version runs 3.32 times faster. here code of calc_avgx functions:

float calc_avg1 (const float array[], int n) {     int i;     float avg = 0;     (i = 0; < n; i++) avg += array[i];     return avg / n; }  float calc_avg3 (const float array[], int n) {     int i;     __v4sf r = _mm_set1_ps (0.0);     (i=0; i<n; i+=4) r += _mm_load_ps (&(array[i]));     r = _mm_hadd_ps (r, r);     r = _mm_hadd_ps (r, r);     return r[0] / n; } 

so question this: why benefit sse in last example (calculation of average of single float scalars) , not in first (calculation of average of 4d vectors)? me, these jobs same. right way speed calculations in first example, if wrong?

upd: if think it's relevant, supply disassembly of second example, average of scalars calculated (also compiled clang3.4 -o2).

0000000000400860 <calc_avg1>:   400860:   55                      push   %rbp   400861:   48 89 e5                mov    %rsp,%rbp   400864:   85 d2                   test   %edx,%edx   400866:   0f 57 c0                xorps  %xmm0,%xmm0   400869:   0f 11 07                movups %xmm0,(%rdi)   40086c:   7e 42                   jle    4008b0 <calc_avg1+0x50>   40086e:   48 83 c6 0c             add    $0xc,%rsi   400872:   0f 57 c0                xorps  %xmm0,%xmm0   400875:   89 d0                   mov    %edx,%eax   400877:   0f 57 c9                xorps  %xmm1,%xmm1   40087a:   0f 57 d2                xorps  %xmm2,%xmm2   40087d:   0f 57 db                xorps  %xmm3,%xmm3   400880:   f3 0f 58 5e f4          addss  -0xc(%rsi),%xmm3   400885:   f3 0f 11 1f             movss  %xmm3,(%rdi)   400889:   f3 0f 58 56 f8          addss  -0x8(%rsi),%xmm2   40088e:   f3 0f 11 57 04          movss  %xmm2,0x4(%rdi)   400893:   f3 0f 58 4e fc          addss  -0x4(%rsi),%xmm1   400898:   f3 0f 11 4f 08          movss  %xmm1,0x8(%rdi)   40089d:   f3 0f 58 06             addss  (%rsi),%xmm0   4008a1:   f3 0f 11 47 0c          movss  %xmm0,0xc(%rdi)   4008a6:   48 83 c6 10             add    $0x10,%rsi   4008aa:   ff c8                   dec    %eax   4008ac:   75 d2                   jne    400880 <calc_avg1+0x20>   4008ae:   eb 0c                   jmp    4008bc <calc_avg1+0x5c>   4008b0:   0f 57 c0                xorps  %xmm0,%xmm0   4008b3:   0f 57 c9                xorps  %xmm1,%xmm1   4008b6:   0f 57 d2                xorps  %xmm2,%xmm2   4008b9:   0f 57 db                xorps  %xmm3,%xmm3   4008bc:   f3 0f 2a e2             cvtsi2ss %edx,%xmm4   4008c0:   f3 0f 5e dc             divss  %xmm4,%xmm3   4008c4:   f3 0f 11 1f             movss  %xmm3,(%rdi)   4008c8:   f3 0f 5e d4             divss  %xmm4,%xmm2   4008cc:   f3 0f 11 57 04          movss  %xmm2,0x4(%rdi)   4008d1:   f3 0f 5e cc             divss  %xmm4,%xmm1   4008d5:   f3 0f 11 4f 08          movss  %xmm1,0x8(%rdi)   4008da:   f3 0f 5e c4             divss  %xmm4,%xmm0   4008de:   f3 0f 11 47 0c          movss  %xmm0,0xc(%rdi)   4008e3:   5d                      pop    %rbp   4008e4:   c3                      retq      4008e5:   66 66 2e 0f 1f 84 00    nopw   %cs:0x0(%rax,%rax,1)   4008ec:   00 00 00 00   00000000004008d0 <calc_avg3>:   4008d0:   55                      push   %rbp   4008d1:   48 89 e5                mov    %rsp,%rbp   4008d4:   31 c0                   xor    %eax,%eax   4008d6:   85 f6                   test   %esi,%esi   4008d8:   0f 57 c0                xorps  %xmm0,%xmm0   4008db:   7e 0f                   jle    4008ec <calc_avg3+0x1c>   4008dd:   0f 1f 00                nopl   (%rax)   4008e0:   0f 58 04 87             addps  (%rdi,%rax,4),%xmm0   4008e4:   48 83 c0 04             add    $0x4,%rax   4008e8:   39 f0                   cmp    %esi,%eax   4008ea:   7c f4                   jl     4008e0 <calc_avg3+0x10>   4008ec:   66 0f 70 c8 01          pshufd $0x1,%xmm0,%xmm1   4008f1:   f3 0f 58 c8             addss  %xmm0,%xmm1   4008f5:   66 0f 70 d0 03          pshufd $0x3,%xmm0,%xmm2   4008fa:   0f 12 c0                movhlps %xmm0,%xmm0   4008fd:   f3 0f 58 c1             addss  %xmm1,%xmm0   400901:   f3 0f 58 c2             addss  %xmm2,%xmm0   400905:   0f 57 c9                xorps  %xmm1,%xmm1   400908:   f3 0f 2a ce             cvtsi2ss %esi,%xmm1   40090c:   f3 0f 5e c1             divss  %xmm1,%xmm0   400910:   5d                      pop    %rbp   400911:   c3                      retq      400912:   66 66 66 66 66 2e 0f    nopw   %cs:0x0(%rax,%rax,1)   400919:   1f 84 00 00 00 00 00  

sorry answer got little long , rambling. ran benchmarks, didn't take long time editing earlier stuff after thinking of else try.

your working set 15.25mib (16mb). benchmark routine this, you'd average smaller buffer multiple times, fits in cache. don't see difference between slow version , fast version because diff hidden memory bottleneck.

calc_avg1 doesn't auto-vectorize @ (note addss. ss means scalar, single-precision, opposed addps (packed single-precision)). think can't autovectorize when inlined main because can't sure there aren't nans in 4th vector position, cause fp exception scalar code wouldn't have. tried compiling sandybridge gcc 4.9.2 -o3 -march=native -ffast-math, , clang-3.5, no luck either.

even so, version inlined main runs slower, because memory bottleneck. 32bit loads can keep 128b loads, when hitting main memory. (the non-inlined version bad, though: every += result stored res array, because loop accumulates directly memory might have other references it. has make every operation visible store. version posted disassembly for, btw. sorting out part of main assisted compiling -s -fverbose-asm.)

somewhat disappointingly, clang , gcc can't auto-vectorize __v4sf 4-wide 8-wide avx.

scratch that, after wrapping for (int i=0; i<4000 ; i++) around calls calc_avgx, , reducing n 10k, gcc -o3 turns inner inner-loop of avg1 into:

  400690:       c5 f8 10 08             vmovups (%rax),%xmm1   400694:       48 83 c0 20             add    $0x20,%rax   400698:       c4 e3 75 18 48 f0 01    vinsertf128 $0x1,-0x10(%rax),%ymm1,%ymm1   40069f:       c5 fc 58 c1             vaddps %ymm1,%ymm0,%ymm0   4006a3:       48 39 d8                cmp    %rbx,%rax   4006a6:       75 e8                   jne    400690 <main+0xe0>  $ (get cpu max-turbo frequency) && time ./a.out 0.016515 1071570752.000000 1066917696.000000 1073897344.000000 0.032875 1071570944.000000 1066916416.000000 1073895680.000000 

this bizzare; have no idea why doesn't use 32b loads. use 32b vaddps, bottleneck when working data set fits in l2 cache.

idk why managed auto-vectorize inner loop when inside loop. note only applies version inlined main. callable version still scalar-only. note gcc managed this. clang 3.5 didn't. maybe gcc knew going use malloc in way returned zeroed buffer (so didn't have worry nans in 4th element)?

i'm surprised clang's non-vectorized avg1 not slower, when fits in cache. n=10000, repeat-count = 40k.

3.3ghz snb i5 2500k, max turbo = 3.8ghz. avg1: 0.350422s:  clang -o3 -march=native (not vectorized.  loop of 6 scalar addss memory operands) avg2: 0.320173s:  clang -o3 -march=native avg1: 0.497040s:  clang -o3 -march=native -ffast-math (haven't looked @ asm see happened)  avg1: 0.160374s:  gcc -o3 -march=native (256b addps, 2 128b loads) avg2: 0.321028s:  gcc -o3 -march=native (128b addps memory operand)  avg2: ~0.16:  clang, unrolled 2 dependency chains hide latency (see below). avg2: ~0.08:  unrolled 4 dep chains avg2: ~0.04:  in theory unrolled-by-4 256b avx.  didn't try unrolling 1 gcc auto-vectorized 256b addps 

so big surprise scalar-only clang avg1 code keeps avg2. perhaps loop-carried dependency chain bigger bottleneck?

perf showing 1.47 insns per cycle clang's non-vectorized avg1, saturating fp add unit on port 1. (most of loop instructions adds).

however, avg2, using 128b addps memory operand, getting 0.58 insns per cycle. reducing array size factor of 10, n=1000, gets 0.60 insns per cycle, because of more time spend in prologue/epilogue. there's serious issue loop-carried dependency chain, think. clang unrolls loop 4, uses single accumulator. loop has 7 instructions, decodes 10 uops. (each vaddps 2, since it's used memory operand 2-register addressing mode, preventing micro-fusion. cmp , jne macro-fuse). http://www.brendangregg.com/perf.html says perf event uops_dispatched.core r2b1, so:

$ perf stat -d -e cycles,instructions,r2b1 ./a.out 0.031793 1053298112.000000 1052673664.000000 1116960256.000000   performance counter stats './a.out':         118,453,541      cycles         71,181,299      instructions              #    0.60  insns per cycle        102,025,443      r2b1  # uops, perf doesn't have nice name         40,256,019      l1-dcache-loads             21,254      l1-dcache-load-misses     #    0.05% of l1-dcache hits              9,588      llc-loads                  0      llc-load-misses:hg        #    0.00% of ll-cache hits         0.032276233 seconds time elapsed 

this confirms 7:10 instructions uops analysis. that's not relevant performance problem here: loop running way slower 4 uops per cycle upper limit. changing inner loop 2 separate dep chains going doubles throughput (60m cycles instead of 117m, 81m insns instead of 71m):

for (i=0; i<n-1; i+=2) {  // todo: make sure loop end condition correct    r0 += _mm_load_ps (array[i]);    r1 += _mm_load_ps (array[i+1]); } r0 += r1; 

unrolling 4 (with 4 accumulators merge @ end of loop) doubles performance again. (down 42m cycles, 81m insns, 112m uops.) inner loop has 4x vaddps -0x30(%rcx),%xmm4,%xmm4 (and similar), 2x add, cmp, jl. form of vaddps should micro-fuse, i'm still seeing lot more uops instructions, guess r2b1 counts unfused-domain uops. (linux perf doesn't have docs platform-specific hw events). cranking n again, make sure it's innermost loop dominates counts, see uop:insn ratio of 1.39, matches 8 insns, 11 uops (1.375) (counting vaddps 2, counting cmp + jl one). found http://www.bnikolic.co.uk/blog/hpc-prof-events.html, has full list of supported perf events, including codes sandybridge. (and instructions on how dump table other cpu). (look code: line in each block. need umask byte, , code, arg perf.)

# a.out avg2, unrolled-by-4 version. $ perf stat -d -e cycles,instructions,r14a1,r2b1,r10e,r2c2,r1c2 ./a.out 0.011331 1053298752.000000 1052674496.000000 1116959488.000000   performance counter stats './a.out':          42,250,312      cycles                    [34.11%]         56,103,429      instructions              #    1.33  insns per cycle         20,864,416      r14a1 # uops_dispatched_port: 0x14=port2&3 loads        111,943,380      r2b1 # uops_dispatched: (2->umask 00 -> core, thread).         72,208,772      r10e # uops_issued: fused-domain         71,422,907      r2c2 # uops_retired: retirement slots used (fused-domain)        111,597,049      r1c2 # uops_retired: (unfused-domain)                  0      l1-dcache-loads             18,470      l1-dcache-load-misses     #    0.00% of l1-dcache hits              5,717      llc-loads                                                    [66.05%]                  0      llc-load-misses:hg        #    0.00% of ll-cache hits         0.011920301 seconds time elapsed 

so yeah, looks can count fused-domain , unfused-domain uops!

unrolling 8 doesn't @ all: still 42m cycles. (but down 61m insns, , 97m uops, less loop overhead). neat, clang uses sub $-128, %rsi instead of add, because -128 fits in imm8, +128 doesn't. guess unrolling 4 enough saturate fp add port.

as 1avg functions return single float, rather vector, clang didn't auto-vectorize first one, gcc does. emits giant prologue , epilogue scalar sums until gets aligned address, 32b avx vaddps in small loop. found bigger speed diff them, maybe testing smaller buffer? account seeing big speedup vector code vs. non-vector.


Comments