i try speed calculation of average of 4d vectors placed in array. here code:
#include <sys/time.h> #include <sys/param.h> #include <stdlib.h> #include <stdio.h> #include <string.h> #include <xmmintrin.h> typedef float dot[4]; #define n 1000000 double gettime () { struct timeval tv; gettimeofday (&tv, 0); return (double)tv.tv_sec + (0.000001 * (double)tv.tv_usec); } void calc_avg1 (dot res, const dot array[], int n) { int i,j; memset (res, 0, sizeof (dot)); (i = 0; < n; i++) { (j = 0; j<4; j++) res[j] += array[i][j]; } (j = 0; j<4; j++) res[j] /= n; } void calc_avg2 (dot res, const dot array[], int n) { int i; __v4sf r = _mm_set1_ps (0.0); (i=0; i<n; i++) r += _mm_load_ps (array[i]); r /= _mm_set1_ps ((float)n); _mm_store_ps (res, r); } int main () { void *space = malloc (n*sizeof(dot)+15); dot *array = (dot*)(((unsigned long)space+15) & ~(unsigned long)15); dot avg __attribute__((aligned(16))); int i; double time; (i = 0; < n; i++) { array[i][0] = 1.0*random(); array[i][1] = 1.0*random(); array[i][2] = 1.0*random(); } time = gettime(); calc_avg1 (avg, array, n); time = gettime() - time; printf ("%f\n%f %f %f\n", time, avg[0], avg[1], avg[2]); time = gettime(); calc_avg2 (avg, array, n); time = gettime() - time; printf ("%f\n%f %f %f\n", time, avg[0], avg[1], avg[2]); return 0; } so can see calc_avg1 uses naive loops 0 4 , calc_avg2 replaces them sse instructions. compile code clang 3.4:
cc -o2 -o test test.c here disassemble of calc_avgx functions:
0000000000400860 <calc_avg1>: 400860: 55 push %rbp 400861: 48 89 e5 mov %rsp,%rbp 400864: 85 d2 test %edx,%edx 400866: 0f 57 c0 xorps %xmm0,%xmm0 400869: 0f 11 07 movups %xmm0,(%rdi) 40086c: 7e 42 jle 4008b0 <calc_avg1+0x50> 40086e: 48 83 c6 0c add $0xc,%rsi 400872: 0f 57 c0 xorps %xmm0,%xmm0 400875: 89 d0 mov %edx,%eax 400877: 0f 57 c9 xorps %xmm1,%xmm1 40087a: 0f 57 d2 xorps %xmm2,%xmm2 40087d: 0f 57 db xorps %xmm3,%xmm3 400880: f3 0f 58 5e f4 addss -0xc(%rsi),%xmm3 400885: f3 0f 11 1f movss %xmm3,(%rdi) 400889: f3 0f 58 56 f8 addss -0x8(%rsi),%xmm2 40088e: f3 0f 11 57 04 movss %xmm2,0x4(%rdi) 400893: f3 0f 58 4e fc addss -0x4(%rsi),%xmm1 400898: f3 0f 11 4f 08 movss %xmm1,0x8(%rdi) 40089d: f3 0f 58 06 addss (%rsi),%xmm0 4008a1: f3 0f 11 47 0c movss %xmm0,0xc(%rdi) 4008a6: 48 83 c6 10 add $0x10,%rsi 4008aa: ff c8 dec %eax 4008ac: 75 d2 jne 400880 <calc_avg1+0x20> 4008ae: eb 0c jmp 4008bc <calc_avg1+0x5c> 4008b0: 0f 57 c0 xorps %xmm0,%xmm0 4008b3: 0f 57 c9 xorps %xmm1,%xmm1 4008b6: 0f 57 d2 xorps %xmm2,%xmm2 4008b9: 0f 57 db xorps %xmm3,%xmm3 4008bc: f3 0f 2a e2 cvtsi2ss %edx,%xmm4 4008c0: f3 0f 5e dc divss %xmm4,%xmm3 4008c4: f3 0f 11 1f movss %xmm3,(%rdi) 4008c8: f3 0f 5e d4 divss %xmm4,%xmm2 4008cc: f3 0f 11 57 04 movss %xmm2,0x4(%rdi) 4008d1: f3 0f 5e cc divss %xmm4,%xmm1 4008d5: f3 0f 11 4f 08 movss %xmm1,0x8(%rdi) 4008da: f3 0f 5e c4 divss %xmm4,%xmm0 4008de: f3 0f 11 47 0c movss %xmm0,0xc(%rdi) 4008e3: 5d pop %rbp 4008e4: c3 retq 4008e5: 66 66 2e 0f 1f 84 00 nopw %cs:0x0(%rax,%rax,1) 4008ec: 00 00 00 00 00000000004008f0 <calc_avg2>: 4008f0: 55 push %rbp 4008f1: 48 89 e5 mov %rsp,%rbp 4008f4: 85 d2 test %edx,%edx 4008f6: 0f 57 c0 xorps %xmm0,%xmm0 4008f9: 7e 10 jle 40090b <calc_avg2+0x1b> 4008fb: 89 d0 mov %edx,%eax 4008fd: 0f 1f 00 nopl (%rax) 400900: 0f 58 06 addps (%rsi),%xmm0 400903: 48 83 c6 10 add $0x10,%rsi 400907: ff c8 dec %eax 400909: 75 f5 jne 400900 <calc_avg2+0x10> 40090b: 66 0f 6e ca movd %edx,%xmm1 40090f: 66 0f 70 c9 00 pshufd $0x0,%xmm1,%xmm1 400914: 0f 5b c9 cvtdq2ps %xmm1,%xmm1 400917: 0f 5e c1 divps %xmm1,%xmm0 40091a: 0f 29 07 movaps %xmm0,(%rdi) 40091d: 5d pop %rbp 40091e: c3 retq 40091f: 90 nop and here result:
> ./test 0.004287 1073864320.000000 1074018048.000000 1073044224.000000 0.003661 1073864320.000000 1074018048.000000 1073044224.000000 so sse version 1.17 times faster. when try seemingly same job, calculate average of single precision scalars in array (as described here sse reduction of float vector example), sse version runs 3.32 times faster. here code of calc_avgx functions:
float calc_avg1 (const float array[], int n) { int i; float avg = 0; (i = 0; < n; i++) avg += array[i]; return avg / n; } float calc_avg3 (const float array[], int n) { int i; __v4sf r = _mm_set1_ps (0.0); (i=0; i<n; i+=4) r += _mm_load_ps (&(array[i])); r = _mm_hadd_ps (r, r); r = _mm_hadd_ps (r, r); return r[0] / n; } so question this: why benefit sse in last example (calculation of average of single float scalars) , not in first (calculation of average of 4d vectors)? me, these jobs same. right way speed calculations in first example, if wrong?
upd: if think it's relevant, supply disassembly of second example, average of scalars calculated (also compiled clang3.4 -o2).
0000000000400860 <calc_avg1>: 400860: 55 push %rbp 400861: 48 89 e5 mov %rsp,%rbp 400864: 85 d2 test %edx,%edx 400866: 0f 57 c0 xorps %xmm0,%xmm0 400869: 0f 11 07 movups %xmm0,(%rdi) 40086c: 7e 42 jle 4008b0 <calc_avg1+0x50> 40086e: 48 83 c6 0c add $0xc,%rsi 400872: 0f 57 c0 xorps %xmm0,%xmm0 400875: 89 d0 mov %edx,%eax 400877: 0f 57 c9 xorps %xmm1,%xmm1 40087a: 0f 57 d2 xorps %xmm2,%xmm2 40087d: 0f 57 db xorps %xmm3,%xmm3 400880: f3 0f 58 5e f4 addss -0xc(%rsi),%xmm3 400885: f3 0f 11 1f movss %xmm3,(%rdi) 400889: f3 0f 58 56 f8 addss -0x8(%rsi),%xmm2 40088e: f3 0f 11 57 04 movss %xmm2,0x4(%rdi) 400893: f3 0f 58 4e fc addss -0x4(%rsi),%xmm1 400898: f3 0f 11 4f 08 movss %xmm1,0x8(%rdi) 40089d: f3 0f 58 06 addss (%rsi),%xmm0 4008a1: f3 0f 11 47 0c movss %xmm0,0xc(%rdi) 4008a6: 48 83 c6 10 add $0x10,%rsi 4008aa: ff c8 dec %eax 4008ac: 75 d2 jne 400880 <calc_avg1+0x20> 4008ae: eb 0c jmp 4008bc <calc_avg1+0x5c> 4008b0: 0f 57 c0 xorps %xmm0,%xmm0 4008b3: 0f 57 c9 xorps %xmm1,%xmm1 4008b6: 0f 57 d2 xorps %xmm2,%xmm2 4008b9: 0f 57 db xorps %xmm3,%xmm3 4008bc: f3 0f 2a e2 cvtsi2ss %edx,%xmm4 4008c0: f3 0f 5e dc divss %xmm4,%xmm3 4008c4: f3 0f 11 1f movss %xmm3,(%rdi) 4008c8: f3 0f 5e d4 divss %xmm4,%xmm2 4008cc: f3 0f 11 57 04 movss %xmm2,0x4(%rdi) 4008d1: f3 0f 5e cc divss %xmm4,%xmm1 4008d5: f3 0f 11 4f 08 movss %xmm1,0x8(%rdi) 4008da: f3 0f 5e c4 divss %xmm4,%xmm0 4008de: f3 0f 11 47 0c movss %xmm0,0xc(%rdi) 4008e3: 5d pop %rbp 4008e4: c3 retq 4008e5: 66 66 2e 0f 1f 84 00 nopw %cs:0x0(%rax,%rax,1) 4008ec: 00 00 00 00 00000000004008d0 <calc_avg3>: 4008d0: 55 push %rbp 4008d1: 48 89 e5 mov %rsp,%rbp 4008d4: 31 c0 xor %eax,%eax 4008d6: 85 f6 test %esi,%esi 4008d8: 0f 57 c0 xorps %xmm0,%xmm0 4008db: 7e 0f jle 4008ec <calc_avg3+0x1c> 4008dd: 0f 1f 00 nopl (%rax) 4008e0: 0f 58 04 87 addps (%rdi,%rax,4),%xmm0 4008e4: 48 83 c0 04 add $0x4,%rax 4008e8: 39 f0 cmp %esi,%eax 4008ea: 7c f4 jl 4008e0 <calc_avg3+0x10> 4008ec: 66 0f 70 c8 01 pshufd $0x1,%xmm0,%xmm1 4008f1: f3 0f 58 c8 addss %xmm0,%xmm1 4008f5: 66 0f 70 d0 03 pshufd $0x3,%xmm0,%xmm2 4008fa: 0f 12 c0 movhlps %xmm0,%xmm0 4008fd: f3 0f 58 c1 addss %xmm1,%xmm0 400901: f3 0f 58 c2 addss %xmm2,%xmm0 400905: 0f 57 c9 xorps %xmm1,%xmm1 400908: f3 0f 2a ce cvtsi2ss %esi,%xmm1 40090c: f3 0f 5e c1 divss %xmm1,%xmm0 400910: 5d pop %rbp 400911: c3 retq 400912: 66 66 66 66 66 2e 0f nopw %cs:0x0(%rax,%rax,1) 400919: 1f 84 00 00 00 00 00
sorry answer got little long , rambling. ran benchmarks, didn't take long time editing earlier stuff after thinking of else try.
your working set 15.25mib (16mb). benchmark routine this, you'd average smaller buffer multiple times, fits in cache. don't see difference between slow version , fast version because diff hidden memory bottleneck.
calc_avg1 doesn't auto-vectorize @ (note addss. ss means scalar, single-precision, opposed addps (packed single-precision)). think can't autovectorize when inlined main because can't sure there aren't nans in 4th vector position, cause fp exception scalar code wouldn't have. tried compiling sandybridge gcc 4.9.2 -o3 -march=native -ffast-math, , clang-3.5, no luck either.
even so, version inlined main runs slower, because memory bottleneck. 32bit loads can keep 128b loads, when hitting main memory. (the non-inlined version bad, though: every += result stored res array, because loop accumulates directly memory might have other references it. has make every operation visible store. version posted disassembly for, btw. sorting out part of main assisted compiling -s -fverbose-asm.)
somewhat disappointingly, clang , gcc can't auto-vectorize __v4sf 4-wide 8-wide avx.
scratch that, after wrapping for (int i=0; i<4000 ; i++) around calls calc_avgx, , reducing n 10k, gcc -o3 turns inner inner-loop of avg1 into:
400690: c5 f8 10 08 vmovups (%rax),%xmm1 400694: 48 83 c0 20 add $0x20,%rax 400698: c4 e3 75 18 48 f0 01 vinsertf128 $0x1,-0x10(%rax),%ymm1,%ymm1 40069f: c5 fc 58 c1 vaddps %ymm1,%ymm0,%ymm0 4006a3: 48 39 d8 cmp %rbx,%rax 4006a6: 75 e8 jne 400690 <main+0xe0> $ (get cpu max-turbo frequency) && time ./a.out 0.016515 1071570752.000000 1066917696.000000 1073897344.000000 0.032875 1071570944.000000 1066916416.000000 1073895680.000000 this bizzare; have no idea why doesn't use 32b loads. use 32b vaddps, bottleneck when working data set fits in l2 cache.
idk why managed auto-vectorize inner loop when inside loop. note only applies version inlined main. callable version still scalar-only. note gcc managed this. clang 3.5 didn't. maybe gcc knew going use malloc in way returned zeroed buffer (so didn't have worry nans in 4th element)?
i'm surprised clang's non-vectorized avg1 not slower, when fits in cache. n=10000, repeat-count = 40k.
3.3ghz snb i5 2500k, max turbo = 3.8ghz. avg1: 0.350422s: clang -o3 -march=native (not vectorized. loop of 6 scalar addss memory operands) avg2: 0.320173s: clang -o3 -march=native avg1: 0.497040s: clang -o3 -march=native -ffast-math (haven't looked @ asm see happened) avg1: 0.160374s: gcc -o3 -march=native (256b addps, 2 128b loads) avg2: 0.321028s: gcc -o3 -march=native (128b addps memory operand) avg2: ~0.16: clang, unrolled 2 dependency chains hide latency (see below). avg2: ~0.08: unrolled 4 dep chains avg2: ~0.04: in theory unrolled-by-4 256b avx. didn't try unrolling 1 gcc auto-vectorized 256b addps so big surprise scalar-only clang avg1 code keeps avg2. perhaps loop-carried dependency chain bigger bottleneck?
perf showing 1.47 insns per cycle clang's non-vectorized avg1, saturating fp add unit on port 1. (most of loop instructions adds).
however, avg2, using 128b addps memory operand, getting 0.58 insns per cycle. reducing array size factor of 10, n=1000, gets 0.60 insns per cycle, because of more time spend in prologue/epilogue. there's serious issue loop-carried dependency chain, think. clang unrolls loop 4, uses single accumulator. loop has 7 instructions, decodes 10 uops. (each vaddps 2, since it's used memory operand 2-register addressing mode, preventing micro-fusion. cmp , jne macro-fuse). http://www.brendangregg.com/perf.html says perf event uops_dispatched.core r2b1, so:
$ perf stat -d -e cycles,instructions,r2b1 ./a.out 0.031793 1053298112.000000 1052673664.000000 1116960256.000000 performance counter stats './a.out': 118,453,541 cycles 71,181,299 instructions # 0.60 insns per cycle 102,025,443 r2b1 # uops, perf doesn't have nice name 40,256,019 l1-dcache-loads 21,254 l1-dcache-load-misses # 0.05% of l1-dcache hits 9,588 llc-loads 0 llc-load-misses:hg # 0.00% of ll-cache hits 0.032276233 seconds time elapsed this confirms 7:10 instructions uops analysis. that's not relevant performance problem here: loop running way slower 4 uops per cycle upper limit. changing inner loop 2 separate dep chains going doubles throughput (60m cycles instead of 117m, 81m insns instead of 71m):
for (i=0; i<n-1; i+=2) { // todo: make sure loop end condition correct r0 += _mm_load_ps (array[i]); r1 += _mm_load_ps (array[i+1]); } r0 += r1; unrolling 4 (with 4 accumulators merge @ end of loop) doubles performance again. (down 42m cycles, 81m insns, 112m uops.) inner loop has 4x vaddps -0x30(%rcx),%xmm4,%xmm4 (and similar), 2x add, cmp, jl. form of vaddps should micro-fuse, i'm still seeing lot more uops instructions, guess r2b1 counts unfused-domain uops. (linux perf doesn't have docs platform-specific hw events). cranking n again, make sure it's innermost loop dominates counts, see uop:insn ratio of 1.39, matches 8 insns, 11 uops (1.375) (counting vaddps 2, counting cmp + jl one). found http://www.bnikolic.co.uk/blog/hpc-prof-events.html, has full list of supported perf events, including codes sandybridge. (and instructions on how dump table other cpu). (look code: line in each block. need umask byte, , code, arg perf.)
# a.out avg2, unrolled-by-4 version. $ perf stat -d -e cycles,instructions,r14a1,r2b1,r10e,r2c2,r1c2 ./a.out 0.011331 1053298752.000000 1052674496.000000 1116959488.000000 performance counter stats './a.out': 42,250,312 cycles [34.11%] 56,103,429 instructions # 1.33 insns per cycle 20,864,416 r14a1 # uops_dispatched_port: 0x14=port2&3 loads 111,943,380 r2b1 # uops_dispatched: (2->umask 00 -> core, thread). 72,208,772 r10e # uops_issued: fused-domain 71,422,907 r2c2 # uops_retired: retirement slots used (fused-domain) 111,597,049 r1c2 # uops_retired: (unfused-domain) 0 l1-dcache-loads 18,470 l1-dcache-load-misses # 0.00% of l1-dcache hits 5,717 llc-loads [66.05%] 0 llc-load-misses:hg # 0.00% of ll-cache hits 0.011920301 seconds time elapsed so yeah, looks can count fused-domain , unfused-domain uops!
unrolling 8 doesn't @ all: still 42m cycles. (but down 61m insns, , 97m uops, less loop overhead). neat, clang uses sub $-128, %rsi instead of add, because -128 fits in imm8, +128 doesn't. guess unrolling 4 enough saturate fp add port.
as 1avg functions return single float, rather vector, clang didn't auto-vectorize first one, gcc does. emits giant prologue , epilogue scalar sums until gets aligned address, 32b avx vaddps in small loop. found bigger speed diff them, maybe testing smaller buffer? account seeing big speedup vector code vs. non-vector.
Comments
Post a Comment