## 從簡單的 C 語言函式來看現代 Compiler 使用 SIMD 的威力

double data[LEN];

void compute()
{
const double A = 1.1, B = 2.2, C = 3.3;

int i;
for(i=0; i<LEN; i++) {
data[i] = A*i*i + B*i + C;
}
}


void compute()
{
const double A = 1.1, B = 2.2, C = 3.3;
const double A2 = A+A;
double Z = A+B;
double Y = C;

int i;
for(i=0; i<LEN; i++) {
data[i] = Y;
Y += Z;
Z += A2;
}
}


int i;
for (i = 0; i < LEN; i += 4) {
data[i+0] = A*(i+0)*(i+0) + B*(i+0) + C;
data[i+1] = A*(i+1)*(i+1) + B*(i+1) + C;
data[i+2] = A*(i+2)*(i+2) + B*(i+2) + C;
data[i+3] = A*(i+3)*(i+3) + B*(i+3) + C;
}


$repeat 10 ./a [-] Took: 248830 ns. [-] Took: 249150 ns. [-] Took: 248760 ns. [-] Took: 248730 ns. [-] Took: 248770 ns. [-] Took: 248861 ns. [-] Took: 248760 ns. [-] Took: 253050 ns. [-] Took: 248640 ns. [-] Took: 249211 ns.$ repeat 10 ./b
[-] Took: 686660 ns.
[-] Took: 696090 ns.
[-] Took: 696310 ns.
[-] Took: 694431 ns.
[-] Took: 691971 ns.
[-] Took: 697690 ns.
[-] Took: 693241 ns.
[-] Took: 692900 ns.
[-] Took: 654751 ns.
[-] Took: 679101 ns.


13a0:       66 0f 6f c2             movdqa xmm0,xmm2
13a4:       48 83 c0 20             add    rax,0x20
13a8:       66 0f fe d6             paddd  xmm2,xmm6
13ac:       f3 0f e6 f8             cvtdq2pd xmm7,xmm0
13b0:       66 0f 28 cf             movapd xmm1,xmm7
13b4:       66 0f 70 c0 ee          pshufd xmm0,xmm0,0xee
13b9:       66 0f 59 cd             mulpd  xmm1,xmm5
13bd:       f3 0f e6 c0             cvtdq2pd xmm0,xmm0
13c1:       66 0f 59 cf             mulpd  xmm1,xmm7
13c5:       66 0f 59 fc             mulpd  xmm7,xmm4
13c9:       66 0f 58 cf             addpd  xmm1,xmm7
13cd:       66 0f 58 cb             addpd  xmm1,xmm3
13d1:       0f 29 48 e0             movaps XMMWORD PTR [rax-0x20],xmm1
13d5:       66 0f 28 c8             movapd xmm1,xmm0
13d9:       66 0f 59 cd             mulpd  xmm1,xmm5
13dd:       66 0f 59 c8             mulpd  xmm1,xmm0
13e1:       66 0f 59 c4             mulpd  xmm0,xmm4
13e5:       66 0f 58 c1             addpd  xmm0,xmm1
13e9:       66 0f 58 c3             addpd  xmm0,xmm3
13ed:       0f 29 40 f0             movaps XMMWORD PTR [rax-0x10],xmm0
13f1:       48 39 c2                cmp    rdx,rax
13f4:       75 aa                   jne    13a0 <compute+0x40>


1340:       f2 0f 11 08             movsd  QWORD PTR [rax],xmm1
1344:       48 83 c0 08             add    rax,0x8
1348:       f2 0f 58 c8             addsd  xmm1,xmm0
134c:       f2 0f 58 c2             addsd  xmm0,xmm2
1350:       48 39 d0                cmp    rax,rdx
1353:       75 eb                   jne    1340 <compute+0x30>


$repeat 10 ./a [-] Took: 571140 ns. [-] Took: 570280 ns. [-] Took: 571271 ns. [-] Took: 573971 ns. [-] Took: 571981 ns. [-] Took: 569650 ns. [-] Took: 566361 ns. [-] Took: 571600 ns. [-] Took: 571330 ns. [-] Took: 571030 ns.$ repeat 10 ./b
[-] Took: 697521 ns.
[-] Took: 696961 ns.
[-] Took: 696201 ns.
[-] Took: 694921 ns.
[-] Took: 696930 ns.
[-] Took: 695001 ns.
[-] Took: 701661 ns.
[-] Took: 698100 ns.
[-] Took: 702430 ns.
[-] Took: 702641 ns.


11b1:       66 0f ef c9             pxor   xmm1,xmm1
11b5:       f2 0f 2a c8             cvtsi2sd xmm1,eax
11b9:       66 0f 28 c1             movapd xmm0,xmm1
11bd:       f2 0f 59 c4             mulsd  xmm0,xmm4
11c1:       f2 0f 59 c1             mulsd  xmm0,xmm1
11c5:       f2 0f 59 cb             mulsd  xmm1,xmm3
11c9:       f2 0f 58 c1             addsd  xmm0,xmm1
11cd:       f2 0f 58 c2             addsd  xmm0,xmm2
11d1:       f2 0f 11 04 c2          movsd  QWORD PTR [rdx+rax*8],xmm0
11d6:       48 83 c0 01             add    rax,0x1
11da:       48 3d 40 42 0f 00       cmp    rax,0xf4240
11e0:       75 cf                   jne    11b1 <compute+0x28>


11b3:       f2 0f 11 08             movsd  QWORD PTR [rax],xmm1
11b7:       f2 0f 58 c8             addsd  xmm1,xmm0
11bb:       f2 0f 58 c2             addsd  xmm0,xmm2
11bf:       48 83 c0 08             add    rax,0x8
11c3:       48 39 d0                cmp    rax,rdx
11c6:       75 eb                   jne    11b3 <compute+0x2a>


$repeat 10 ./a [-] Took: 1097091 ns. [-] Took: 1092941 ns. [-] Took: 1092501 ns. [-] Took: 1091991 ns. [-] Took: 1092441 ns. [-] Took: 1093970 ns. [-] Took: 1091341 ns. [-] Took: 1093931 ns. [-] Took: 1094111 ns. [-] Took: 1092231 ns.$ repeat 10 ./b
[-] Took: 2703282 ns.
[-] Took: 2705933 ns.
[-] Took: 2703582 ns.
[-] Took: 2702622 ns.
[-] Took: 2703043 ns.
[-] Took: 2702262 ns.
[-] Took: 2703352 ns.
[-] Took: 2703532 ns.
[-] Took: 2703112 ns.
[-] Took: 2702533 ns.


11c1:       f2 0f 2a 45 e4          cvtsi2sd xmm0,DWORD PTR [rbp-0x1c]
11c6:       66 0f 28 c8             movapd xmm1,xmm0
11ca:       f2 0f 59 4d e8          mulsd  xmm1,QWORD PTR [rbp-0x18]
11cf:       f2 0f 2a 45 e4          cvtsi2sd xmm0,DWORD PTR [rbp-0x1c]
11d4:       f2 0f 59 c8             mulsd  xmm1,xmm0
11d8:       f2 0f 2a 45 e4          cvtsi2sd xmm0,DWORD PTR [rbp-0x1c]
11dd:       f2 0f 59 45 f0          mulsd  xmm0,QWORD PTR [rbp-0x10]
11e2:       f2 0f 58 c1             addsd  xmm0,xmm1
11e6:       f2 0f 58 45 f8          addsd  xmm0,QWORD PTR [rbp-0x8]
11eb:       8b 45 e4                mov    eax,DWORD PTR [rbp-0x1c]
11ee:       48 98                   cdqe
11f0:       48 8d 14 c5 00 00 00    lea    rdx,[rax*8+0x0]
11f7:       00
11f8:       48 8d 05 41 2e 00 00    lea    rax,[rip+0x2e41]
11ff:       f2 0f 11 04 02          movsd  QWORD PTR [rdx+rax*1],xmm0
1204:       83 45 e4 01             add    DWORD PTR [rbp-0x1c],0x1
1208:       81 7d e4 3f 42 0f 00    cmp    DWORD PTR [rbp-0x1c],0xf423f
120f:       7e b0                   jle    11c1 <compute+0x38>


11e8:       8b 45 cc                mov    eax,DWORD PTR [rbp-0x34]
11eb:       48 98                   cdqe
11ed:       48 8d 14 c5 00 00 00    lea    rdx,[rax*8+0x0]
11f4:       00
11f5:       48 8d 05 44 2e 00 00    lea    rax,[rip+0x2e44]
11fc:       f2 0f 10 45 d8          movsd  xmm0,QWORD PTR [rbp-0x28]
1201:       f2 0f 11 04 02          movsd  QWORD PTR [rdx+rax*1],xmm0
1206:       f2 0f 10 45 d8          movsd  xmm0,QWORD PTR [rbp-0x28]
120b:       f2 0f 58 45 d0          addsd  xmm0,QWORD PTR [rbp-0x30]
1210:       f2 0f 11 45 d8          movsd  QWORD PTR [rbp-0x28],xmm0
1215:       f2 0f 10 45 d0          movsd  xmm0,QWORD PTR [rbp-0x30]
121a:       f2 0f 58 45 f8          addsd  xmm0,QWORD PTR [rbp-0x8]
121f:       f2 0f 11 45 d0          movsd  QWORD PTR [rbp-0x30],xmm0
1224:       83 45 cc 01             add    DWORD PTR [rbp-0x34],0x1
1228:       81 7d cc 3f 42 0f 00    cmp    DWORD PTR [rbp-0x34],0xf423f
122f:       7e b7                   jle    11e8 <compute+0x5f>


## Intel 與 AMD 在 RSQRTSS 的不同

RSQRTSS 是計算平方根倒數，也就是計算 $1 / \sqrt{x}$，另外比較特別的是，這個指令不保證正確性，是允許有誤差產生的。

On Intel Skylake I get
out = 3d7ff000, float = 0.062485

On AMD Rome I get
out = 3d7ff800, float = 0.062492


## Cloudflare 的 jpegtran 在 ARM 上面的表現

Cloudflare 花了不少力氣在 ARM 的伺服器上 (可以參考「Cloudflare 用 ARM 當伺服器的進展...」，或是更早的「Cloudflare 測試 ARM 新的伺服器」這篇)，最近在 ARM 上發現 jpegtran 的效能不是太好，花了不少力氣最佳化，發現有意外收穫：「NEON is the new black: fast JPEG optimization on ARM server」。

Ideally we want to have the ARM performing at or above 50% of the Xeon performance per core. This would make sure we have no performance regressions, and net performance gain, since the ARM CPUs have double the core count as our current 2 socket setup.

In this case, however, I was disappointed to discover an almost 4X slowdown.

Not one to despair, I figured out that applying the same optimizations I did for Intel would be trivial. Surely the NEON instructions map neatly to the SSE instructions I used before?

While going over the ARMv8 NEON instruction set, I found several unique instructions, that have no equivalent in SSE.

With the new implementation Centriq outperforms the Xeon at batch reduction for every number of workers. We usually run Polish with four workers, for which Centriq is now 1.3 times faster while also 6.5 times more power efficient.

## 收 Wikimedia (包括維基百科) 的 Recent Changes

EventStreams is built on the w3c standard Server Sent Events (SSE). SSE is simply a streaming HTTP connection with event data in a particular text format. Client libraries, usually called EventSource, assist with building responsive tools, but because SSE is really just HTTP, you can use any HTTP client (even curl!) to consume it.