從簡單的 C 語言函式來看現代 Compiler 使用 SIMD 的威力

double data[LEN];

/*
 * Fill data[0 .. LEN-1] with the quadratic A*i*i + B*i + C, evaluating the
 * polynomial directly for every index i.
 */
void compute()
{
    const double A = 1.1;
    const double B = 2.2;
    const double C = 3.3;

    for (int i = 0; i < LEN; ++i) {
        /* The index enters the arithmetic as a double, which is the same
         * conversion the mixed int/double expression performs implicitly. */
        const double x = i;

        data[i] = A * x * x + B * x + C;
    }
}


/*
 * Fill data[0 .. LEN-1] using only additions inside the loop.
 *
 * The stored value starts at C and grows by `step` after every element;
 * `step` itself starts at A+B and grows by A+A.  Those are the successive
 * differences of A*i*i + B*i + C, so the sequence follows the same quadratic
 * (the low-order bits may differ from a direct evaluation because the
 * rounding of each addition is carried forward).
 */
void compute()
{
    const double A = 1.1, B = 2.2, C = 3.3;
    const double step_growth = A + A;
    double step = A + B;
    double value = C;

    double *out = data;
    double *const end = data + LEN;

    while (out < end) {
        *out++ = value;
        value += step;
        step += step_growth;
    }
}


/*
 * Hand-unrolled form of the polynomial loop: every iteration computes four
 * consecutive elements, data[i] through data[i+3], each with the same
 * A*x*x + B*x + C expression.
 *
 * A, B, C, data and LEN are not declared in this fragment and must come from
 * the surrounding context.  The loop condition tests only i, so the last
 * iteration still writes data[i+3]; this stays within data[0 .. LEN-1] only
 * when LEN is a multiple of 4.
 */
int i;
for (i = 0; i < LEN; i += 4) {
data[i+0] = A*(i+0)*(i+0) + B*(i+0) + C;
data[i+1] = A*(i+1)*(i+1) + B*(i+1) + C;
data[i+2] = A*(i+2)*(i+2) + B*(i+2) + C;
data[i+3] = A*(i+3)*(i+3) + B*(i+3) + C;
}


$ repeat 10 ./a
[-] Took: 248830 ns.
[-] Took: 249150 ns.
[-] Took: 248760 ns.
[-] Took: 248730 ns.
[-] Took: 248770 ns.
[-] Took: 248861 ns.
[-] Took: 248760 ns.
[-] Took: 253050 ns.
[-] Took: 248640 ns.
[-] Took: 249211 ns.
$ repeat 10 ./b
[-] Took: 686660 ns.
[-] Took: 696090 ns.
[-] Took: 696310 ns.
[-] Took: 694431 ns.
[-] Took: 691971 ns.
[-] Took: 697690 ns.
[-] Took: 693241 ns.
[-] Took: 692900 ns.
[-] Took: 654751 ns.
[-] Took: 679101 ns.


13a0:       66 0f 6f c2             movdqa xmm0,xmm2
13a4:       48 83 c0 20             add    rax,0x20
13a8:       66 0f fe d6             paddd  xmm2,xmm6
13ac:       f3 0f e6 f8             cvtdq2pd xmm7,xmm0
13b0:       66 0f 28 cf             movapd xmm1,xmm7
13b4:       66 0f 70 c0 ee          pshufd xmm0,xmm0,0xee
13b9:       66 0f 59 cd             mulpd  xmm1,xmm5
13bd:       f3 0f e6 c0             cvtdq2pd xmm0,xmm0
13c1:       66 0f 59 cf             mulpd  xmm1,xmm7
13c5:       66 0f 59 fc             mulpd  xmm7,xmm4
13c9:       66 0f 58 cf             addpd  xmm1,xmm7
13cd:       66 0f 58 cb             addpd  xmm1,xmm3
13d1:       0f 29 48 e0             movaps XMMWORD PTR [rax-0x20],xmm1
13d5:       66 0f 28 c8             movapd xmm1,xmm0
13d9:       66 0f 59 cd             mulpd  xmm1,xmm5
13dd:       66 0f 59 c8             mulpd  xmm1,xmm0
13e1:       66 0f 59 c4             mulpd  xmm0,xmm4
13e5:       66 0f 58 c1             addpd  xmm0,xmm1
13e9:       66 0f 58 c3             addpd  xmm0,xmm3
13ed:       0f 29 40 f0             movaps XMMWORD PTR [rax-0x10],xmm0
13f1:       48 39 c2                cmp    rdx,rax
13f4:       75 aa                   jne    13a0 <compute+0x40>


1340:       f2 0f 11 08             movsd  QWORD PTR [rax],xmm1
1344:       48 83 c0 08             add    rax,0x8
1348:       f2 0f 58 c8             addsd  xmm1,xmm0
134c:       f2 0f 58 c2             addsd  xmm0,xmm2
1350:       48 39 d0                cmp    rax,rdx
1353:       75 eb                   jne    1340 <compute+0x30>


$ repeat 10 ./a
[-] Took: 571140 ns.
[-] Took: 570280 ns.
[-] Took: 571271 ns.
[-] Took: 573971 ns.
[-] Took: 571981 ns.
[-] Took: 569650 ns.
[-] Took: 566361 ns.
[-] Took: 571600 ns.
[-] Took: 571330 ns.
[-] Took: 571030 ns.
$ repeat 10 ./b
[-] Took: 697521 ns.
[-] Took: 696961 ns.
[-] Took: 696201 ns.
[-] Took: 694921 ns.
[-] Took: 696930 ns.
[-] Took: 695001 ns.
[-] Took: 701661 ns.
[-] Took: 698100 ns.
[-] Took: 702430 ns.
[-] Took: 702641 ns.


11b1:       66 0f ef c9             pxor   xmm1,xmm1
11b5:       f2 0f 2a c8             cvtsi2sd xmm1,eax
11b9:       66 0f 28 c1             movapd xmm0,xmm1
11bd:       f2 0f 59 c4             mulsd  xmm0,xmm4
11c1:       f2 0f 59 c1             mulsd  xmm0,xmm1
11c5:       f2 0f 59 cb             mulsd  xmm1,xmm3
11c9:       f2 0f 58 c1             addsd  xmm0,xmm1
11cd:       f2 0f 58 c2             addsd  xmm0,xmm2
11d1:       f2 0f 11 04 c2          movsd  QWORD PTR [rdx+rax*8],xmm0
11d6:       48 83 c0 01             add    rax,0x1
11da:       48 3d 40 42 0f 00       cmp    rax,0xf4240
11e0:       75 cf                   jne    11b1 <compute+0x28>


11b3:       f2 0f 11 08             movsd  QWORD PTR [rax],xmm1
11b7:       f2 0f 58 c8             addsd  xmm1,xmm0
11bb:       f2 0f 58 c2             addsd  xmm0,xmm2
11bf:       48 83 c0 08             add    rax,0x8
11c3:       48 39 d0                cmp    rax,rdx
11c6:       75 eb                   jne    11b3 <compute+0x2a>


$ repeat 10 ./a
[-] Took: 1097091 ns.
[-] Took: 1092941 ns.
[-] Took: 1092501 ns.
[-] Took: 1091991 ns.
[-] Took: 1092441 ns.
[-] Took: 1093970 ns.
[-] Took: 1091341 ns.
[-] Took: 1093931 ns.
[-] Took: 1094111 ns.
[-] Took: 1092231 ns.
$ repeat 10 ./b
[-] Took: 2703282 ns.
[-] Took: 2705933 ns.
[-] Took: 2703582 ns.
[-] Took: 2702622 ns.
[-] Took: 2703043 ns.
[-] Took: 2702262 ns.
[-] Took: 2703352 ns.
[-] Took: 2703532 ns.
[-] Took: 2703112 ns.
[-] Took: 2702533 ns.


11c1:       f2 0f 2a 45 e4          cvtsi2sd xmm0,DWORD PTR [rbp-0x1c]
11c6:       66 0f 28 c8             movapd xmm1,xmm0
11ca:       f2 0f 59 4d e8          mulsd  xmm1,QWORD PTR [rbp-0x18]
11cf:       f2 0f 2a 45 e4          cvtsi2sd xmm0,DWORD PTR [rbp-0x1c]
11d4:       f2 0f 59 c8             mulsd  xmm1,xmm0
11d8:       f2 0f 2a 45 e4          cvtsi2sd xmm0,DWORD PTR [rbp-0x1c]
11dd:       f2 0f 59 45 f0          mulsd  xmm0,QWORD PTR [rbp-0x10]
11e2:       f2 0f 58 c1             addsd  xmm0,xmm1
11e6:       f2 0f 58 45 f8          addsd  xmm0,QWORD PTR [rbp-0x8]
11eb:       8b 45 e4                mov    eax,DWORD PTR [rbp-0x1c]
11ee:       48 98                   cdqe
11f0:       48 8d 14 c5 00 00 00    lea    rdx,[rax*8+0x0]
11f7:       00
11f8:       48 8d 05 41 2e 00 00    lea    rax,[rip+0x2e41]
11ff:       f2 0f 11 04 02          movsd  QWORD PTR [rdx+rax*1],xmm0
1204:       83 45 e4 01             add    DWORD PTR [rbp-0x1c],0x1
1208:       81 7d e4 3f 42 0f 00    cmp    DWORD PTR [rbp-0x1c],0xf423f
120f:       7e b0                   jle    11c1 <compute+0x38>


11e8:       8b 45 cc                mov    eax,DWORD PTR [rbp-0x34]
11eb:       48 98                   cdqe
11ed:       48 8d 14 c5 00 00 00    lea    rdx,[rax*8+0x0]
11f4:       00
11f5:       48 8d 05 44 2e 00 00    lea    rax,[rip+0x2e44]
11fc:       f2 0f 10 45 d8          movsd  xmm0,QWORD PTR [rbp-0x28]
1201:       f2 0f 11 04 02          movsd  QWORD PTR [rdx+rax*1],xmm0
1206:       f2 0f 10 45 d8          movsd  xmm0,QWORD PTR [rbp-0x28]
120b:       f2 0f 58 45 d0          addsd  xmm0,QWORD PTR [rbp-0x30]
1210:       f2 0f 11 45 d8          movsd  QWORD PTR [rbp-0x28],xmm0
1215:       f2 0f 10 45 d0          movsd  xmm0,QWORD PTR [rbp-0x30]
121a:       f2 0f 58 45 f8          addsd  xmm0,QWORD PTR [rbp-0x8]
121f:       f2 0f 11 45 d0          movsd  QWORD PTR [rbp-0x30],xmm0
1224:       83 45 cc 01             add    DWORD PTR [rbp-0x34],0x1
1228:       81 7d cc 3f 42 0f 00    cmp    DWORD PTR [rbp-0x34],0xf423f
122f:       7e b7                   jle    11e8 <compute+0x5f>


Golang 的排序演算法將換成 pdqsort，LLVM libc++ 換成 BlockQuicksort

Hacker News 首頁上看到的消息，Golang 將會把 sort.Sort() 換成 pdqsort (Pattern-defeating Quicksort)：「Go will use pdqsort in next release (github.com/golang)」，對應的 commit 則是在「sort: use pdqsort」這邊可以看到。

It would be nice if only one or two of the sorting methods would dominate all of the others, regardless of application or the computer being used. But in fact, each method has its own peculiar virtues. […] Thus we find that nearly all of the algorithms deserve to be remembered, since there are some applications in which they turn out to be best.

Pattern-defeating quicksort (pdqsort) is a novel sorting algorithm that combines the fast average case of randomized quicksort with the fast worst case of heapsort, while achieving linear time on inputs with certain patterns.

Golang 選擇把 unstable 的 Quicksort 換成 pdqsort，LLVM 則是選擇把 Quicksort 換成 BlockQuicksort，這邊看起來有些分歧...

從三角函數 cosine 的實做問題學一些週邊知識...

cosine 是很基本的函數，所以可以使用的地方很多。另外一方面，也因為他不是那麼直覺就可以實做出來，在現代的實做裡面其實藏了超多細節...

A common software technique to implement nearly quadruple precision using pairs of double-precision values is sometimes called double-double arithmetic.

Using pairs of IEEE double-precision values with 53-bit significands, double-double arithmetic provides operations on numbers with significands of at least 2 × 53 = 106 bits (...), only slightly less precise than the 113-bit significand of IEEE binary128 quadruple precision.

C 語言的兩個笑話 (以及他的惡搞原理)

Twitter 上看到兩則 C 語言的笑話：

(除非你也很精心挑過)，不要誤解亂用 XDDD

拿 pytest 測 C 的程式

Hacker News 上看到「Running C unit tests with Pytest (p403n1x87.github.io)」這串討論，就如同標題所寫的，拿 pytest 測 C 的程式：「Running C unit tests with pytest」。

兩個 unsigned int 取平均值的方法

Hacker News Daily 上看到 Raymond Chen 在講怎麼對兩個 unsigned int 取平均值的方法：「On finding the average of two unsigned integers without overflow」，這篇裡面提到了不少有趣的歷史，另外 Hacker News 上的討論「Finding the average of two unsigned integers without overflow (microsoft.com)」也可以翻翻。

/*
 * Return the average of a and b, rounded down, for every pair of inputs.
 *
 * a + b is computed in unsigned arithmetic, which wraps around when the true
 * sum does not fit; simply halving the wrapped sum would drop the carry and
 * return a value that is too small.  The carry is recovered here and added
 * back as half of the wrapped-away amount.
 */
unsigned average(unsigned a, unsigned b)
{
    const unsigned sum = a + b;               /* may wrap around */
    const unsigned carry = sum < a;           /* 1 exactly when the sum wrapped */
    const unsigned half_range = ~0u / 2 + 1;  /* half of what one wrap discards */

    /* true sum = carry * 2 * half_range + sum, so its floor-half is: */
    return sum / 2 + carry * half_range;
}


/*
 * Return the average of low and high, rounded down.
 *
 * The midpoint is formed as "smaller value + half the distance", which never
 * exceeds the larger argument and therefore cannot overflow.  The distance
 * high - low is only meaningful when low <= high (otherwise the unsigned
 * subtraction wraps), so the arguments are put in order first; callers that
 * already pass them in order get the same result as before.
 */
unsigned average(unsigned low, unsigned high)
{
    if (low > high) {
        const unsigned tmp = low;
        low = high;
        high = tmp;
    }

    return low + (high - low) / 2;
}


/*
 * Return the average of a and b, rounded down, without forming a + b.
 *
 * Each argument is halved separately (discarding its lowest bit); the two
 * discarded bits add up to a whole unit only when both arguments are odd,
 * which is what the final term restores.
 */
unsigned average(unsigned a, unsigned b)
{
    const unsigned half_a = a >> 1;
    const unsigned half_b = b >> 1;
    const unsigned both_odd = a & b & 1u;

    return half_a + half_b + both_odd;
}


/*
 * Return the average of a and b, rounded down, without forming a + b.
 *
 * a + b equals twice the bits the arguments have in common (a & b) plus the
 * bits in which they differ (a ^ b); halving that gives the expression below.
 */
unsigned average(unsigned a, unsigned b)
{
    const unsigned common_bits = a & b;
    const unsigned differing_bits = a ^ b;

    return common_bits + (differing_bits >> 1);
}


/*
 * Return the average of a and b, rounded down, by adding in a wider type.
 */
unsigned average(unsigned a, unsigned b)
{
    // Suppose "unsigned" is a 32-bit type and
    // "unsigned long long" is a 64-bit type, so the
    // widened sum below always fits.
    unsigned long long total = a;

    total += b;
    return (unsigned)(total / 2);
}


QOI 圖片無損壓縮演算法

Hacker News Daily 上看到「Lossless Image Compression in O(n) Time」這篇，作者丟出了一個圖片的無損壓縮演算法，壓縮與解壓縮的速度超快，但壓縮率又不輸 PNG 太多，在 Hacker News 上的討論也可以看一下：「QOI: Lossless Image Compression in O(n) Time (phoboslab.org)」。

Yes, stb_image saved us all from the pains of dealing with libpng and is therefore used in countless games and apps. A while ago I aimed to do the same for video with pl_mpeg, with some success.

My name is Dominic Szablewski. I build games, experiment with JavaScript and occasionally tinker with low-level C.

• A run of the previous pixel
• An index into a previously seen pixel
• The difference to the previous pixel
• Full rgba values

StackOverflow 開賣 Ctrl、C、V 的鍵盤

StackOverflow 今年愚人節的鍵盤真的開賣了：「No joke—you can buy our copy/paste keyboard right now」。

They’re also outfitted with Kailh Box Black switches to deliver an ultra-smooth linear feel.

Fully programmable, these three keys can do much more than copy and paste. In fact, you can configure them to perform virtually any key command you want.

Estimated ship date is Dec 13, 2021 PT.

2.6k Sold

USB Type-C 要增加 240W 的規格...

Extended Power Range cables have additional requirements to assure that these cables can deliver the full defined voltage and current range for USB PD EPR operation. EPR cables shall functionally support a reported 50 V and 5 A operation. The minimum functional voltage that a cable shall support is 53.65 V. The electrical components potentially in the path of VBUS in an EPR cable, e.g. bypass capacitors, should be minimally rated for 63 V.

All EPR cables shall be Electronically Marked and include EPR-specific information in the eMarker as defined by the USB PD specification. As defined in the USB PD specification, EPR cables are marked as 50 V and 5 A capable. All EPR cables shall be visibly identified with EPR cable identification icons as defined by the USB-IF. This is required so that end users will be able to confirm visually that the cable supports up to as high of PDP = 240W as defined in the USB PD specification.