プログラマ専用SNS ミクプラ

開発環境はDev-C++4.9.9.2、コンパイラはデフォルトです。
「インラインアセンブリを使うと高速である」という話を聞くので、実際に実験してみました。
普通のC言語とインラインアセンブリでそれぞれクイックソートを作り、走らせてみました。
結果はインラインアセンブリの方が2倍近く速かったです。
具体的な結果は

データ数(個)  通常(ms)  インラインアセンブリ(ms)
1000000       328       140
5000000       1451      890
10000000      3011      1622
50000000      15943     8549
100000000     33462     17613

(それぞれ1回づつ測定)
でした。
ソースは以下の通りです。

void quicksort(int* arr,int size) {
    int left;
    int right;
    int maeleft;
    int maeright;
    int pipot;
    int temp;
    int lstack[100];
    int rstack[100];
    int stacknum;
    lstack[0]=0;
    rstack[0]=size-1;
    stacknum=0;
    while(stacknum>=0) {
        maeleft=left=lstack[stacknum];
        maeright=right=rstack[stacknum];
        pipot=arr[(left+right)/2];
        while(left<right) {
            for(;left<=maeright && arr[left]<pipot;left++);
            for(;right>=maeleft && arr[right]>pipot;right--);
            if(left<=right) {
                temp=arr[left];
                arr[left]=arr[right];
                arr[right]=temp;
                left++;
                right--;
            }
        }
        stacknum--;
        if(right-maeleft<maeright-left) {
            if(maeleft<right) {
                stacknum++;
                lstack[stacknum]=maeleft;
                rstack[stacknum]=right;
            }
            if(left<maeright) {
                stacknum++;
                lstack[stacknum]=left;
                rstack[stacknum]=maeright;
            }
        } else {
            if(left<maeright) {
                stacknum++;
                lstack[stacknum]=left;
                rstack[stacknum]=maeright;
            }
            if(maeleft<right) {
                stacknum++;
                lstack[stacknum]=maeleft;
                rstack[stacknum]=right;
            }
        }
    }
}

void iasmquicksort(int* arr,int size) {
    asm volatile (
    "push $0xffffffff\n\t"/*スタック末端*/
    "push $0xffffffff\n\t"/*スタック末端*/
    "push $0\n\t"/*左*/
    "mov %1,%%eax\n\t"
    "dec %%eax\n\t"
    "push %%eax\n\t"/*右*/
    "mov %0,%%eax\n\t"
    "wloop1:\n\t"
    "pop %%edi\n\t"
    "pop %%esi\n\t"
    "cmp $0xffffffff,%%edi\n\t"
    "je wloop4\n\t"/*while(stacknum>=0) {*/
    "mov %%esi,%%ecx\n\t"/*maeleft=left=lstack[stacknum];*/
    "mov %%edi,%%edx\n\t"
    "add %%esi,%%edx\n\t"
    "shr $1,%%edx\n\t"
    "mov (%%eax,%%edx,4),%%ebx\n\t"/*pipot=arr[(left+right)/2];*/
    "mov %%edi,%%edx\n\t"/*maeright=right=rstack[stacknum];*/
    "wloop2:\n\t"
    "cmp %%ecx,%%edx\n\t"
    "jle wloop3\n\t"/*while(left<right) {*/
    "floop1:\n\t"
    "cmp (%%eax,%%ecx,4),%%ebx\n\t"
    "jle floop2\n\t"
    "inc %%ecx\n\t"
    "jmp floop1\n\t"/*for(;left<=maeright && arr[left]<pipot;left++);*/
    "floop2:\n\t"
    "cmp (%%eax,%%edx,4),%%ebx\n\t"
    "jge floop3\n\t"
    "dec %%edx\n\t"
    "jmp floop2\n\t"/*for(;right>=maeleft && arr[right]>pipot;right--);*/
    "floop3:\n\t"
    "cmp %%ecx,%%edx\n\t"
    "jl noif1\n\t"/*if(left<=right) {*/
    "push (%%eax,%%ecx,4)\n\t"/*temp=arr[left];*/
    "push (%%eax,%%edx,4)\n\t"
    "pop (%%eax,%%ecx,4)\n\t"/*arr[left]=arr[right];*/
    "pop (%%eax,%%edx,4)\n\t"/*arr[right]=temp;*/
    "inc %%ecx\n\t"/*left++;*/
    "dec %%edx\n\t"/*right--;*/
    "noif1:\n\t"
    "jmp wloop2\n\t"
    "wloop3:\n\t"
    "push %%eax\n\t"
    "mov %%edi,%%eax\n\t"
    "add %%esi,%%eax\n\t"
    "shr $1,%%eax\n\t"
    "cmp %%edx,%%eax\n\t"
    "pop %%eax\n\t"
    "jle if2\n\t"/*right<(left+right)/2*/
    "cmp %%esi,%%edx\n\t"
    "jle noif2\n\t"/*if(maeleft<right) {*/
    "push %%esi\n\t"/*lstack[stacknum]=maeleft;*/
    "push %%edx\n\t"/*rstack[stacknum]=right;*/
    "noif2:\n\t"
    "cmp %%ecx,%%edi\n\t"
    "jle noif3\n\t"/*if(left<maeright) {*/
    "push %%ecx\n\t"/*lstack[stacknum]=left;*/
    "push %%edi\n\t"/*rstack[stacknum]=maeright;*/
    "noif3:\n\t"
    "jmp if3\n\t"
    "if2:\n\t"
    "cmp %%ecx,%%edi\n\t"
    "jle noif4\n\t"/*if(left<maeright) {*/
    "push %%ecx\n\t"/*lstack[stacknum]=left;*/
    "push %%edi\n\t"/*rstack[stacknum]=maeright;*/
    "noif4:\n\t"
    "cmp %%esi,%%edx\n\t"
    "jle noif5\n\t"/*if(maeleft<right) {*/
    "push %%esi\n\t"/*lstack[stacknum]=maeleft;*/
    "push %%edx\n\t"/*rstack[stacknum]=right;*/
    "noif5:\n\t"
    "if3:\n\t"
    "jmp wloop1\n\t"
    "wloop4:\n\t"
    : "+m" (arr)
    : "m" (size)
    : "%eax" , "%ebx" , "%ecx" , "%edx" , "%esi" , "%edi"
    );
}

インラインアセンブリのコードは上の普通のC言語のコードを参考に作りました。
なぜインラインアセンブリを使うとこんなに速くなるのでしょうか？
暇なときにでも答えていただければ幸いです。

C言語版に-Oオプションを付けてコンパイルするとかなり差が縮まるようですけど。
-Sオプションを付けてコンパイルしてアセンブラソースで比較すれば一目瞭然かもしれません。

プログラマ専用SNS ミクプラ

インラインアセンブリの速さ【雑談？】

インラインアセンブリの速さ【雑談？】

Re:インラインアセンブリの速さ【雑談？】