【筆記】匯編入門

鑒于在華子呆著沒事干，開始學習匯編。

簡單循環

// Shifts a left by one bit, b times, and returns the result.
// The loop body never runs when b <= 0, so a is returned unchanged in that case.
int test(int a, int b) {
    for (int i = 0; i < b; ++i) {
        a <<= 1;  // one doubling per iteration
    }
    return a;
}

O0

	.file	"a.cpp"
	.text
	; -O0 build of test(a, b): a, b and i each live in a stack slot and are accessed in memory on every use.
	.globl	_Z4testii             ; "ii" in the mangled name encodes the two int parameters
	.type	_Z4testii, @function
_Z4testii:
.LFB0:
	.cfi_startproc
	endbr64
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register 6
	movl	%edi, -20(%rbp)  ; spill a to the stack slot at -20(%rbp)
	movl	%esi, -24(%rbp)  ; spill b to the stack slot at -24(%rbp)
	movl	$0, -4(%rbp)     ; i = 0
	jmp	.L2                  ; enter the loop at its condition check
.L3:
	sall	-20(%rbp)        ; a <<= 1, performed directly on the stack slot
	addl	$1, -4(%rbp)     ; ++i
.L2:
	movl	-4(%rbp), %eax   ; eax = i
	cmpl	-24(%rbp), %eax  ; compare i with b
	jl	.L3                  ; signed i < b: run the loop body again
	movl	-20(%rbp), %eax  ; return value: eax = a
	popq	%rbp
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE0:
	.size	_Z4testii, .-_Z4testii
	.ident	"GCC: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"
	.section	.note.GNU-stack,"",@progbits
	.section	.note.gnu.property,"a"
	.align 8
	.long	1f - 0f
	.long	4f - 1f
	.long	5
0:
	.string	"GNU"
1:
	.align 8
	.long	0xc0000002
	.long	3f - 2f
2:
	.long	0x3
3:
	.align 8
4:

這段代碼是用 -O0 編譯的，沒有進行任何優化，因此：

內存訪問頻繁: 所有操作都通過內存進行，而不是直接使用寄存器
棧幀完整: 創建了完整的棧幀，包括基指針的保存和恢復
循環效率低: 每次循環都需要從內存加載和存儲值

O1

	.file	"a.cpp"
	.text
	; -O1 build of test(a, b): a, b and i stay in registers and no stack frame is set up.
	.globl	_Z4testii
	.type	_Z4testii, @function
_Z4testii:
.LFB0:
	.cfi_startproc
	endbr64
	movl	%edi, %eax  ; eax = a; the result is built in the return register
	testl	%esi, %esi  ; set ZF/SF from b & b without storing a result; test r,r is the usual sign/zero check
	jle	.L2             ; b <= 0: the loop runs zero times, return a unchanged
	movl	$0, %edx    ; edx = i = 0
.L3:
	addl	%eax, %eax  ; a = a + a, i.e. a <<= 1
	addl	$1, %edx    ; ++i
	cmpl	%edx, %esi  ; compare b with i
	jne	.L3             ; keep looping until i == b (b > 0 is guaranteed here)
.L2:
	ret
	.cfi_endproc
.LFE0:
	.size	_Z4testii, .-_Z4testii
	.ident	"GCC: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"
	.section	.note.GNU-stack,"",@progbits
	.section	.note.gnu.property,"a"
	.align 8
	.long	1f - 0f
	.long	4f - 1f
	.long	5
0:
	.string	"GNU"
1:
	.align 8
	.long	0xc0000002
	.long	3f - 2f
2:
	.long	0x3
3:
	.align 8
4:

寄存器分配優化:

使用EAX同時存儲參數和結果，減少數據移動
使用EDX作為循環計數器，避免內存訪問

循環優化:

循環被改寫成「入口先判斷 b <= 0，再進入 do-while」的形式，省掉了 O0 版本里進入循環時的那次無條件跳轉（jmp .L2）
循環結束條件由 i < b（jl）變成了 i != b（jne）：入口已經保證 b > 0，i 從 0 開始每次加 1，一定會恰好等于 b

算術優化:

使用addl %eax, %eax實現乘以2，而不是使用移位指令
這可能是因為在某些處理器上，加法比移位更快

棧幀消除:

沒有創建棧幀（沒有pushq %rbp和movq %rsp, %rbp）
所有操作都在寄存器中完成，極大提高效率

O2

	.file	"a.cpp"
	.text
	; -O2 build of test(a, b): the loop is unrolled by two; an odd b is handled by one peeled step first.
	.p2align 4
	.globl	_Z4testii
	.type	_Z4testii, @function
_Z4testii:
.LFB0:
	.cfi_startproc
	endbr64
	movl	%edi, %eax  ; eax = a; the result is built in the return register
	testl	%esi, %esi  ; set flags from b
	jle	.L2             ; b <= 0: return a unchanged
	xorl	%edx, %edx  ; i = 0
	testb	$1, %sil    ; check the lowest bit of b (is b odd?)
	je	.L3             ; b is even: go straight to the two-steps-per-iteration loop
	addl	%eax, %eax  ; b is odd: do a single a <<= 1 up front
	movl	$1, %edx    ; i = 1
	cmpl	$1, %esi    ; was that the only step?
	je	.L11            ; b == 1: return; otherwise fall through into .L3
	.p2align 4,,10
	.p2align 3
.L3:
	addl	$2, %edx    ; i += 2
	sall	$2, %eax    ; a <<= 2 (two source-level iterations at once)
	cmpl	%edx, %esi  ; compare b with i
	jne	.L3             ; keep looping until i == b
.L2:
	ret
.L11:
	ret
	.cfi_endproc
.LFE0:
	.size	_Z4testii, .-_Z4testii
	.ident	"GCC: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"
	.section	.note.GNU-stack,"",@progbits
	.section	.note.gnu.property,"a"
	.align 8
	.long	1f - 0f
	.long	4f - 1f
	.long	5
0:
	.string	"GNU"
1:
	.align 8
	.long	0xc0000002
	.long	3f - 2f
2:
	.long	0x3
3:
	.align 8
4:

b 為奇數時先單獨執行一次 a += a，把剩余次數湊成偶數；之后每次迭代做 2 位的移位（乘以 4），而不是 1 位，循環迭代次數減半。

O3

	.file	"a.cpp"
	.text
	; -O3 build of test(a, b): same instruction sequence as the -O2 listing (peel one step for odd b, then shift by 2 per iteration).
	.p2align 4
	.globl	_Z4testii
	.type	_Z4testii, @function
_Z4testii:
.LFB0:
	.cfi_startproc
	endbr64
	movl	%edi, %eax      ; eax = a
	testl	%esi, %esi      ; set flags from b
	jle	.L2                 ; b <= 0: return a unchanged
	xorl	%edx, %edx      ; i = 0
	testb	$1, %sil        ; is b odd?
	je	.L3                 ; even: skip the peeled step
	addl	%eax, %eax      ; odd: one a <<= 1 up front
	movl	$1, %edx        ; i = 1
	cmpl	$1, %esi
	je	.L11                ; b == 1: done
	.p2align 4,,10
	.p2align 3
.L3:
	addl	$2, %edx        ; i += 2
	sall	$2, %eax        ; a <<= 2
	cmpl	%edx, %esi
	jne	.L3                 ; loop until i == b
.L2:
	ret
.L11:
	ret
	.cfi_endproc
.LFE0:
	.size	_Z4testii, .-_Z4testii
	.ident	"GCC: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"
	.section	.note.GNU-stack,"",@progbits
	.section	.note.gnu.property,"a"
	.align 8
	.long	1f - 0f
	.long	4f - 1f
	.long	5
0:
	.string	"GNU"
1:
	.align 8
	.long	0xc0000002
	.long	3f - 2f
2:
	.long	0x3
3:
	.align 8
4:

和 O2 的效果是一樣的。

Fibonacci（遞歸）

// Naive doubly-recursive Fibonacci: fib(0) = 0, fib(1) = 1,
// otherwise fib(x - 1) + fib(x - 2).
// Only x == 0 and x == 1 terminate the recursion.
int fib(int x) {
    if (x == 0) {
        return 0;
    }
    if (x == 1) {
        return 1;
    }
    // Two recursive calls: the first result must be kept alive across the second call.
    return fib(x - 1) + fib(x - 2);
}

借助這段代碼的匯編，我們來講清楚棧幀是如何操作的。

O0

	.file	"a.cpp"
	.text
	; -O0 build of fib(x): full %rbp frame, x kept in a stack slot, fib(x - 1) parked in %ebx across the second call.
	.globl	_Z3fibi
	.type	_Z3fibi, @function
_Z3fibi:
.LFB0:
	.cfi_startproc
	endbr64
	pushq	%rbp              ; save the caller's frame pointer; %rbp is callee-saved and is about to be repointed at this frame
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	movq	%rsp, %rbp        ; %rbp is now the fixed anchor of this frame; locals are addressed relative to it (e.g. -20(%rbp))
	.cfi_def_cfa_register 6
	pushq	%rbx              ; %rbx is callee-saved: save it here because it is used below to hold fib(x - 1) across a call
	subq	$24, %rsp         ; reserve 24 bytes of stack (with the return address and two pushes, %rsp stays 16-byte aligned at the calls)
	.cfi_offset 3, -24
	movl	%edi, -20(%rbp)   ; store x in the frame at -20(%rbp)
	cmpl	$0, -20(%rbp)
	jne	.L2                   ; x != 0: go check the next base case
	movl	$0, %eax          ; x == 0: return 0
	jmp	.L3
.L2:
	cmpl	$1, -20(%rbp)
	jne	.L4                   ; x != 1: go to the recursive part
	movl	$1, %eax          ; x == 1: return 1
	jmp	.L3
.L4:
	movl	-20(%rbp), %eax   
	subl	$1, %eax          ; eax = x - 1
	movl	%eax, %edi        ; first argument = x - 1
	call	_Z3fibi           ; fib(x - 1)
	movl	%eax, %ebx        ; keep fib(x - 1) in %ebx so it survives the next call
	movl	-20(%rbp), %eax  
	subl	$2, %eax          ; eax = x - 2
	movl	%eax, %edi        ; first argument = x - 2
	call	_Z3fibi           ; fib(x - 2)
	addl	%ebx, %eax        ; return value = fib(x - 1) + fib(x - 2)
.L3:
	movq	-8(%rbp), %rbx    ; restore the saved %rbx from its slot
	leave                     ; tear down the frame (equivalent to movq %rbp, %rsp then popq %rbp)
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE0:
	.size	_Z3fibi, .-_Z3fibi
	.ident	"GCC: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"
	.section	.note.GNU-stack,"",@progbits
	.section	.note.gnu.property,"a"
	.align 8
	.long	1f - 0f
	.long	4f - 1f
	.long	5
0:
	.string	"GNU"
1:
	.align 8
	.long	0xc0000002
	.long	3f - 2f
2:
	.long	0x3
3:
	.align 8
4:

調用約定的規則：調用者保存 vs. 被調用者保存

x86-64架構下的寄存器被分為兩大陣營，這是理解一切的關鍵：

調用者保存寄存器（Caller-saved / Volatile Registers）:
- 包括: RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11 等。
- 規則: 如果一個函數（調用者）希望在這些寄存器中的值在子函數調用后仍然有效，它必須自己在調用call指令之前手動把它們保存到棧上。子函數（被調用者）可以隨意修改這些寄存器而無需恢復。
- 設計目的: 用于傳遞臨時性的參數和結果。
被調用者保存寄存器（Callee-saved / Non-volatile Registers）:
- 包括: RBX, RBP, R12, R13, R14, R15（棧指針 RSP 同樣必須在返回時保持原值）。
- 規則: 如果一個函數（被調用者）想要使用這些寄存器，它必須在函數的開頭（Prologue）保存它們的原始值，并在函數返回前（Epilogue）準確地恢復它們。對于調用者來說，它可以放心地認為這些寄存器的值在子函數調用后不會改變。
- 設計目的: 保存需要跨越函數調用的長期變量。

總結

寄存器類型	能否用來跨 `call` 保存值	后果
調用者保存 (如 `RCX`, `RDX`)	不可用	值會在后續 `call` 中被破壞，導致錯誤。必須手動保存，增加指令，降低效率。
被調用者保存 (如 `RBX`, `R12`)	可用且推薦	值在后續 `call` 中保持不變。只需在函數開頭和結尾保存/恢復一次，安全且高效。

結論：
編譯器不是必須使用 RBX：要讓一個值安全地跨越函數調用，它要么使用某個被調用者保存寄存器，要么在 call 之前把值存到棧上。RBX 是被調用者保存寄存器中典型且傳統的選擇，它之所以出現在這段代碼中，是因為這樣做直接、正確，而且通常比每次都讀寫棧更高效。

O1

	.file	"a.cpp"
	.text
	; -O1 build of fib(x): no frame pointer; x lives in %ebx and fib(x - 1) in %ebp, both callee-saved, so no stack slots are needed.
	.globl	_Z3fibi
	.type	_Z3fibi, @function
_Z3fibi:
.LFB0:
	.cfi_startproc
	endbr64
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	pushq	%rbx
	.cfi_def_cfa_offset 24
	.cfi_offset 3, -24
	subq	$8, %rsp           ; only 8 bytes reserved; together with the two pushes this keeps %rsp 16-byte aligned at the calls
	.cfi_def_cfa_offset 32
	movl	%edi, %ebx         ; ebx = x (also the return value for the base cases)
	testl	%edi, %edi          
	je	.L2                    ; x == 0: return x
	cmpl	$1, %edi
	je	.L2                    ; x == 1: return x
	leal	-1(%rdi), %edi     ; edi = x - 1, computed with lea
	call	_Z3fibi            ; fib(x - 1)
	movl	%eax, %ebp         ; park fib(x - 1) in %ebp across the next call
	leal	-2(%rbx), %edi    ; edi = x - 2
	call	_Z3fibi            ; fib(x - 2)
	leal	0(%rbp,%rax), %ebx ; ebx = fib(x - 1) + fib(x - 2)
.L2:
	movl	%ebx, %eax         ; return value = ebx (shared by every return path)
	addq	$8, %rsp           ; release the 8 bytes reserved in the prologue
	.cfi_def_cfa_offset 24
	popq	%rbx
	.cfi_def_cfa_offset 16
	popq	%rbp
	.cfi_def_cfa_offset 8
	ret
	.cfi_endproc
.LFE0:
	.size	_Z3fibi, .-_Z3fibi
	.ident	"GCC: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"
	.section	.note.GNU-stack,"",@progbits
	.section	.note.gnu.property,"a"
	.align 8
	.long	1f - 0f
	.long	4f - 1f
	.long	5
0:
	.string	"GNU"
1:
	.align 8
	.long	0xc0000002
	.long	3f - 2f
2:
	.long	0x3
3:
	.align 8
4:

優化帶來的改進

減少棧使用：只分配 8 字節棧空間，而不是 24 字節
寄存器利用：更好地利用寄存器保存中間值，減少內存訪問
指令選擇：使用更高效的指令做算術運算（如 leal 和 testl）
代碼共享：所有返回路徑共享同一段代碼
減少內存訪問：直接在寄存器間傳遞值，而不是通過棧

O2

	.file	"a.cpp"
	.text
	; -O2 build of fib(x): only one call instruction remains (inside .L16).
	; The rest of the recursion shows up as the nested loops .L27 ... .L16, each stepping its argument down by 2
	; and accumulating into its own running sum; values that do not fit in registers are spilled to the 88-byte area.
	.p2align 4
	.globl	_Z3fibi
	.type	_Z3fibi, @function
_Z3fibi:
.LFB0:
	.cfi_startproc
	endbr64
	pushq	%r15            ; prologue: save the six callee-saved registers used below
	.cfi_def_cfa_offset 16
	.cfi_offset 15, -16
	pushq	%r14
	.cfi_def_cfa_offset 24
	.cfi_offset 14, -24
	pushq	%r13
	.cfi_def_cfa_offset 32
	.cfi_offset 13, -32
	pushq	%r12
	.cfi_def_cfa_offset 40
	.cfi_offset 12, -40
	movl	%edi, %r12d     ; r12d = x; returned as-is when x is 0 or 1
	pushq	%rbp
	.cfi_def_cfa_offset 48
	.cfi_offset 6, -48
	pushq	%rbx
	.cfi_def_cfa_offset 56
	.cfi_offset 3, -56
	subq	$88, %rsp       ; 88-byte spill area addressed as N(%rsp) below
	.cfi_def_cfa_offset 144
	testl	%edi, %edi
	je	.L2                 ; x == 0: return x
	cmpl	$1, %edi
	je	.L2                 ; x == 1: return x
	leal	-1(%rdi), %r15d ; r15d = x - 1: argument of the outermost loop
	xorl	%r12d, %r12d    ; r12d = 0: running sum of the outermost loop
.L27:
	cmpl	$1, %r15d
	je	.L52
	leal	-1(%r15), %r13d
	xorl	%r14d, %r14d
	movl	%r12d, 28(%rsp)
	movl	%r13d, 32(%rsp)
	movl	%r13d, %ebp
	movl	%r14d, %r12d
.L26:
	cmpl	$1, %ebp
	je	.L51
	movl	%r15d, 36(%rsp)
	leal	-1(%rbp), %ecx
	xorl	%r14d, %r14d
	movl	%ebp, 40(%rsp)
	movl	%ecx, %ebp
.L25:
	cmpl	$1, %ebp
	je	.L50
	movl	%r14d, 48(%rsp)
	leal	-1(%rbp), %edi
	xorl	%r15d, %r15d
	movl	%ebp, %r13d
	movl	%ecx, 52(%rsp)
	movl	%r12d, 44(%rsp)
	movl	%edi, %r12d
.L24:
	cmpl	$1, %r12d
	je	.L49
	leal	-1(%r12), %r11d
	xorl	%r14d, %r14d
	movl	%r15d, 56(%rsp)
	movl	%r14d, 16(%rsp)
	movl	%r11d, %ebp
	movl	%edi, 60(%rsp)
	movl	%r11d, 64(%rsp)
	movl	%r12d, 68(%rsp)
.L23:
	cmpl	$1, %ebp
	je	.L48
	leal	-1(%rbp), %r12d
	movl	%ebp, 76(%rsp)
	xorl	%r15d, %r15d
	movl	%r12d, 72(%rsp)
	movl	%r12d, %r14d
.L22:
	cmpl	$1, %r14d
	je	.L47
	leal	-1(%r14), %ebp
	movl	%r13d, 4(%rsp)
	xorl	%ecx, %ecx
	movl	%ebp, 8(%rsp)
	movl	%ebp, %ebx
	movl	%r14d, 12(%rsp)
.L21:
	cmpl	$1, %ebx
	je	.L46
	leal	-1(%rbx), %ebp
	xorl	%r13d, %r13d
	movl	%ebp, %r14d
	movl	%ebp, %edx
	movl	%ecx, %ebp
	movl	%ebx, %ecx
	movl	%r14d, %ebx
.L20:
	movl	%ebx, %r14d
	cmpl	$1, %ebx
	je	.L45
	movl	%ebx, 20(%rsp)
	xorl	%r12d, %r12d
	movl	%edx, %ebx
.L16:
	leal	-1(%r14), %edi  ; argument = r14d - 1
	movl	%ecx, 24(%rsp)  ; %ecx is caller-saved, so spill it around the call
	call	_Z3fibi         ; the only call left in the function
	movl	24(%rsp), %ecx  ; reload %ecx after the call
	addl	%eax, %r12d     ; add the call's result to this level's running sum
	subl	$2, %r14d       ; step this level's argument down by 2
	je	.L54
	cmpl	$1, %r14d
	jne	.L16
	movl	%ebx, %edx
	movl	20(%rsp), %ebx
	addl	$1, %r12d
.L18:
	addl	%r12d, %r13d
	subl	$2, %ebx
	jne	.L20
.L45:
	movl	%ecx, %ebx
	leal	1(%r13), %esi
	movl	%ebp, %ecx
	addl	%esi, %ecx
	subl	$2, %ebx
	cmpl	$1, %edx
	jne	.L21
.L46:
	movl	12(%rsp), %r14d
	movl	%ecx, %ebx
	movl	8(%rsp), %ebp
	addl	$1, %ebx
	movl	4(%rsp), %r13d
	addl	%ebx, %r15d
	subl	$2, %r14d
	cmpl	$1, %ebp
	jne	.L22
.L47:
	movl	76(%rsp), %ebp
	movl	72(%rsp), %r12d
	addl	$1, %r15d
	addl	%r15d, 16(%rsp)
	subl	$2, %ebp
	cmpl	$1, %r12d
	jne	.L23
.L48:
	movl	16(%rsp), %r14d
	movl	56(%rsp), %r15d
	movl	68(%rsp), %r12d
	movl	64(%rsp), %r11d
	addl	$1, %r14d
	movl	60(%rsp), %edi
	addl	%r14d, %r15d
	subl	$2, %r12d
	cmpl	$1, %r11d
	jne	.L24
.L49:
	movl	48(%rsp), %r14d
	movl	%r13d, %ebp
	addl	$1, %r15d
	movl	52(%rsp), %ecx
	movl	44(%rsp), %r12d
	subl	$2, %ebp
	addl	%r15d, %r14d
	cmpl	$1, %edi
	jne	.L25
.L50:
	movl	40(%rsp), %ebp
	addl	$1, %r14d
	movl	36(%rsp), %r15d
	addl	%r14d, %r12d
	subl	$2, %ebp
	cmpl	$1, %ecx
	jne	.L26
.L51:
	movl	%r12d, %r14d
	movl	32(%rsp), %r13d
	movl	28(%rsp), %r12d
	subl	$2, %r15d
	addl	$1, %r14d
	addl	%r14d, %r12d
	cmpl	$1, %r13d
	jne	.L27
.L52:
	addl	$1, %r12d
.L2:
	addq	$88, %rsp       ; epilogue: release the spill area
	.cfi_remember_state
	.cfi_def_cfa_offset 56
	movl	%r12d, %eax     ; return value = r12d
	popq	%rbx            ; restore callee-saved registers in reverse push order
	.cfi_def_cfa_offset 48
	popq	%rbp
	.cfi_def_cfa_offset 40
	popq	%r12
	.cfi_def_cfa_offset 32
	popq	%r13
	.cfi_def_cfa_offset 24
	popq	%r14
	.cfi_def_cfa_offset 16
	popq	%r15
	.cfi_def_cfa_offset 8
	ret
	.p2align 4,,10
	.p2align 3
.L54:
	.cfi_restore_state
	movl	%ebx, %edx
	movl	20(%rsp), %ebx
	jmp	.L18
	.cfi_endproc
.LFE0:
	.size	_Z3fibi, .-_Z3fibi
	.ident	"GCC: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"
	.section	.note.GNU-stack,"",@progbits
	.section	.note.gnu.property,"a"
	.align 8
	.long	1f - 0f
	.long	4f - 1f
	.long	5
0:
	.string	"GNU"
1:
	.align 8
	.long	0xc0000002
	.long	3f - 2f
2:
	.long	0x3
3:
	.align 8
4:

性能權衡
O2 下編譯器把 fib(x - 2) 這一支遞歸改寫成了循環，并把 fib(x - 1) 的調用內聯展開了很多層，整個函數里只剩下一條 call 指令。這種優化帶來了性能提升，但代價是代碼大小增加：

優點：

減少函數調用開銷
更好的指令級并行性
更高效的內存訪問模式

缺點：

代碼大小顯著增加
可讀性降低
可能增加指令緩存壓力

O3

和 O2 一樣，略。

尾遞歸優化

// Returns 0 for x <= 0; otherwise recurses on x - 1 and adds 1 to that result.
int foo(int x) {
    if (x <= 0) {
        return 0;
    }
    // The + 1 is applied after the recursive call returns.
    return foo(x - 1) + 1;
}

O1

我們這次先看 O1 的結果，因為它比較符合我們對源代碼（C++）的認知。

	.file	"a.cpp"
	.text
	; -O1 build of foo(x): still a real recursive call; the base case returns before any stack adjustment.
	.globl	_Z3fooi
	.type	_Z3fooi, @function
_Z3fooi:
.LFB0:
	.cfi_startproc
	endbr64
	movl	$0, %eax        ; preset the return value to 0
	testl	%edi, %edi      ; set flags from x
	jle	.L5                 ; x <= 0: return 0 directly
	subq	$8, %rsp        ; reserve 8 bytes of stack, used only to keep %rsp 16-byte aligned at the call
	.cfi_def_cfa_offset 16
	subl	$1, %edi        ; argument = x - 1
	call	_Z3fooi         ; foo(x - 1)
	addl	$1, %eax        ; result + 1
	addq	$8, %rsp        ; release the 8 bytes reserved above
	.cfi_def_cfa_offset 8
	ret
.L5:
	ret
	.cfi_endproc
.LFE0:
	.size	_Z3fooi, .-_Z3fooi
	.ident	"GCC: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"
	.section	.note.GNU-stack,"",@progbits
	.section	.note.gnu.property,"a"
	.align 8
	.long	1f - 0f
	.long	4f - 1f
	.long	5
0:
	.string	"GNU"
1:
	.align 8
	.long	0xc0000002
	.long	3f - 2f
2:
	.long	0x3
3:
	.align 8
4:

O0

	.file	"a.cpp"
	.text
	; -O0 build of foo(x): full %rbp frame, x kept in the stack slot at -4(%rbp).
	.globl	_Z3fooi
	.type	_Z3fooi, @function
_Z3fooi:
.LFB0:
	.cfi_startproc
	endbr64
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register 6
	subq	$16, %rsp       ; reserve 16 bytes of stack
	movl	%edi, -4(%rbp)  ; store x in the frame
	cmpl	$0, -4(%rbp)
	jg	.L2                 ; x > 0: take the recursive path
	movl	$0, %eax        ; x <= 0: return 0
	jmp	.L3
.L2:
	movl	-4(%rbp), %eax
	subl	$1, %eax        ; eax = x - 1
	movl	%eax, %edi      ; argument = x - 1
	call	_Z3fooi         ; foo(x - 1)
	addl	$1, %eax        ; result + 1
.L3:
	leave                   ; tear down the frame
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE0:
	.size	_Z3fooi, .-_Z3fooi
	.ident	"GCC: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"
	.section	.note.GNU-stack,"",@progbits
	.section	.note.gnu.property,"a"
	.align 8
	.long	1f - 0f
	.long	4f - 1f
	.long	5
0:
	.string	"GNU"
1:
	.align 8
	.long	0xc0000002
	.long	3f - 2f
2:
	.long	0x3
3:
	.align 8
4:

O2

	.file	"a.cpp"
	.text
	; -O2 build of foo(x): no call and no loop; the whole function is three instructions that yield x when x >= 0 and 0 otherwise.
	.p2align 4
	.globl	_Z3fooi
	.type	_Z3fooi, @function
_Z3fooi:
.LFB0:
	.cfi_startproc
	endbr64
	xorl	%eax, %eax  ; eax = 0: the result for negative x
	testl	%edi, %edi  ; set SF from x
	cmovns	%edi, %eax  ; x is non-negative (SF clear): eax = x; otherwise eax stays 0
	ret
	.cfi_endproc
.LFE0:
	.size	_Z3fooi, .-_Z3fooi
	.ident	"GCC: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"
	.section	.note.GNU-stack,"",@progbits
	.section	.note.gnu.property,"a"
	.align 8
	.long	1f - 0f
	.long	4f - 1f
	.long	5
0:
	.string	"GNU"
1:
	.align 8
	.long	0xc0000002
	.long	3f - 2f
2:
	.long	0x3
3:
	.align 8
4:

編譯器完全識別并消除了遞歸調用：x > 0 時 foo(x) 恒等于 x，所以整個函數被化簡成 max(x, 0)，用一條 cmovns 實現，既沒有循環也沒有 call。

尾遞歸優化其二

// Returns 0 for x <= 0; otherwise foo(x - 1) plus the term x & (x - 1).
int foo(int x) {
    if (x <= 0) {
        return 0;
    }
    // The term depends on x, so x must stay available after the recursive call returns.
    return foo(x - 1) + (x & (x - 1));
}

O0

	.file	"a.cpp"
	.text
	; -O0 build of the second foo(x): full %rbp frame; x is reloaded from -4(%rbp) after the call to form x & (x - 1).
	.globl	_Z3fooi
	.type	_Z3fooi, @function
_Z3fooi:
.LFB0:
	.cfi_startproc
	endbr64
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register 6
	subq	$16, %rsp        ; reserve 16 bytes of stack
	movl	%edi, -4(%rbp)   ; store x in the frame
	cmpl	$0, -4(%rbp)     
	jg	.L2                  ; x > 0: take the recursive path
	movl	$0, %eax
	jmp	.L3                  ; otherwise return 0
.L2:
	movl	-4(%rbp), %eax   ; eax = x
	subl	$1, %eax         ; eax = x - 1
	movl	%eax, %edi       ; argument = x - 1
	call	_Z3fooi
	movl	-4(%rbp), %edx   ; edx = x, reloaded after the call
	subl	$1, %edx         ; edx = x - 1
	andl	-4(%rbp), %edx   ; edx = x & (x - 1)
	addl	%edx, %eax       ; return value = foo(x - 1) + (x & (x - 1))
.L3:
	leave
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE0:
	.size	_Z3fooi, .-_Z3fooi
	.ident	"GCC: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"
	.section	.note.GNU-stack,"",@progbits
	.section	.note.gnu.property,"a"
	.align 8
	.long	1f - 0f
	.long	4f - 1f
	.long	5
0:
	.string	"GNU"
1:
	.align 8
	.long	0xc0000002
	.long	3f - 2f
2:
	.long	0x3
3:
	.align 8
4:

O1

	.file	"a.cpp"
	.text
	; -O1 build of the second foo(x): x and x - 1 are held in callee-saved %ebp / %ebx across the recursive call.
	.globl	_Z3fooi
	.type	_Z3fooi, @function
_Z3fooi:
.LFB0:
	.cfi_startproc
	endbr64
	movl	$0, %eax           ; preset the return value to 0
	testl	%edi, %edi
	jle	.L5                    ; x <= 0: return 0 before touching the stack
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	pushq	%rbx               ; save %rbx (callee-saved) before using it
	.cfi_def_cfa_offset 24
	.cfi_offset 3, -24
	subq	$8, %rsp           ; reserve 8 bytes of stack; with the two pushes this keeps %rsp 16-byte aligned at the call
	.cfi_def_cfa_offset 32
	movl	%edi, %ebp         ; ebp = x
	leal	-1(%rdi), %ebx     ; ebx = x - 1
	movl	%ebx, %edi         ; argument = x - 1
	call	_Z3fooi            ; foo(x - 1)
	andl	%ebp, %ebx         ; ebx = x & (x - 1)
	addl	%ebx, %eax         ; eax = foo(x - 1) + (x & (x - 1))
	addq	$8, %rsp           ; release the 8 bytes reserved above
	.cfi_def_cfa_offset 24
	popq	%rbx
	.cfi_def_cfa_offset 16
	popq	%rbp
	.cfi_def_cfa_offset 8
	ret
.L5:
	.cfi_restore 3
	.cfi_restore 6
	ret
	.cfi_endproc
.LFE0:
	.size	_Z3fooi, .-_Z3fooi
	.ident	"GCC: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"
	.section	.note.GNU-stack,"",@progbits
	.section	.note.gnu.property,"a"
	.align 8
	.long	1f - 0f
	.long	4f - 1f
	.long	5
0:
	.string	"GNU"
1:
	.align 8
	.long	0xc0000002
	.long	3f - 2f
2:
	.long	0x3
3:
	.align 8
4:

O2

	.file	"a.cpp"
	.text
	; -O2 build of the second foo(x): the recursion has become a loop; %edx accumulates the terms and %edi counts down to 0.
	.p2align 4
	.globl	_Z3fooi
	.type	_Z3fooi, @function
_Z3fooi:
.LFB0:
	.cfi_startproc
	endbr64
	xorl	%edx, %edx  ; edx = 0: the accumulator
	testl	%edi, %edi 
	jle	.L1             ; x <= 0: skip the loop and return 0
	.p2align 4,,10
	.p2align 3
.L2:
	movl	%edi, %eax  ; eax = current x
	subl	$1, %edi    ; edi = x - 1
	andl	%edi, %eax  ; eax = x & (x - 1)
	addl	%eax, %edx  ; accumulator += x & (x - 1)
	testl	%edi, %edi
	jne	.L2             ; repeat while the decremented value is non-zero
.L1:
	movl	%edx, %eax  ; return value = accumulator
	ret
	.cfi_endproc
.LFE0:
	.size	_Z3fooi, .-_Z3fooi
	.ident	"GCC: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"
	.section	.note.GNU-stack,"",@progbits
	.section	.note.gnu.property,"a"
	.align 8
	.long	1f - 0f
	.long	4f - 1f
	.long	5
0:
	.string	"GNU"
1:
	.align 8
	.long	0xc0000002
	.long	3f - 2f
2:
	.long	0x3
3:
	.align 8
4:

O2 優化將遞歸完全轉換為迭代，消除所有函數調用開銷。

O3

這個 O3 優化版本展示了編譯器如何通過向量化（SIMD）和循環展開來進一步優化代碼。我們不講解具體的實現，僅僅介紹大體的框架。

向量化主循環：使用 SSE2 整數指令（paddd、pand 等）并行處理 4 個元素
標量尾端處理：處理不能被 4 整除的剩余元素
循環展開：進一步優化標量部分

	.file	"a.cpp"
	.text
	; -O3 build of the second foo(x): a 4-lane vector loop sums v & (v - 1) for four consecutive values per iteration,
	; then an unrolled scalar tail (.L3) handles the remaining values. x <= 4 goes straight to the scalar tail.
	.p2align 4
	.globl	_Z3fooi
	.type	_Z3fooi, @function
_Z3fooi:
.LFB0:
	.cfi_startproc
	endbr64
	testl	%edi, %edi
	jle	.L19                    ; x <= 0: return 0
	cmpl	$4, %edi
	jle	.L8                     ; x <= 4: scalar tail only
	movd	%edi, %xmm6
	movl	%edi, %edx
	movdqa	.LC1(%rip), %xmm5   ; xmm5 = {-4, -4, -4, -4}: per-iteration step
	xorl	%eax, %eax              ; eax = vector iteration counter
	pshufd	$0, %xmm6, %xmm2    ; xmm2 = {x, x, x, x}
	shrl	$2, %edx                ; edx = x / 4: number of vector iterations
	pxor	%xmm0, %xmm0            ; xmm0 = four partial sums, all 0
	pcmpeqd	%xmm4, %xmm4        ; xmm4 = all ones, i.e. -1 in every lane
	paddd	.LC0(%rip), %xmm2   ; xmm2 = {x, x - 1, x - 2, x - 3}
	.p2align 4,,10
	.p2align 3
.L5:
	movdqa	%xmm2, %xmm3        ; xmm3 = current four values v
	addl	$1, %eax
	paddd	%xmm5, %xmm2        ; advance to the next four values (each lane - 4)
	movdqa	%xmm3, %xmm1
	paddd	%xmm4, %xmm1        ; xmm1 = v - 1
	pand	%xmm3, %xmm1            ; xmm1 = v & (v - 1)
	paddd	%xmm1, %xmm0        ; accumulate per lane
	cmpl	%eax, %edx
	jne	.L5
	movdqa	%xmm0, %xmm1        ; horizontal sum of the four lanes starts here
	movl	%edi, %edx
	psrldq	$8, %xmm1
	andl	$-4, %edx               ; edx = x rounded down to a multiple of 4 (values already handled)
	paddd	%xmm1, %xmm0
	movdqa	%xmm0, %xmm1
	psrldq	$4, %xmm1
	paddd	%xmm1, %xmm0
	movd	%xmm0, %eax             ; eax = sum from the vector loop
	testb	$3, %dil
	je	.L1                     ; x is a multiple of 4: nothing left for the tail
	subl	%edx, %edi              ; edi = remaining count (x minus the handled part)
.L3:
	leal	-1(%rdi), %edx          ; scalar tail, unrolled up to four terms: edx = v - 1
	movl	%edx, %ecx
	andl	%edi, %ecx              ; ecx = v & (v - 1)
	addl	%ecx, %eax
	testl	%edx, %edx
	je	.L1                     ; reached 0: done
	leal	-2(%rdi), %ecx
	andl	%ecx, %edx              ; edx = (v - 1) & (v - 2)
	addl	%edx, %eax
	testl	%ecx, %ecx
	je	.L1
	leal	-3(%rdi), %edx
	andl	%edx, %ecx              ; ecx = (v - 2) & (v - 3)
	addl	%ecx, %eax
	testl	%edx, %edx
	je	.L1
	subl	$4, %edi
	andl	%edx, %edi              ; edi = (v - 3) & (v - 4)
	addl	%edi, %eax
.L1:
	ret
	.p2align 4,,10
	.p2align 3
.L19:
	xorl	%eax, %eax              ; return 0
	ret
.L8:
	xorl	%eax, %eax              ; sum = 0, then run the scalar tail on x itself
	jmp	.L3
	.cfi_endproc
.LFE0:
	.size	_Z3fooi, .-_Z3fooi
	.section	.rodata.cst16,"aM",@progbits,16
	.align 16
.LC0:
	; lane offsets added to the broadcast x
	.long	0
	.long	-1
	.long	-2
	.long	-3
	.align 16
.LC1:
	; per-iteration step for every lane
	.long	-4
	.long	-4
	.long	-4
	.long	-4
	.ident	"GCC: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"
	.section	.note.GNU-stack,"",@progbits
	.section	.note.gnu.property,"a"
	.align 8
	.long	1f - 0f
	.long	4f - 1f
	.long	5
0:
	.string	"GNU"
1:
	.align 8
	.long	0xc0000002
	.long	3f - 2f
2:
	.long	0x3
3:
	.align 8
4:

posted @ 2025-09-17 17:00 Imakf 閱讀(14) 評論(0) 收藏舉報

刷新頁面返回頂部

Imakf