Pass function arguments by value if they're small

Posted on 2020-06-25 | In C++

C++ Core Guideline中关于函数参数传递，有两条规则

F.15: Prefer simple and conventional ways of passing information
F.16: Pass cheaply-copied types by value and others by reference to const

这篇文章我们将以ARM64平台(iOS)为例，讨论一下这背后的原因。

How argument passing works at the CPU level

C里面的函数调用通常由一系列instruction构成，这些instruction需要遵循各个平台的calling convention

函数参数需要根据calling convention放到合适的地方。这样被调用的函数知道去哪里找
函数的返回地址需要被放到某个位置，这样callee会将控制权返回给caller
control会指向function的第一条指令

在modern的CPU上，具体表现为

一些pointer-sized函数参数会被放到某些特殊的寄存器中，其它比较大的参数放到stack上，他们在stack上的地址会被保存到寄存器中。当然，这里也会有例外，如果参数大小正好是2个pointer-size或者参数是floating-point的类型，它们同样也可以直接放到寄存器中。另外，可变参数的行为也不一样(printf(const char*, ...))，这里不做展开
汇编中会有类似于call这样的instruction用来Jump到目标函数，并且stash返回return address
汇编中会有类似于return这样的命令跳转到上一步stash的函数地址，以便继续执行后面的指令

对于iOS的ARM64 CPU来说

编译器会尝试将函数的前8个参数放到通用64bit寄存器 x0-x7，或者v0-v7中(SIMD/floating-point registers)。Integer或者指针类型的参数会被放到x0-x7中，float/double/long double参数会被放到v0-v7中。对于比较大的(大于16 bytes)struct参数，它们会被copy到stack上，指向这份copy的pointer会被放到寄存器x0-x7中 (Note: this is easy to miss: it’s item B.3 in section 5.4.2 of “Procedure Call Standard for the ARM 64-bit Architecture”)。对于小的struct(不超过16 bytes)，它们会被放到一个或者两个寄存器中。这里有一个例外，如果一个struct只包含不超过4个相同类型的floating-point成员(即Homogeneous Floating-point Aggregate)，它也会被放到寄存器(v0-v7)中。
bl (branch and link) 指令用于函数调用，它会将control交给其操作数所指定的目标地址，并将返回地址存到一个特殊的link register中(“LR”，即x30)
ret指令用于函数返回，它将control交给LR寄存器(x30)中保存的返回地址。(注意：blr是“branch with link to register”，用于通过寄存器进行间接函数调用，而不是函数返回。)

Pass cheaply-copied types by value

我们来看一个例子

// Callees under comparison: the same int is handed over by value, by pointer,
// and by const reference. Only declarations are given here.
void intByValue(int x);
void intByPointer(const int *x);
void intByReference(const int &x);
void doSomething();  // avoid privileging the "right way" further
 
// Passes a local int by value. In the -Os listing below, the constant 123 is
// materialized directly in w0 and x never gets a stack slot.
void callIntByValue() {
  int x = 123;
  intByValue(x);
  doSomething();
}
 
// Passes the address of a local int. In the -Os listing below, x is first
// stored to the stack and its address is then computed into x0.
void callIntByPointer() {
  int x = 123;
  intByPointer(&x);  // &x is what ends up in x0
  doSomething();
}
 
// Passes a local int by const reference. The article notes that this compiles
// to the same assembly as callIntByPointer.
void callIntByReference() {
  int x = 123;
  intByReference(x);
  doSomething();
}

接下来我们用下面命令生成ARM64的汇编

 xcrun --sdk iphoneos clang++ -S -Os -arch arm64  ./test.cpp

得到的汇编代码如下，由于callIntByReference产生的汇编和callIntByPointer相同，我们下面只对比前两种情况


; callIntByValue(): the argument 123 is placed directly in w0; no stack slot
; is used for x.
callIntByValue:
	.cfi_startproc
; %bb.0:
	stp	x29, x30, [sp, #-16]!	; push frame pointer (x29) and link register (x30)
	mov	x29, sp	; x29 = new frame pointer
	.cfi_def_cfa w29, 16
	.cfi_offset w30, -8
	.cfi_offset w29, -16
	mov	w0, #123	; first integer argument goes in w0
	bl	intByValue	; call; return address is written to x30 (LR)
	ldp	x29, x30, [sp], #16	; restore x29/x30 and pop the 16-byte frame
	b	doSomething	; plain branch (no link): doSomething returns to our caller
	.cfi_endproc


; callIntByPointer(): x is stored to the stack and its address is passed in x0.
callIntByPointer:
	.cfi_startproc
; %bb.0:
	sub	sp, sp, #32	; 32-byte frame: saved x29/x30 at sp+16, x stored at sp+12
	stp	x29, x30, [sp, #16]	; save frame pointer and link register
	add	x29, sp, #16	; x29 = frame pointer
	.cfi_def_cfa w29, 16
	.cfi_offset w30, -8
	.cfi_offset w29, -16
	mov	w8, #123	; w8 = 123
	stur	w8, [x29, #-4]	; store x at x29 - 4
	sub	x0, x29, #4	; x0 = &x (first argument)
	bl	intByPointer
	bl	doSomething	; branch-with-link here, unlike the plain b in callIntByValue
	ldp	x29, x30, [sp, #16]	; restore frame pointer and link register
	add	sp, sp, #32	; release the frame
	ret	; return to the address held in x30
	.cfi_endproc

显然，传指针产生了更多的汇编指令。除了stack frame变大、最后一次调用不能使用tail call之外，如果我们逐条分析，会发现callIntByPointer为了传递参数多了下面两条关键指令

stur	w8, [x29, #-4]
sub	x0, x29, #4

ARM64上，x29用来保存frame pointer

在执行stur之前，123已经被放到了w8的register中。接下来w8中的值被保存(stur)到了stack上，然后sub计算出该值在stack上的地址，将其保存到x0中。这个操作可能并不慢，但却会增加code size

Pass others by reference to const

接下来我们看另一种情况

// A struct of 64 ints (256 bytes, per the article), used to show how an
// argument that does not fit in registers is passed.
struct BigStruct {
  int x[64];
};
// Callees under comparison: BigStruct by value vs. by pointer to const.
// Only declarations are given here.
void structByValue(BigStruct s);
void structByPointer(const BigStruct *s);
void doSomething();
 
// Receives s by value and forwards its address. In the listing below the
// incoming x1 is simply moved into x0 and no copy of the struct is made.
void callStructByPointer(int unused, BigStruct s) {
  structByPointer(&s);
  doSomething();
}
 
// Receives s by value and passes it by value again. In the listing below a
// fresh 256-byte copy is made on this function's stack before the call.
void callStructByValue(BigStruct s) {
  structByValue(s);
  doSomething();
}

上面我们定义了一个struct，它包含一个int数组，该数组有64个元素，因此这个struct一共占用256个字节(64 × 4 bytes)，显然它无法放到寄存器中。我们用同样的编译命令，对比上面两个函数产生的汇编代码


; callStructByPointer(int unused, BigStruct s): the pointer received in x1 is
; forwarded unchanged; no bytes of the struct are copied.
callStructByPointer:
	.cfi_startproc
; %bb.0:
	stp	x29, x30, [sp, #-16]!	; push frame pointer and link register
	mov	x29, sp	; x29 = new frame pointer
	.cfi_def_cfa w29, 16
	.cfi_offset w30, -8
	.cfi_offset w29, -16
	mov	x0, x1	; x1 = pointer to s (w0 held 'unused'); it becomes argument 1
	bl	structByPointer
	ldp	x29, x30, [sp], #16	; restore x29/x30 and pop the frame
	b	doSomething	; plain branch (no link): doSomething returns to our caller
	.cfi_endproc


; callStructByValue(BigStruct s): x0 holds a pointer to the incoming s. The 256
; bytes are copied to [sp, sp+256) with q-register pairs (32 bytes per ldp/stp)
; and the address of that copy is passed on.
callStructByValue:
	.cfi_startproc
; %bb.0:
	sub	sp, sp, #288 ; 288-byte frame: 256 for the copy + 32 for saved registers
	stp	x28, x27, [sp, #256]	; save x28/x27
	stp	x29, x30, [sp, #272]	; save frame pointer and link register
	add	x29, sp, #272	; x29 = frame pointer
	.cfi_def_cfa w29, 16
	.cfi_offset w30, -8
	.cfi_offset w29, -16
	.cfi_offset w27, -24
	.cfi_offset w28, -32
	ldp	q0, q1, [x0]	; load bytes 0..31 of s
	ldp	q2, q3, [x0, #32]	; load bytes 32..63
	stp	q2, q3, [sp, #32]	; store bytes 32..63 of the copy
	stp	q0, q1, [sp]	; store bytes 0..31 of the copy
	ldp	q0, q1, [x0, #64]	; bytes 64..127: same load/load/store/store pattern
	ldp	q2, q3, [x0, #96]
	stp	q2, q3, [sp, #96]
	stp	q0, q1, [sp, #64]
	ldp	q0, q1, [x0, #128]	; bytes 128..191
	ldp	q2, q3, [x0, #160]
	stp	q2, q3, [sp, #160]
	stp	q0, q1, [sp, #128]
	ldp	q0, q1, [x0, #192]	; bytes 192..255
	ldp	q2, q3, [x0, #224]
	stp	q2, q3, [sp, #224]
	stp	q0, q1, [sp, #192]
	mov	x0, sp	; x0 = address of the fresh copy (argument 1)
	bl	structByValue
	bl	doSomething
	ldp	x29, x30, [sp, #272]	; restore frame pointer and link register
	ldp	x28, x27, [sp, #256]	; restore x28/x27
	add	sp, sp, #288	; release the frame
	ret	; return to the address held in x30
	.cfi_endproc

显然，编译器需要更多指令来copy这个struct。这里ldp(load pair)和stp(store pair)是一对一次读/写两个寄存器的load和store指令，这里不做过多展开。如果我们将int[64]变成int[128]，或者将compiler的优化参数改为-Oz，则上面代码将变成

; callStructByValue(BigStruct), variant where the copy is made by calling
; memcpy instead of inline ldp/stp pairs. The size is 512 bytes here.
__Z17callStructByValue9BigStruct:
; %bb.0:
	stp	x28, x27, [sp, #-32]!           ; 16-byte Folded Spill
	stp	x29, x30, [sp, #16]             ; 16-byte Folded Spill
	add	x29, sp, #16                    ; =16
	sub	sp, sp, #512                    ; =512
	.cfi_def_cfa w29, 16
	.cfi_offset w30, -8
	.cfi_offset w29, -16
	.cfi_offset w27, -24
	.cfi_offset w28, -32
	mov	x1, x0	; memcpy source = incoming pointer to s
	mov	x0, sp	; memcpy destination = 512-byte buffer at sp
	mov	w2, #512	; memcpy length in bytes
	bl	_memcpy
	mov	x0, sp	; x0 = address of the copy (argument 1 of structByValue)
	bl	__Z13structByValue9BigStruct
	bl	__Z11doSomethingv
	add	sp, sp, #512                    ; =512
	ldp	x29, x30, [sp, #16]             ; 16-byte Folded Reload
	ldp	x28, x27, [sp], #32             ; 16-byte Folded Reload
	ret	; return to the address held in x30
	.cfi_endproc

这里可以看出，如果数组的元素少(并且没有指定-Oz)，编译器会对memcpy进行inline操作，直接用ldp/stp完成copy；而当数组比较大，或者用户指定了-Oz时，编译器则会直接调用memcpy函数。这里的调用相当于memcpy(&s_copy, &s, sizeof(s))，三个参数(x0为目标地址，x1为源地址，w2为大小)对应下面三条指令

mov	x1, x0
mov	x0, sp
mov	w2, #512

上面代码虽然变少了，但是速度上可能会比inline memcpy要慢。

bl (branch and link)指令会将control交给它的参数，并将返回地址存到一个特殊的link register中(“LR”)
ret指令用于函数返回。它将control交给LR寄存器(x30)中保存的返回地址

Pass function arguments by value if they're small

How argument passing works at the CPU level

Pass cheaply-copied types by value

Pass others by reference to const

Resources