From de92f819f07e2ccfb1e56b913ef86b5473d79109 Mon Sep 17 00:00:00 2001 From: spaceface Date: Sat, 18 Dec 2021 16:33:24 +0100 Subject: [PATCH] cgen: support closures with any number of parameters of any size on amd64 (#12891) --- cmd/tools/vtest-self.v | 2 + vlib/v/gen/c/cheaders.v | 123 ++++++++++++++++++++------ vlib/v/gen/c/fn.v | 16 +++- vlib/v/tests/closure_generator_test.v | 76 ++++++++++++++++ 4 files changed, 189 insertions(+), 28 deletions(-) create mode 100644 vlib/v/tests/closure_generator_test.v diff --git a/cmd/tools/vtest-self.v b/cmd/tools/vtest-self.v index 091703edee..719217e2a0 100644 --- a/cmd/tools/vtest-self.v +++ b/cmd/tools/vtest-self.v @@ -109,6 +109,7 @@ const ( 'vlib/orm/orm_test.v', 'vlib/v/tests/orm_sub_struct_test.v', 'vlib/v/tests/closure_test.v', + 'vlib/v/tests/closure_generator_test.v', 'vlib/net/websocket/ws_test.v', 'vlib/net/unix/unix_test.v', 'vlib/net/websocket/websocket_test.v', @@ -132,6 +133,7 @@ const ( 'do_not_remove', ] skip_on_arm64 = [ + 'vlib/v/tests/closure_generator_test.v', 'do_not_remove', ] skip_on_non_amd64_or_arm64 = [ diff --git a/vlib/v/gen/c/cheaders.v b/vlib/v/gen/c/cheaders.v index 288a82e38e..71ede72787 100644 --- a/vlib/v/gen/c/cheaders.v +++ b/vlib/v/gen/c/cheaders.v @@ -76,6 +76,49 @@ fn arm32_bytes(nargs int) string { return bytes.replace('', nargs.str()) } +// gen_amd64_bytecode generates the amd64 bytecode a closure with `nargs` parameters. +// NB: `nargs` includes the last `userdata` parameter that will be passed to the original +// function, and as such nargs must always be > 0 +fn amd64_bytes(nargs int) string { + match nargs { + 1 { + return '0x48, 0x8b, 0x3d, 0xe9, 0xff, 0xff, 0xff, 0xff, 0x25, 0xeb, 0xff, 0xff, 0xff' + } + 2 { + return '0x48, 0x8b, 0x35, 0xe9, 0xff, 0xff, 0xff, 0xff, 0x25, 0xeb, 0xff, 0xff, 0xff' + } + 3 { + return '0x48, 0x8b, 0x15, 0xe9, 0xff, 0xff, 0xff, 0xff, 0x25, 0xeb, 0xff, 0xff, 0xff' + } + 4 { + return '0x48, 0x8b, 0x0d, 0xe9, 0xff, 0xff, 0xff, 0xff, 0x25, 0xeb, 0xff, 0xff, 0xff' + } + 5 { + return '0x4C, 0x8b, 0x05, 0xe9, 0xff, 0xff, 0xff, 0xff, 0x25, 0xeb, 0xff, 0xff, 0xff' + } + 6 { + return '0x4C, 0x8b, 0x0d, 0xe9, 0xff, 0xff, 0xff, 0xff, 0x25, 0xeb, 0xff, 0xff, 0xff' + } + else { + // see https://godbolt.org/z/64e5TEf5n for similar assembly + mut sb := strings.new_builder(256) + s := (((byte(nargs) & 1) + 1) << 3).hex() + sb.write_string('0x48, 0x83, 0xec, 0x$s, ') // sub rsp,0x8 sub rsp,0x10 + sb.write_string('0xff, 0x35, 0xe6, 0xff, 0xff, 0xff, ') // push QWORD PTR [rip+0xffffffffffffffe6] + + rsp_offset := byte(0x18 + ((byte(nargs - 7) >> 1) << 4)).hex() + for _ in 0 .. nargs - 7 { + sb.write_string('0xff, 0xb4, 0x24, 0x$rsp_offset, 0x00, 0x00, 0x00, ') // push QWORD PTR [rsp+$rsp_offset] + } + sb.write_string('0xff, 0x15, 0x${byte(256 - sb.len / 6 - 6 - 8).hex()}, 0xff, 0xff, 0xff, ') // call QWORD PTR [rip+OFFSET] + sb.write_string('0x48, 0x81, 0xc4, 0x$rsp_offset, 0x00, 0x00, 0x00, ') // add rsp,$rsp_offset + sb.write_string('0xc3') // ret + + return sb.str() + } + } +} + // Heavily based on Chris Wellons's work // https://nullprogram.com/blog/2017/01/08/ @@ -90,33 +133,50 @@ fn c_closure_helpers(pref &pref.Preferences) string { if pref.os != .windows { builder.writeln('#include ') } + // TODO: support additional arguments by pushing them onto the stack + // https://en.wikipedia.org/wiki/Calling_convention if pref.arch == .amd64 { + // TODO: the `amd64_bytes()` function above should work for an arbitrary* number of arguments, + // so we should just remove the table and call the function directly at runtime builder.write_string(' -static unsigned char __closure_thunk[6][13] = { - { - 0x48, 0x8b, 0x3d, 0xe9, 0xff, 0xff, 0xff, - 0xff, 0x25, 0xeb, 0xff, 0xff, 0xff - }, { - 0x48, 0x8b, 0x35, 0xe9, 0xff, 0xff, 0xff, - 0xff, 0x25, 0xeb, 0xff, 0xff, 0xff - }, { - 0x48, 0x8b, 0x15, 0xe9, 0xff, 0xff, 0xff, - 0xff, 0x25, 0xeb, 0xff, 0xff, 0xff - }, { - 0x48, 0x8b, 0x0d, 0xe9, 0xff, 0xff, 0xff, - 0xff, 0x25, 0xeb, 0xff, 0xff, 0xff - }, { - 0x4C, 0x8b, 0x05, 0xe9, 0xff, 0xff, 0xff, - 0xff, 0x25, 0xeb, 0xff, 0xff, 0xff - }, { - 0x4C, 0x8b, 0x0d, 0xe9, 0xff, 0xff, 0xff, - 0xff, 0x25, 0xeb, 0xff, 0xff, 0xff - }, +static unsigned char __closure_thunk[32][${amd64_bytes(31).len / 6 + + 2}] = { + { ${amd64_bytes(1)} }, + { ${amd64_bytes(2)} }, + { ${amd64_bytes(3)} }, + { ${amd64_bytes(4)} }, + { ${amd64_bytes(5)} }, + { ${amd64_bytes(6)} }, + { ${amd64_bytes(7)} }, + { ${amd64_bytes(8)} }, + { ${amd64_bytes(9)} }, + { ${amd64_bytes(10)} }, + { ${amd64_bytes(11)} }, + { ${amd64_bytes(12)} }, + { ${amd64_bytes(13)} }, + { ${amd64_bytes(14)} }, + { ${amd64_bytes(15)} }, + { ${amd64_bytes(16)} }, + { ${amd64_bytes(17)} }, + { ${amd64_bytes(18)} }, + { ${amd64_bytes(19)} }, + { ${amd64_bytes(20)} }, + { ${amd64_bytes(21)} }, + { ${amd64_bytes(22)} }, + { ${amd64_bytes(23)} }, + { ${amd64_bytes(24)} }, + { ${amd64_bytes(25)} }, + { ${amd64_bytes(26)} }, + { ${amd64_bytes(27)} }, + { ${amd64_bytes(28)} }, + { ${amd64_bytes(29)} }, + { ${amd64_bytes(30)} }, + { ${amd64_bytes(31)} }, }; ') } else if pref.arch == .arm64 { builder.write_string(' -static unsigned char __closure_thunk[6][12] = { +static unsigned char __closure_thunk[8][12] = { { ${arm64_bytes(0)} }, { @@ -129,12 +189,16 @@ static unsigned char __closure_thunk[6][12] = { ${arm64_bytes(4)} }, { ${arm64_bytes(5)} + }, { + ${arm64_bytes(6)} + }, { + ${arm64_bytes(7)} }, }; ') } else if pref.arch == .arm32 { builder.write_string(' -static unsigned char __closure_thunk[6][12] = { +static unsigned char __closure_thunk[4][12] = { { ${arm32_bytes(0)} }, { @@ -143,10 +207,6 @@ static unsigned char __closure_thunk[6][12] = { ${arm32_bytes(2)} }, { ${arm32_bytes(3)} - }, { - ${arm32_bytes(4)} - }, { - ${arm32_bytes(5)} }, }; ') @@ -161,6 +221,14 @@ static void __closure_set_function(void *closure, void *f) { void **p = closure; p[-1] = f; } + +static inline int __closure_check_nargs(int nargs) { + if (nargs > (int)_ARR_LEN(__closure_thunk)) { + _v_panic(_SLIT("Closure too large. Reduce the number of parameters, or pass the parameters by reference.")); + VUNREACHABLE(); + } + return nargs; +} ') if pref.os != .windows { builder.write_string(' @@ -260,6 +328,9 @@ const c_common_macros = ' #define __offsetof(PTYPE,FIELDNAME) ((size_t)((char *)&((PTYPE *)0)->FIELDNAME - (char *)0)) #endif +// returns the number of CPU registers that TYPE takes up +#define _REG_WIDTH(T) (((sizeof(T) + sizeof(void*) - 1) & ~(sizeof(void*) - 1)) / sizeof(void*)) + #define OPTION_CAST(x) (x) #ifndef V64_PRINTFORMAT diff --git a/vlib/v/gen/c/fn.v b/vlib/v/gen/c/fn.v index 4d3a67d9f0..f40c1b61c0 100644 --- a/vlib/v/gen/c/fn.v +++ b/vlib/v/gen/c/fn.v @@ -452,8 +452,20 @@ fn (mut g Gen) gen_anon_fn(mut node ast.AnonFn) { // it may be possible to optimize `memdup` out if the closure never leaves current scope ctx_struct := closure_ctx_struct(node.decl) // TODO in case of an assignment, this should only call "__closure_set_data" and "__closure_set_function" (and free the former data) - g.write('__closure_create($node.decl.name, ${node.decl.params.len + 1}, ') - g.writeln('($ctx_struct*) memdup(&($ctx_struct){') + + mut size_sb := strings.new_builder(node.decl.params.len * 50) + for param in node.decl.params { + size_sb.write_string('_REG_WIDTH(${g.typ(param.typ)}) + ') + } + size_sb.write_string('1') + args_size := size_sb.str() + g.writeln('') + + // ensure that nargs maps to a known entry in the __closure_thunk array + // TODO make it a compile-time error (you can't call `sizeof()` inside preprocessor `#if`s) + // NB: this won't be necessary when (if) we have functions that return the machine code for + // an arbitrary number of arguments + g.write('__closure_create($node.decl.name, __closure_check_nargs($args_size), ($ctx_struct*) memdup(&($ctx_struct){') g.indent++ for var in node.inherited_vars { g.writeln('.$var.name = $var.name,') diff --git a/vlib/v/tests/closure_generator_test.v b/vlib/v/tests/closure_generator_test.v new file mode 100644 index 0000000000..7fec0db0e2 --- /dev/null +++ b/vlib/v/tests/closure_generator_test.v @@ -0,0 +1,76 @@ +import strings +import os + +const ( + max_params = 16 + all_param_names = []string{len: max_params, init: '${`a` + it}'} + all_param_values = []string{len: max_params, init: '${it + 1}'} +) + +// test_closures_with_n_args generates a new V file containing closures of `i` +// and parameters of type `typ`, to makes sure that all combinations work correctly +fn test_closures_with_n_args() ? { + mut v_code := strings.new_builder(1024) + // NB: the type or value of the captured arg doesn't matter for this test, + // as the entire closure context is always passed as one pointer anyways + + for typ in ['byte', 'u16', 'int', 'i64', 'voidptr', 'string'] { + for i in 0 .. max_params { + param_names := all_param_names[..i] + params := param_names.map('$it $typ') + + mut values := all_param_values[..i] + if typ == 'string' { + values = values.map("'$it'") + } + values = values.map('${typ}($it)') + + mut expected_val := if typ == 'string' { + s := all_param_values[..i].join('') + "'127' + '$s'" + } else { + '127 + ${i * (i + 1) / 2}' + } + + init_val, return_type := if typ != 'string' { + 'u64(127)', 'u64' + } else { + "'127'", 'string' + } + + // NB: the captured arg doesn't matter for this test, as closures always receive + // a pointer to the entire closure context as their last argument anyways + v_code.writeln(" + fn test_big_closure_${typ}_${i}() { + println('test_big_closure_${typ}_$i') + mut z := $init_val + c := fn [z] (${params.join(', ')}) $return_type { + mut sum := z") + for j in 0 .. i { + v_code.writeln('\t\tsum += ${return_type}(${param_names[j]})') + } + v_code.writeln(' + return sum + } + assert c(${values.join(', ')}) == $expected_val + }') + } + } + + code := v_code.str() + println('Compiling V code (${code.count('\n')} lines) ...') + wrkdir := os.join_path(os.temp_dir(), 'vtests', 'closures') + os.mkdir_all(wrkdir) ? + os.chdir(wrkdir) ? + os.write_file('closure_args_test.v', code) ? + vexe := os.getenv('VEXE') + res := os.execute('"$vexe" closure_args_test.v') + if res.exit_code != 0 { + eprintln(res.output) + assert false + } + println('Process exited with code $res.exit_code') + + os.chdir(os.dir(vexe)) or {} + os.rmdir_all(wrkdir) or {} +}