From 7a2705d8ce756dff19f080216311b946c95f5567 Mon Sep 17 00:00:00 2001 From: jeffmikels Date: Tue, 18 Jan 2022 19:23:25 -0500 Subject: [PATCH] scanner: multibyte rune literals now support unicode, hex, and octal escape codes (#13140) --- .editorconfig | 5 + doc/docs.md | 194 ++++++++++++++---- examples/v_script.vsh | 0 .../tests/sum_type_ref_variant_err.out | 2 +- vlib/v/parser/tests/duplicate_type_a.out | 2 +- vlib/v/scanner/scanner.v | 89 ++++---- vlib/v/scanner/scanner_test.v | 44 +++- 7 files changed, 234 insertions(+), 102 deletions(-) mode change 100644 => 100755 examples/v_script.vsh diff --git a/.editorconfig b/.editorconfig index d0c5ae6bdb..c972a1d7aa 100644 --- a/.editorconfig +++ b/.editorconfig @@ -16,6 +16,11 @@ indent_size = 2 [*.md] trim_trailing_whitespace = false +# lines that are too long will trigger an error in cmd/tools/vcheck-md.v +# run v check-md [folder/file] to test markdown files +# the longest normal line is specified with this constant: +# `too_long_line_length_other = 100` +max_line_length = 100 [*.{txt,out}] insert_final_newline = false diff --git a/doc/docs.md b/doc/docs.md index cfc3c79c6f..bbbec48ae3 100644 --- a/doc/docs.md +++ b/doc/docs.md @@ -476,16 +476,33 @@ d := b + x // d is of type `f64` - automatic promotion of `x`'s value ### Strings -```v +```v nofmt name := 'Bob' -println(name.len) -println(name[0]) // indexing gives a byte B -println(name[1..3]) // slicing gives a string 'ob' -windows_newline := '\r\n' // escape special characters like in C +assert name.len == 3 // will print 3 +assert name[0] == byte(66) // indexing gives a byte, byte(66) == `B` +assert name[1..3] == 'ob' // slicing gives a string 'ob' + +// escape codes +windows_newline := '\r\n' // escape special characters like in C assert windows_newline.len == 2 + +// arbitrary bytes can be directly specified using `\x##` notation where `#` is +// a hex digit aardvark_str := '\x61ardvark' assert aardvark_str == 'aardvark' +assert '\xc0'[0] == byte(0xc0) + +// 
or using octal escape `\###` notation where `#` is an octal digit +aardvark_str2 := '\141ardvark' +assert aardvark_str2 == 'aardvark' + +// Unicode can be specified directly as `\u####` where # is a hex digit +// and will be converted internally to its UTF-8 representation +star_str := '\u2605' // ★ +assert star_str == '★' +assert star_str == '\xe2\x98\x85' // UTF-8 can be specified this way too. ``` -In V, a string is a read-only array of bytes. String data is encoded using UTF-8: +In V, a string is a read-only array of bytes. All Unicode characters are encoded using UTF-8: + ```v s := 'hello 🌎' // emoji takes 4 bytes assert s.len == 10 @@ -503,11 +520,12 @@ String values are immutable. You cannot mutate elements: mut s := 'hello 🌎' s[0] = `H` // not allowed ``` + > error: cannot assign to `s[i]` since V strings are immutable -Note that indexing a string will produce a `byte`, not a `rune` nor another `string`. -Indexes correspond to bytes in the string, not Unicode code points. If you want to -convert the `byte` to a `string`, use the `ascii_str()` method: +Note that indexing a string will produce a `byte`, not a `rune` nor another `string`. Indexes +correspond to _bytes_ in the string, not Unicode code points. If you want to convert the `byte` to a +`string`, use the `.ascii_str()` method on the `byte`: ```v country := 'Netherlands' @@ -515,20 +533,13 @@ println(country[0]) // Output: 78 println(country[0].ascii_str()) // Output: N ``` -Character literals have type `rune`. To denote them, use ` +Both single and double quotes can be used to denote strings. For consistency, `vfmt` converts double +quotes to single quotes unless the string contains a single quote character. + +For raw strings, prepend `r`. Escape handling is not done for raw strings: ```v -rocket := `🚀` -assert 'aloha!'[0] == `a` -``` - -Both single and double quotes can be used to denote strings. 
For consistency, -`vfmt` converts double quotes to single quotes unless the string contains a single quote character. - -For raw strings, prepend `r`. Raw strings are not escaped: - -```v -s := r'hello\nworld' +s := r'hello\nworld' // the `\n` will be preserved as two characters println(s) // "hello\nworld" ``` @@ -537,41 +548,79 @@ Strings can be easily converted to integers: ```v s := '42' n := s.int() // 42 + +// all int literals are supported +assert '0xc3'.int() == 195 +assert '0o10'.int() == 8 +assert '0b1111_0000_1010'.int() == 3850 +assert '-0b1111_0000_1010'.int() == -3850 ``` -### Runes -A `rune` represents a unicode character and is an alias for `u32`. Runes can be created like this: -```v -x := `🚀` -``` - -A string can be converted to runes by the `.runes()` method. -```v -hello := 'Hello World 👋' -hello_runes := hello.runes() // [`H`, `e`, `l`, `l`, `o`, ` `, `W`, `o`, `r`, `l`, `d`, ` `, `👋`] -``` +For more advanced `string` processing and conversions, refer to the +[vlib/strconv](https://modules.vlang.io/strconv.html) module. ### String interpolation -Basic interpolation syntax is pretty simple - use `$` before a variable name. -The variable will be converted to a string and embedded into the literal: +Basic interpolation syntax is pretty simple - use `$` before a variable name. The variable will be +converted to a string and embedded into the literal: + ```v name := 'Bob' println('Hello, $name!') // Hello, Bob! ``` -It also works with fields: `'age = $user.age'`. -If you need more complex expressions, use `${}`: `'can register = ${user.age > 13}'`. -Format specifiers similar to those in C's `printf()` are also supported. -`f`, `g`, `x`, etc. are optional and specify the output format. -The compiler takes care of the storage size, so there is no `hd` or `llu`. +It also works with fields: `'age = $user.age'`. If you need more complex expressions, use `${}`: +`'can register = ${user.age > 13}'`. 
+ +Format specifiers similar to those in C's `printf()` are also supported. `f`, `g`, `x`, `o`, `b`, +etc. are optional and specify the output format. The compiler takes care of the storage size, so +there is no `hd` or `llu`. + +To use a format specifier, follow this pattern: + +`${varname:[flags][width][.precision][type]}` + +- flags: may be zero or more of the following: `-` to left-align output within the field, `0` to use + `0` as the padding character instead of the default `space` character. (Note: V does not currently + support the use of `'` or `#` as format flags, and V supports but doesn't need `+` to right-align + since that's the default.) +- width: may be an integer value describing the minimum width of total field to output. +- precision: an integer value preceded by a `.` will guarantee that many digits after the decimal + point, if the input variable is a float. Ignored if variable is an integer. +- type: `f` and `F` specify the input is a float and should be rendered as such, `e` and `E` specify + the input is a float and should be rendered as an exponent (partially broken), `g` and `G` specify + the input is a float--the renderer will use floating point notation for small values and exponent + notation for large values, `d` specifies the input is an integer and should be rendered in base-10 + digits, `x` and `X` require an integer and will render it as hexadecimal digits, `o` requires an + integer and will render it as octal digits, `b` requires an integer and will render it as binary + digits, `s` requires a string (almost never used). + +Note: when a numeric type can render alphabetic characters, such as hex strings or special values +like `infinity`, the lowercase version of the type forces lowercase alphabetics and the uppercase +version forces uppercase alphabetics. + +Also note: in most cases, it's best to leave the format type empty. 
Floats will be rendered by +default as `g`, integers will be rendered by default as `d`, and `s` is almost always redundant. +There are only three cases where specifying a type is recommended: + +- format strings are parsed at compile time, so specifying a type can help detect errors then +- format strings default to using lowercase letters for hex digits and the `e` in exponents. Use an + uppercase type to force the use of uppercase hex digits and an uppercase `E` in exponents. +- format strings are the most convenient way to get hex, binary or octal strings from an integer. + +See +[Format Placeholder Specification](https://en.wikipedia.org/wiki/Printf_format_string#Format_placeholder_specification) +for more information. ```v x := 123.4567 -println('x = ${x:4.2f}') -println('[${x:10}]') // pad with spaces on the left => [ 123.457] -println('[${int(x):-10}]') // pad with spaces on the right => [123 ] +println('[${x:.2}]') // round to two decimal places => [123.46] +println('[${x:10}]') // right-align with spaces on the left => [ 123.457] +println('[${int(x):-10}]') // left-align with spaces on the right => [123 ] println('[${int(x):010}]') // pad with zeros on the left => [0000000123] +println('[${int(x):b}]') // output as binary => [1111011] +println('[${int(x):o}]') // output as octal => [173] +println('[${int(x):X}]') // output as uppercase hex => [7B] ``` ### String operators @@ -585,13 +634,14 @@ s += 'world' // `+=` is used to append to a string println(s) // "hello world" ``` -All operators in V must have values of the same type on both sides. -You cannot concatenate an integer to a string: +All operators in V must have values of the same type on both sides. 
You cannot concatenate an +integer to a string: ```v failcompile age := 10 println('age = ' + age) // not allowed ``` + > error: infix expr: cannot use `int` (right expression) as `string` We have to either convert `age` to a `string`: @@ -608,6 +658,62 @@ age := 12 println('age = $age') ``` +### Runes + +A `rune` represents a single Unicode character and is an alias for `u32`. To denote them, use ` +(backticks) : + +```v +rocket := `🚀` +``` + +A `rune` can be converted to a UTF-8 string by using the `.str()` method. + +```v +rocket := `🚀` +assert rocket.str() == '🚀' +``` + +A `rune` can be converted to UTF-8 bytes by using the `.bytes()` method. + +```v +rocket := `🚀` +assert rocket.bytes() == [byte(0xf0), 0x9f, 0x9a, 0x80] +``` + +Hex, Unicode, and Octal escape sequences also work in a `rune` literal: + +```v +assert `\x61` == `a` +assert `\141` == `a` +assert `\u0061` == `a` + +// multibyte literals work too +assert `\u2605` == `★` +assert `\u2605`.bytes() == [byte(0xe2), 0x98, 0x85] +assert `\xe2\x98\x85`.bytes() == [byte(0xe2), 0x98, 0x85] +assert `\342\230\205`.bytes() == [byte(0xe2), 0x98, 0x85] +``` + +Note that `rune` literals use the same escape syntax as strings, but they can only hold one unicode +character. Therefore, if your code does not specify a single Unicode character, you will receive an +error at compile time. + +Also remember that strings are indexed as bytes, not runes, so beware: + +```v +rocket_string := '🚀' +assert rocket_string[0] != `🚀` +assert 'aloha!'[0] == `a` +``` + +A string can be converted to runes by the `.runes()` method. 
+ +```v +hello := 'Hello World 👋' +hello_runes := hello.runes() // [`H`, `e`, `l`, `l`, `o`, ` `, `W`, `o`, `r`, `l`, `d`, ` `, `👋`] +``` + ### Numbers ```v diff --git a/examples/v_script.vsh b/examples/v_script.vsh old mode 100644 new mode 100755 diff --git a/vlib/v/checker/tests/sum_type_ref_variant_err.out b/vlib/v/checker/tests/sum_type_ref_variant_err.out index d893a2b13e..7392a0cb29 100644 --- a/vlib/v/checker/tests/sum_type_ref_variant_err.out +++ b/vlib/v/checker/tests/sum_type_ref_variant_err.out @@ -15,4 +15,4 @@ vlib/v/checker/tests/sum_type_ref_variant_err.vv:9:18: error: sum type cannot ho 7 | type Alphabet1 = Abc | string | &Xyz 8 | type Alphabet2 = Abc | &Xyz | string 9 | type Alphabet3 = &Xyz | Abc | string - | ~~~~ \ No newline at end of file + | ~~~~ diff --git a/vlib/v/parser/tests/duplicate_type_a.out b/vlib/v/parser/tests/duplicate_type_a.out index a2324a44d5..4700e533bd 100644 --- a/vlib/v/parser/tests/duplicate_type_a.out +++ b/vlib/v/parser/tests/duplicate_type_a.out @@ -1,5 +1,5 @@ vlib/v/parser/tests/duplicate_type_a.vv:3:11: error: cannot register interface `Foo`, another type with this name exists 1 | struct Foo {} - 2 | + 2 | 3 | interface Foo {} | ~~~ diff --git a/vlib/v/scanner/scanner.v b/vlib/v/scanner/scanner.v index 860a51d8ec..9bd32d9be5 100644 --- a/vlib/v/scanner/scanner.v +++ b/vlib/v/scanner/scanner.v @@ -1307,6 +1307,28 @@ fn decode_h_escapes(s string, start int, escapes_pos []int) string { return ss.join('') } +// handle single-byte inline octal escapes like '\###' +fn decode_o_escapes(s string, start int, escapes_pos []int) string { + if escapes_pos.len == 0 { + return s + } + mut ss := []string{cap: escapes_pos.len} + ss << s[..escapes_pos.first() - start] // everything before the first escape code position + for i, pos in escapes_pos { + idx := pos - start + end_idx := idx + 4 // "\XXX".len == 4 + // notice this function doesn't do any decoding... 
it just replaces '\141' with the byte 0o141 + ss << [byte(strconv.parse_uint(s[idx + 1..end_idx], 8, 8) or { 0 })].bytestr() + if i + 1 < escapes_pos.len { + ss << s[end_idx..escapes_pos[i + 1] - start] + } else { + ss << s[end_idx..] + } + } + return ss.join('') +} + +// decode the flagged unicode escape sequences into their utf-8 bytes fn decode_u_escapes(s string, start int, escapes_pos []int) string { if escapes_pos.len == 0 { return s @@ -1348,9 +1370,10 @@ fn trim_slash_line_break(s string) string { /// possibilities: /// single chars like `a`, `b` => 'a', 'b' /// escaped single chars like `\\`, `\``, `\n` => '\\', '`', '\n' -/// escaped hex bytes like `\x01`, `\x61` => '\x01', 'a' -/// escaped multibyte runes like `\xe29885` => (★) +/// escaped single hex bytes like `\x01`, `\x61` => '\x01', 'a' /// escaped unicode literals like `\u2605` +/// escaped utf8 runes in hex like `\xe2\x98\x85` => (★) +/// escaped utf8 runes in octal like `\342\230\205` => (★) fn (mut s Scanner) ident_char() string { lspos := token.Position{ line_nr: s.line_nr @@ -1365,6 +1388,7 @@ fn (mut s Scanner) ident_char() string { // set flags for advanced escapes first escaped_hex := s.expect('\\x', start + 1) escaped_unicode := s.expect('\\u', start + 1) + escaped_octal := !escaped_hex && !escaped_unicode && s.expect('\\', start + 1) // walk the string to get characters up to the next backtick for { @@ -1390,65 +1414,40 @@ fn (mut s Scanner) ident_char() string { return c } if len != 1 { + // the string inside the backticks is longer than one character + // but we might only have one rune... attempt to decode escapes // if the content expresses an escape code, it will have an even number of characters - // e.g. \x61 or \u2605 - if (c.len % 2 == 0) && (escaped_hex || escaped_unicode) { + // e.g. 
(octal) \141 (hex) \x61 or (unicode) \u2605 + // we don't handle binary escape codes in rune literals + orig := c + if (c.len % 2 == 0) && (escaped_hex || escaped_unicode || escaped_octal) { if escaped_unicode { + // there can only be one, so attempt to decode it now c = decode_u_escapes(c, 0, [0]) } else { - // we have to handle hex ourselves - ascii_0 := byte(0x30) - ascii_a := byte(0x61) - mut accumulated := []byte{} - val := c[2..c.len].to_lower() // 0A -> 0a - mut offset := 0 - // take two characters at a time, parse as hex and add to bytes - for { - if offset >= val.len - 1 { - break + // find escape sequence start positions + mut escapes_pos := []int{} + for i, v in c { + if v == `\\` { + escapes_pos << i } - mut byteval := byte(0) - big := val[offset] - little := val[offset + 1] - if !big.is_hex_digit() { - accumulated.clear() - break - } - if !little.is_hex_digit() { - accumulated.clear() - break - } - - if big.is_digit() { - byteval |= (big - ascii_0) << 4 - } else { - byteval |= (big - ascii_a + 10) << 4 - } - if little.is_digit() { - byteval |= (little - ascii_0) - } else { - byteval |= (little - ascii_a + 10) - } - - accumulated << byteval - offset += 2 } - if accumulated.len > 0 { - c = accumulated.bytestr() + if escaped_hex { + c = decode_h_escapes(c, 0, escapes_pos) + } else { + c = decode_o_escapes(c, 0, escapes_pos) } } } - // the string inside the backticks is longer than one character - // but we might only have one rune, say in the case u := c.runes() if u.len != 1 { if escaped_hex || escaped_unicode { - s.error('invalid character literal (escape sequence did not refer to a singular rune)') + s.error('invalid character literal `$orig` => `$c` ($u) (escape sequence did not refer to a singular rune)') } else { s.add_error_detail_with_pos('use quotes for strings, backticks for characters', lspos) - s.error('invalid character literal (more than one character)') + s.error('invalid character literal `$orig` => `$c` ($u) (more than one character)') } 
} } diff --git a/vlib/v/scanner/scanner_test.v b/vlib/v/scanner/scanner_test.v index 8cdae1a43b..b4076e7a43 100644 --- a/vlib/v/scanner/scanner_test.v +++ b/vlib/v/scanner/scanner_test.v @@ -150,13 +150,19 @@ fn test_ref_ref_array_ref_ref_foo() { assert result[6] == .name } -fn test_escape_string() { - // these assertions aren't helpful... - // they test the vlib built-in to the compiler, - // but we want to test this module before compilation - assert '\x61' == 'a' - assert '\x62' == 'b' - // assert `\x61` == `a` // will work after pull request goes through +fn test_escape_rune() { + // these lines work if the v compiler is working + // will not work until v compiler on github is updated + // assert `\x61` == `a` + // assert `\u0061` == `a` + + // will not work until PR is accepted + // assert `\141` == `a` + // assert `\xe2\x98\x85` == `★` + // assert `\342\230\205` == `★` + + // the following lines test the scanner module + // even before it is compiled into the v executable // SINGLE CHAR ESCAPES // SINGLE CHAR APOSTROPHE @@ -187,14 +193,30 @@ fn test_escape_string() { // SINGLE CHAR INCORRECT ESCAPE // result = scan_tokens(r'`\x61\x61`') // should always result in an error - // SINGLE CHAR MULTI-BYTE UTF-8 - // Compilation blocked by vlib/v/checker/check_types.v, but works in the repl - result = scan_tokens(r'`\xe29885`') + // SINGLE CHAR MULTI-BYTE UTF-8 (hex) + result = scan_tokens(r'`\xe2\x98\x85`') assert result[0].lit == r'★' + // SINGLE CHAR MULTI-BYTE UTF-8 (octal) + result = scan_tokens(r'`\342\230\205`') + assert result[0].lit == r'★' +} + +fn test_escape_string() { + // these lines work if the v compiler is working + assert '\x61' == 'a' + assert '\x62' == 'b' + assert '\u0061' == 'a' + assert '\141' == 'a' + assert '\xe2\x98\x85' == '★' + assert '\342\230\205' == '★' + + // the following lines test the scanner module + // even before it is compiled into the v executable + // STRING ESCAPES ================= // STRING APOSTROPHE - result = 
scan_tokens(r"'\''") + mut result := scan_tokens(r"'\''") assert result[0].kind == .string assert result[0].lit == r"\'"