From ebfacca252b620951266032863b27dd73f889061 Mon Sep 17 00:00:00 2001
From: Larpon <Larpon@users.noreply.github.com>
Date: Thu, 2 Dec 2021 10:19:12 +0100
Subject: [PATCH] toml: fix bug in unicode decoding (#12643)

---
 vlib/toml/decoder/decoder.v                    | 18 ++++++++++--------
 .../tests/alexcrichton.toml-rs-tests_test.v    |  1 -
 vlib/toml/tests/iarna.toml-spec-tests_test.v   |  4 +---
 3 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/vlib/toml/decoder/decoder.v b/vlib/toml/decoder/decoder.v
index cd5fe942da..ede7920e5d 100644
--- a/vlib/toml/decoder/decoder.v
+++ b/vlib/toml/decoder/decoder.v
@@ -159,16 +159,20 @@ pub fn decode_quoted_escapes(mut q ast.Quoted) ? {
 					&& byte(s.peek(3)).is_hex_digit() && byte(s.peek(4)).is_hex_digit()
 
 				if is_valid_short {
-					// is_valid_long := byte(s.peek(5)).is_hex_digit() && byte(s.peek(6)).is_hex_digit() && byte(s.peek(7)).is_hex_digit() && byte(s.peek(8)).is_hex_digit()
-					// Long type Unicode (\UXXXXXXXX) is a maximum of 10 chars: '\' + 'U' + 8 hex characters
+					is_valid_long := byte(s.peek(5)).is_hex_digit()
+						&& byte(s.peek(6)).is_hex_digit() && byte(s.peek(7)).is_hex_digit()
+						&& byte(s.peek(8)).is_hex_digit()
+					// If it's a long type Unicode (\UXXXXXXXX) with a maximum of 10 chars: '\' + 'U' + 8 hex characters
 					// we pass in 10 characters from the `u`/`U` which is the longest possible sequence
 					// of 9 chars plus one extra.
+					// Else it's a short sequence (\uXXXX) with a maximum of 6 chars: '\' + 'u' + 4 hex characters.
 					mut decoded := ''
 					mut sequence_length := 0
 					mut unicode_val := 0
-					if s.remaining() >= 10 {
+					mut slen := if is_valid_long { 10 } else { 6 }
+					if slen <= s.remaining() {
 						pos := s.state().pos
-						sequence := s.text[pos..pos + 11]
+						sequence := s.text[pos..pos + slen + 1]
 
 						decoded, unicode_val, sequence_length = decode_unicode_escape(sequence) or {
 							decoded_s += escape
@@ -184,11 +188,9 @@ pub fn decode_quoted_escapes(mut q ast.Quoted) ? {
 							decoded_s += escape
 							continue
 						}
-						if unicode_val in [0x7F, 0x1F, 0x5C, 0x75] {
-							sequence_length -= 2
-						}
 						decoded_s += decoded
-						s.skip_n(s.text[pos..pos + 2 + sequence_length + 1].len)
+						replacement := s.text[pos..pos + sequence_length + 1]
+						s.skip_n(replacement.len)
 						continue
 					} else {
 						pos := s.state().pos
diff --git a/vlib/toml/tests/alexcrichton.toml-rs-tests_test.v b/vlib/toml/tests/alexcrichton.toml-rs-tests_test.v
index 6299be500c..2a9f075394 100644
--- a/vlib/toml/tests/alexcrichton.toml-rs-tests_test.v
+++ b/vlib/toml/tests/alexcrichton.toml-rs-tests_test.v
@@ -20,7 +20,6 @@ const (
 	]
 
 	valid_value_exceptions = [
-		'valid/unicode-escape.toml',
 		// These have correct values, and should've passed, but the format of arrays is *mixed* in the JSON ??
 		'valid/example2.toml',
 	]
diff --git a/vlib/toml/tests/iarna.toml-spec-tests_test.v b/vlib/toml/tests/iarna.toml-spec-tests_test.v
index a399c8da19..7c6c3a2a38 100644
--- a/vlib/toml/tests/iarna.toml-spec-tests_test.v
+++ b/vlib/toml/tests/iarna.toml-spec-tests_test.v
@@ -19,9 +19,7 @@ const (
 	valid_exceptions       = []string{}
 	invalid_exceptions     = []string{}
 
-	valid_value_exceptions = [
-		'values/spec-string-basic.toml',
-	]
+	valid_value_exceptions = []string{}
 
 	yaml_value_exceptions  = [
 		'values/spec-float-5.toml', // YAML: "1e6", V: 1000000