strings: add find_between_pair (#13468)

2022-02-15 14:12:15 +01:00 · 2022-02-15 14:12:15 +01:00 · 80444c8ec4
parent d25652fbcf
commit 80444c8ec4
2 changed files with 197 additions and 0 deletions
--- a/vlib/strings/strings.v
+++ b/vlib/strings/strings.v
@ -11,3 +11,118 @@ pub fn random(n int) string {
 	return tos(buf)
 }
 */
+
+// find_between_pair_byte extracts the text enclosed by the first *outermost* pair of
+// `start` and `end` marks found in `input`.
+// Unlike the `find_between`, `all_after*` and `all_before*` methods of the `string` type,
+// nested marks are tracked: every `start` byte opens a level, every `end` byte seen after
+// the first `start` closes one, and the content is returned as soon as all open levels
+// have been closed again.
+// The marks are expected to be *balanced*. An empty string is returned when `input` holds
+// no `start` mark, or when the opened levels are never closed completely.
+// Using the same byte for both `start` and `end` results in undefined output behavior.
+// find_between_pair_byte is the fastest in the find_between_pair_* family of functions.
+// Example: assert strings.find_between_pair_byte('(V) (NOT V)',`(`,`)`) == 'V'
+// Example: assert strings.find_between_pair_byte('s {X{Y}} s',`{`,`}`) == 'X{Y}'
+pub fn find_between_pair_byte(input string, start byte, end byte) string {
+	mut depth := 0
+	// byte index of the first content byte, -1 until the first `start` is seen
+	mut content_from := -1
+	for pos, ch in input {
+		if ch == start {
+			depth++
+			if content_from == -1 {
+				content_from = pos + 1
+			}
+			continue
+		}
+		// `end` marks only count once a `start` mark has opened a level
+		if content_from <= 0 || ch != end {
+			continue
+		}
+		depth--
+		if depth == 0 {
+			return input[content_from..pos]
+		}
+	}
+	return ''
+}
+
+// find_between_pair_rune extracts the text enclosed by the first *outermost* pair of
+// `start` and `end` marks found in `input`, comparing rune by rune.
+// Unlike the `find_between`, `all_after*` and `all_before*` methods of the `string` type,
+// nested marks are tracked: every `start` rune opens a level, every `end` rune seen after
+// the first `start` closes one, and the content is returned as soon as all open levels
+// have been closed again.
+// The marks are expected to be *balanced*. An empty string is returned when `input` holds
+// no `start` mark, or when the opened levels are never closed completely.
+// Using the same rune for both `start` and `end` results in undefined output behavior.
+// find_between_pair_rune is inbetween the fastest and slowest in the find_between_pair_* family of functions.
+// Example: assert strings.find_between_pair_rune('(V) (NOT V)',`(`,`)`) == 'V'
+// Example: assert strings.find_between_pair_rune('s {X{Y}} s',`{`,`}`) == 'X{Y}'
+pub fn find_between_pair_rune(input string, start rune, end rune) string {
+	chars := input.runes()
+	mut depth := 0
+	// rune index of the first content rune, -1 until the first `start` is seen
+	mut content_from := -1
+	for pos := 0; pos < chars.len; pos++ {
+		ch := chars[pos]
+		if ch == start {
+			depth++
+			if content_from == -1 {
+				content_from = pos + 1
+			}
+			continue
+		}
+		// `end` marks only count once a `start` mark has opened a level
+		if content_from <= 0 || ch != end {
+			continue
+		}
+		depth--
+		if depth == 0 {
+			return chars[content_from..pos].string()
+		}
+	}
+	return ''
+}
+
+// find_between_pair_string returns the string found between the pair of marks defined
+// by `start` and `end`.
+// As opposed to the `find_between`, `all_after*`, `all_before*` methods defined on the
+// `string` type, this function can extract content between *nested* marks in `input`.
+// If `start` and `end` marks are nested in `input`, the characters
+// between the *outermost* mark pair are returned. It is expected that `start` and `end`
+// marks are *balanced*, meaning that the amount of `start` marks equals the
+// amount of `end` marks in the `input`. An empty string is returned otherwise.
+// An empty string is also returned when `start` or `end` is itself an empty string,
+// since an empty mark can not delimit anything.
+// Using two identical marks as `start` and `end` results in undefined output behavior.
+// find_between_pair_string is the slowest in the find_between_pair_* function family.
+// Example: assert strings.find_between_pair_string('/*V*/ /*NOT V*/','/*','*/') == 'V'
+// Example: assert strings.find_between_pair_string('s {{X{{Y}}}} s','{{','}}') == 'X{{Y}}'
+pub fn find_between_pair_string(input string, start string, end string) string {
+	// An empty mark compares equal to the empty slice at every position. For an empty
+	// `start` the loop below would step back onto the same index forever, so bail out
+	// before scanning.
+	if start.len == 0 || end.len == 0 {
+		return ''
+	}
+	mut start_index := -1
+	mut marks := 0
+	start_runes := start.runes()
+	end_runes := end.runes()
+	runes := input.runes()
+	mut i := 0
+	for ; i < runes.len; i++ {
+		// gated slicing (`#[..]`) is used so that a window reaching past the end of
+		// `runes` does not panic
+		start_slice := runes#[i..i + start_runes.len]
+		if start_slice == start_runes {
+			// jump to the last rune of the mark, the loop increment moves past it
+			i = i + start_runes.len - 1
+			if start_index < 0 {
+				// the content starts right after the first `start` mark
+				start_index = i + 1
+			}
+			marks++
+			continue
+		}
+		// `end` marks are only counted once a `start` mark has been seen
+		if start_index > 0 {
+			end_slice := runes#[i..i + end_runes.len]
+			if end_slice == end_runes {
+				marks--
+				if marks == 0 {
+					return runes[start_index..i].string()
+				}
+				// skip the rest of a nested `end` mark
+				i = i + end_runes.len - 1
+				continue
+			}
+		}
+	}
+	return ''
+}
--- a/vlib/strings/strings_test.v
+++ b/vlib/strings/strings_test.v
@ -12,3 +12,85 @@ fn test_repeat_string() {
 	assert strings.repeat_string('abc', 0) == ''
 	assert strings.repeat_string('', 200) == ''
 }
+
+// test_rune_and_byte holds the inputs passed to find_between_pair_rune and
+// find_between_pair_byte with `[` and `]` as marks in test_find_between_pair_family.
+// Entry `i` is checked against expected_rune_and_byte_outputs[i], so both lists
+// have to be kept in the same order and of the same length.
+// The last entry is a multi-line string literal: the line break and the
+// indentation of its second line are part of the input.
+const test_rune_and_byte = [
+	'xxx[ok1]xxx',
+	'xxx[[ok2]okok]',
+	'xxx[ok3[[[ok]okok]]]',
+	'yyy[ok4]',
+	'[]',
+	']',
+	'[',
+	'yyy[ok5][]zzz',
+	'yyy[xxx',
+	'xxx[xxx
+	xxx]',
+]
+
+// test_strings holds the inputs passed to find_between_pair_string with `/*` and
+// `*/` as marks in test_find_between_pair_family.
+// Entry `i` is checked against expected_string_outputs[i], so both lists have to
+// be kept in the same order and of the same length.
+// The last entry is a multi-line string literal: the line break and the
+// indentation of its second line are part of the input.
+const test_strings = [
+	'xxx/*ok1*/xxx',
+	'xxx/*/*ok2*/okok*/',
+	'xxx/*ok3/*/*/*ok*/okok*/*/*/',
+	'yyy/*ok4*/',
+	'/**/',
+	'*/',
+	'/*',
+	'yyy/*ok5*//**/zzz',
+	'yyy/*xxx',
+	'xxx/*xxx
+	xxx*/xxx',
+]
+
+// expected_rune_and_byte_outputs lists the results expected from
+// find_between_pair_rune and find_between_pair_byte (marks `[` and `]`) for the
+// input at the same index in test_rune_and_byte.
+// An empty string is expected for the inputs with empty content, without a `[`,
+// or with an unclosed `[`.
+const expected_rune_and_byte_outputs = [
+	'ok1',
+	'[ok2]okok',
+	'ok3[[[ok]okok]]',
+	'ok4',
+	'',
+	'',
+	'',
+	'ok5',
+	'',
+	'xxx
+	xxx',
+]
+
+// expected_string_outputs lists the results expected from find_between_pair_string
+// (marks `/*` and `*/`) for the input at the same index in test_strings.
+// An empty string is expected for the inputs with empty content, without a `/*`,
+// or with an unclosed `/*`.
+const expected_string_outputs = [
+	'ok1',
+	'/*ok2*/okok',
+	'ok3/*/*/*ok*/okok*/*/',
+	'ok4',
+	'',
+	'',
+	'',
+	'ok5',
+	'',
+	'xxx
+	xxx',
+]
+
+// test_find_between_pair_family checks the three find_between_pair_* functions, first
+// with a few direct calls and then against the input/expected tables declared above.
+fn test_find_between_pair_family() {
+	assert strings.find_between_pair_rune('xx♡ok❦yy', `♡`, `❦`) == 'ok'
+	assert strings.find_between_pair_byte('xx{ok}yy', `{`, `}`) == 'ok'
+	assert strings.find_between_pair_string('xx/*ok*/yy', '/*', '*/') == 'ok'
+	assert strings.find_between_pair_string('xxxxokyyyy', 'xxx', 'yyy') == 'xok'
+
+	// the tables are paired by index; fail with a clear message instead of an
+	// out of range access, when they get out of sync
+	assert test_rune_and_byte.len == expected_rune_and_byte_outputs.len
+	assert test_strings.len == expected_string_outputs.len
+
+	for i, tstr in test_rune_and_byte {
+		got := strings.find_between_pair_rune(tstr, `[`, `]`)
+		want := expected_rune_and_byte_outputs[i]
+		assert got == want
+	}
+
+	for i, tstr in test_rune_and_byte {
+		got := strings.find_between_pair_byte(tstr, `[`, `]`)
+		want := expected_rune_and_byte_outputs[i]
+		assert got == want
+	}
+
+	for i, tstr in test_strings {
+		got := strings.find_between_pair_string(tstr, '/*', '*/')
+		want := expected_string_outputs[i]
+		assert got == want
+	}
+}