regex: bug fix in replace using function, added tests (#9381)

2021-03-20 00:54:12 +01:00 · 2021-03-20 00:54:12 +01:00 · 59f95170b3
parent b0e225ac2d
commit 59f95170b3
2 changed files with 111 additions and 26 deletions
--- a/vlib/regex/regex_test.v
+++ b/vlib/regex/regex_test.v
@ -1,4 +1,5 @@
 import regex
 import rand
 /******************************************************************************
 *
@ -288,7 +289,7 @@ find_all_test_suite = [
 )
 const (
-	debug = false // true for debug println 
+	debug = true // true for debug println 
 )
 fn test_regex(){
@ -497,3 +498,35 @@ fn test_regex_func(){
 		assert false
 	}
 }
 // my_repl is the replace callback used by test_regex_func_replace.
 // For each of the capture groups 0, 1 and 2 it takes the first character of
 // the group text and appends an "X"; the three pieces are returned joined.
 // The start and end arguments are not used.
 fn my_repl(re regex.RE, in_txt string, start int, end int) string {
 	mut out := ""
 	for group_id in 0..3 {
 		// first character of the group, followed by the marker
 		out += re.get_group_by_id(in_txt,group_id)[0..1] + "X"
 	}
 	return out
 }
 // test_regex_func_replace checks replace_by_fn with a callback:
 // it builds three lines, each made of a matching text followed by a random
 // length slice of a filler sentence, and expects every match to be replaced
 // by the string produced by my_repl while the filler is left untouched.
 fn test_regex_func_replace(){
 	filler := "E il primo dei tre regni dell'Oltretomba cristiano visitato da Dante nel corso del viaggio, con la guida di Virgilio."
 	txt    := r'"content": "They dont necessarily flag "you will be buying these shares on margin!"", "channel_id"'
 	query := r'"(content":\s+")(.*)(, "channel_id")'
 	mut re := regex.regex_opt(query) or { panic(err) }

 	mut source   := ""
 	mut expected := ""
 	for _ in 0..3 {
 		// random tail length in the range 10..29
 		tail_len := int(10+rand.u32() % 20)
 		tail := filler[0..tail_len] + "\n"
 		source   += txt      + tail
 		expected += "cXTX,X" + tail
 	}

 	replaced := re.replace_by_fn(source, my_repl)
 	if debug {
 		eprintln(replaced)
 		eprintln(expected)
 	}
 	assert replaced == expected
 }
--- a/vlib/regex/regex_util.v
+++ b/vlib/regex/regex_util.v
@ -8,6 +8,7 @@ that can be found in the LICENSE file.
 */
 module regex
 import strings
 /******************************************************************************
 *
@ -71,7 +72,7 @@ pub fn (re RE) get_group_by_id(in_txt string, group_id int) string {
 // get_group_bounds_by_id gets the boundaries of a group by its id
 pub fn (re RE) get_group_bounds_by_id(group_id int) (int,int) {
-	if group_id < (re.groups.len >> 1) {
+	if group_id < re.group_count {
 		index := group_id << 1
 		return re.groups[index], re.groups[index + 1]
 	}
@ -146,7 +147,7 @@ pub fn (mut re RE) match_string(in_txt string) (int,int) {
 *
 ******************************************************************************/
 /*
-// find internal implementation
+// find internal implementation HERE for reference do not remove!!
 [direct_array_access]
 fn (mut re RE) find_imp(in_txt string) (int,int) {
 	old_flag := re.flag
@ -169,6 +170,9 @@ fn (mut re RE) find_imp(in_txt string) (int,int) {
 // find try to find the first match in the input string
 [direct_array_access]
 pub fn (mut re RE) find(in_txt string) (int,int) {
 	//old_flag := re.flag
 	//re.flag |= f_src  // enable search mode
 	mut i := 0
 	for i < in_txt.len {
 		//--- speed references ---
@ -183,18 +187,59 @@ pub fn (mut re RE) find(in_txt string) (int,int) {
 		//------------------------
 		if s >= 0 && e > s {
 			//println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}]")
 			//re.flag = old_flag
 			return i+s, i+e
 		} else {
 			i++
 		}
 	}
 	//re.flag = old_flag
 	return -1, -1
 }
 // find_from tries to find the first match in the input string, starting from the start index.
 // It returns the (start, end) indexes of the match relative to in_txt,
 // or (-1, -1) when start is negative or when no match is found.
 // re.flag is restored to its previous value on every return path.
 [direct_array_access]
 pub fn (mut re RE) find_from(in_txt string, start int) (int,int) {
 	// validate start BEFORE changing re.flag, otherwise the early return
 	// would leave the search mode flag set on the RE
 	if start < 0 {
 		return -1, -1
 	}
 	old_flag := re.flag
 	re.flag |= f_src  // enable search mode
 	mut i := start
 	for i < in_txt.len {
 		//--- speed references ---
 		mut s := -1
 		mut e := -1
 		unsafe {
 			// match on the tail of in_txt that starts at index i
 			tmp_str := tos(in_txt.str+i, in_txt.len-i)
 			s,e = re.match_string(tmp_str)
 		}
 		//------------------------
 		//s,e = re.find_imp(in_txt[i..])
 		//------------------------
 		if s >= 0 && e > s {
 			//println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}]")
 			re.flag = old_flag
 			// s and e are relative to the tail, shift them back to in_txt indexes
 			return i+s, i+e
 		} else {
 			i++
 		}
 	}
 	re.flag = old_flag
 	return -1, -1
 }
 // find_all find all the non overlapping occurrences of the match pattern
 [direct_array_access]
 pub fn (mut re RE) find_all(in_txt string) []int {
 	//old_flag := re.flag
 	//re.flag |= f_src  // enable search mode
 	mut i := 0
 	mut res := []int{}
 	mut ls := -1
@ -222,6 +267,7 @@ pub fn (mut re RE) find_all(in_txt string) []int {
 		}
 	}
 	//re.flag = old_flag
 	return res
 }
@ -295,34 +341,40 @@ pub type FnReplace = fn (re RE, in_txt string, start int, end int) string
 // replace_by_fn return a string where the matches are replaced with the string from the repl_fn callback function
 pub fn (mut re RE) replace_by_fn(in_txt string, repl_fn FnReplace) string {
 	mut i   := 0
-	mut res := ""
+	mut res := strings.new_builder(in_txt.len)
-	mut ls  := -1
+	mut last_end    := 0
 	mut s1  := 0
 	for i < in_txt.len {
-		s,e := re.find(in_txt[i..])
+		//println("Find Start. $i [${in_txt[i..]}]")
-		if s >= 0 && e > s && i+s > ls {
+		s, e := re.find_from(in_txt,i)
-			//println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}] ls:$ls")
+		//println("Find End.")
-			start := i + s
+		if s >= 0 && e > s  {
-			end   := i + e
+			//println("find match in: ${s},${e} [${in_txt[s..e]}]")
-			// update grups index diplacement
+			
-			mut gi := 0
+			if last_end < s {
-			for gi < re.groups.len {
+				res.write_string(in_txt[last_end..s])
 				re.groups[gi] += i
 				gi++
 			}
 			repl  := repl_fn(re, in_txt, start, end)
-			res += in_txt[s1..start] + repl
+			for g_i in 0..re.group_count {
-			s1 = end
+				re.groups[g_i << 1      ] += i
 				re.groups[(g_i << 1) + 1] += i
 			}
-			ls = i + s
+			repl := repl_fn(re, in_txt, s, e)
-			i  = i + e
+			//println("repl res: $repl")
-			continue
+			res.write_string(repl)
 			//res.write_string("[[${in_txt[s..e]}]]")
 			last_end = e
 			i = e
 		} else {
-			i++
+			break
 			//i++
 		}
 		//println(i)
 	}
-	res += in_txt[s1..]
+	if last_end >= 0 && last_end < in_txt.len {
-	return res
+		res.write_string(in_txt[last_end..])
 	}
 	return res.str()
 }