regex: add a find_all_str function (#7517)

2020-12-24 06:27:46 +01:00 · 2020-12-24 06:27:46 +01:00 · 2824e07baa
parent 36dcace0a7
commit 2824e07baa
5 changed files with 248 additions and 198 deletions
--- a/vlib/regex/README.md
+++ b/vlib/regex/README.md
@ -473,7 +473,7 @@ pub fn new() RE
 ```
 #### **Custom initialization**
-For some particular need it is possible initialize a fully customized regex:
+For some particular needs it is possible to initialize a fully manually customized regex:
 ```v ignore
 pattern = r"ab(.*)(ac)"
 // init custom regex
@ -484,6 +484,8 @@ re.cc   = []CharClass{len: pattern.len}     // can not be more char class the th
 re.group_csave_flag = false          // true enables continuous group saving if needed
 re.group_max_nested = 128            // set max 128 group nested possible
 re.group_max        = pattern.len>>1 // we can't have more groups than half of the pattern length
 re.group_stack = []int{len: re.group_max, init: -1}
 re.group_data  = []int{len: re.group_max, init: -1}
 ```
 ### Compiling
@ -494,22 +496,14 @@ After an initializer is used, the regex expression must be compiled with:
 pub fn (re mut RE) compile_opt(in_txt string) ?
 ```
-### Operative Functions
+### Matching Functions
-These are the operative functions
+These are the matching functions
 ```v ignore
 // match_string try to match the input string, return start and end index if found else start is -1
 pub fn (re mut RE) match_string(in_txt string) (int,int)
 // find try to find the first match in the input string, return start and end index if found else start is -1
 pub fn (re mut RE) find(in_txt string) (int,int)
 // find_all find all the "non overlapping" occurrences of the matching pattern, return a list of start end indexes
 pub fn (re mut RE) find_all(in_txt string) []int
 // replace return a string where the matches are replaced with the replace string, only non overlapped matches are used
 pub fn (re mut RE) replace(in_txt string, repl string) string
 ```
 ## Find and Replace
@ -519,13 +513,19 @@ There are the following find  and replace functions:
 #### Find functions
 ```v ignore
-// find try to find the first match in the input string, return start and end index if found else start is -1
+// find try to find the first match in the input string
 // return start and end index if found else start is -1
 pub fn (re mut RE) find(in_txt string) (int,int)
 // find_all find all the "non overlapping" occurrences of the matching pattern
 // return a list of start end indexes like: [3,4,6,8] 
 // the matches are [3,4] and [6,8]
 pub fn (re mut RE) find_all(in_txt string) []int
 // find_all_str find all the "non overlapping" occurrences of the matching pattern
 // return a list of strings
 // the result is like ["first match","second match"]
 pub fn (mut re RE) find_all_str(in_txt string) []string
 ```
 #### Replace functions
@ -543,10 +543,12 @@ The`replace_by_fn` use a custom replace function making possible customizations.
 The custom function must be of the type:
 ```v ignore
-// re RE struct
+// type of function used for custom replace
-// in_txt all the text passed to the regex expression
+// in_txt  source text
-// the match is: in_txt[start..end]
+// start   index of the start of the match in in_txt
-fn (re RE, in_txt string, start int, end int) string
+// end     index of the end   of the match in in_txt
 // --- the match is in in_txt[start..end] ---
 fn (re RE, in_txt string, start int, end int) string 
 ```
 The following example will clarify the use:
--- a/vlib/regex/regex.v
+++ b/vlib/regex/regex.v
@ -1554,14 +1554,14 @@ fn state_str(s Match_state) string {
 // StateObj groups the per-match state variables used by match_base
 // (accessed there as state.pc, state.group_index, ...)
 struct StateObj {
 pub mut:
-	group_index int = -1  // group id used to know how many groups are open
+	group_index int  = -1  // group id used to know how many groups are open
-	match_flag  bool
+	match_flag  bool       // indicates if we are in a match condition
-	match_index int = -1
+	match_index int  = -1  // index of the last match
-	first_match int = -1  //index of the first match
+	first_match int  = -1  // index of the first match
-	pc int = -1           // program counter
+	pc          int  = -1  // program counter
-	i  int = -1           // source string index
+	i           int  = -1  // source string index
-	char_len int
+	char_len    int        // last char length
-	last_dot_pc int = -1      // last dot chat pc
+	last_dot_pc int  = -1  // last dot char pc
 }
 [direct_array_access]
@ -1579,13 +1579,6 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 	mut ist   := rune(0)              // actual instruction
 	mut l_ist := rune(0)              // last matched instruction
 	//mut state_list := []StateObj{}
 	//mut group_stack := []int{len: re.group_max, init: -1}
 	//mut group_data  := []int{len: re.group_max, init: -1}
 	//mut group_index := -1           // group id used to know how many groups are open
 	mut step_count  := 0              // stats for debug
 	mut dbg_line    := 0              // count debug line printed
@ -1900,7 +1893,6 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 					//println("g.id: ${re.prog[state.pc].group_id} group_index: ${state.group_index}")
 					if state.group_index >= 0 && re.prog[state.pc].group_id >= 0 {
 	 					start_i   := re.group_stack[state.group_index]
 	 					//re.group_stack[state.group_index]=-1
 	 					// save group results
 						g_index := re.prog[state.pc].group_id*2
@ -1960,8 +1952,6 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 				// check next token to be false
 				mut next_check_flag := false
 				//if re.prog[state.pc].rep >= re.prog[state.pc].rep_min && 
 				// if we are done with max go on dot char are dedicated case!!
 				if	re.prog[state.pc].rep >= re.prog[state.pc].rep_max 
 				{
@ -2415,113 +2405,3 @@ pub fn (mut re RE) match_string(in_txt string) (int,int) {
 	}
 	return start, end
 }
 //
 // Finders
 //
 // find tries to find the first match in the input string.
 // It returns the start and end index of the match in in_txt, or
 // no_match_found and 0 when nothing (or only an empty span) is found.
 [direct_array_access]
 pub fn (mut re RE) find(in_txt string) (int,int) {
 	// remember the caller flags, they are restored before returning
 	old_flag := re.flag
 	re.flag |= f_src  // enable search mode
 	// the length passed is in_txt.len + 1, so end can come back one past the text
 	start, mut end := re.match_base(in_txt.str, in_txt.len + 1)
 	//print("Find [$start,$end] '${in_txt[start..end]}'")
 	// clamp end to the real length of the text
 	if end > in_txt.len {
 		end = in_txt.len
 	}
 	re.flag = old_flag
 	// only a non empty span counts as a match
 	if start >= 0 && end > start {
 		return start, end
 	}
 	return no_match_found, 0
 }
 // find_all finds all the non overlapping occurrences of the match pattern.
 // It returns a flat list of index pairs: [start0, end0, start1, end1, ...]
 [direct_array_access]
 pub fn (mut re RE) find_all(in_txt string) []int {
 	mut i := 0          // offset in in_txt where the next search begins
 	mut res := []int{}
 	mut ls := -1        // absolute start index of the last stored match
 	for i < in_txt.len {
 		// find works on the tail in_txt[i..], so s and e are relative to i
 		s,e := re.find(in_txt[i..])
 		if s >= 0 && e > s && i+s > ls {
 			//println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}] ls:$ls")
 			res << i+s
 			res << i+e
 			ls = i+s
 			// restart right after the match to keep the results non overlapping
 			i = i+e
 			continue
 		} else {
 			// no usable match from this offset, retry one position further
 			i++
 		}
 	}
 	return res
 }
 // replace returns a string where the matches are replaced with the replace string.
 // The matches are the non overlapping ones reported by find_all; when there
 // is no match in_txt itself is returned.
 pub fn (mut re RE) replace(in_txt string, repl string) string {
 	// pos is a flat list of [start, end] index pairs
 	pos := re.find_all(in_txt)
 	if pos.len > 0 {
 		mut res := ""
 		mut i := 0
 		mut s1 := 0           // start of the text not copied yet
 		mut e1 := in_txt.len
 		for i < pos.len {
 			// copy the text between the previous match and this one, then the replacement
 			e1 = pos[i]
 			res += in_txt[s1..e1] + repl
 			s1 = pos[i+1]
 			i += 2
 		}
 		// copy what is left after the last match
 		res += in_txt[s1..]
 		return res
 	}
 	return in_txt
 }
 // FnReplace is the type of the callback used by replace_by_fn:
 // it receives the regex, the source text and the start/end index of the match
 // in in_txt, and returns the string that replaces in_txt[start..end]
 pub type FnReplace = fn (re RE, in_txt string, start int, end int) string 
 // replace_by_fn returns a string where the matches are replaced with the string from the repl_fn callback function.
 // repl_fn is called once for every non overlapping match, with the regex,
 // the whole in_txt and the start/end index of the match in in_txt.
 pub fn (mut re RE) replace_by_fn(in_txt string, repl_fn FnReplace) string {
 	mut i := 0      // offset in in_txt where the next search begins
 	mut res := ""
 	mut ls := -1    // absolute start index of the last replaced match
 	mut s1 := 0     // start of the text not copied to res yet
 	//mut e1 := in_txt.len
 	for i < in_txt.len {
 		// find works on the tail in_txt[i..], so s and e are relative to i
 		s,e := re.find(in_txt[i..])
 		if s >= 0 && e > s && i+s > ls {
 			//println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}] ls:$ls")
 			start := i + s
 			end   := i + e
 			// update the groups index displacement: the group indexes are relative
 			// to in_txt[i..], shift them by i to make them valid for in_txt
 			// NOTE(review): every entry is shifted, negative ones included, while
 			// get_group_list tests re.groups[gi] >= 0 - confirm that unset groups
 			// can not end up looking like valid indexes here
 			mut gi := 0
 			for gi < re.groups.len {
 				re.groups[gi] += i
 				gi++
 			}
 			repl  := repl_fn(re, in_txt, start, end)
 			// copy the text before the match, then the replacement
 			res += in_txt[s1..start] + repl
 			s1 = end 
 			ls = i + s
 			i  = i + e
 			continue
 		} else {
 			// no usable match from this offset, retry one position further
 			i++
 		}
 	}
 	// copy what is left after the last match
 	res += in_txt[s1..]
 	return res
 }
--- a/vlib/regex/regex_opt.v
+++ b/vlib/regex/regex_opt.v
@ -17,9 +17,19 @@ pub fn (mut re RE) compile_opt(pattern string) ? {
 }
 // new creates a RE with buffers sized from max_code_len, usually sufficient for ordinary use
 [deprecated]
 pub fn new() RE {
-	return impl_new_regex_by_size(1)
+	// init regex
     mut re := regex.RE{}
     re.prog = []Token    {len: max_code_len + 1} // max program length, can not be longer than the pattern
     re.cc   = []CharClass{len: max_code_len}     // there can not be more char classes than the length of the pattern
     re.group_csave_flag = false                 // true enables continuous group saving, disabled here
     re.group_max_nested = 128                   // set max 128 group nested
     re.group_max        = max_code_len >> 1      // we can't have more groups than half of the pattern length
     re.group_stack = []int{len: re.group_max, init: -1}
 	re.group_data  = []int{len: re.group_max, init: -1}
 	return re
 }
 // new_regex_by_size create a RE of large size, mult specify the scale factor of the memory that will be allocated
--- a/vlib/regex/regex_test.v
+++ b/vlib/regex/regex_test.v
@ -144,29 +144,6 @@ match_test_suite = [
 ]
 )
 // TestItemFa is one find_all test case
 struct TestItemFa {
 	src string // text to search in
 	q string   // regex query
 	r []int    // expected find_all result: flat list of start,end index pairs
 }
 // test cases for find_all: source text, query, expected start,end index pairs
 const (
 match_test_suite_fa = [
 	// find_all tests
 	TestItemFa{
 		"oggi pippo è andato a casa di pluto ed ha trovato pippo",
 		r"p[iplut]+o",
 		[5, 10, 31, 36, 51, 56]
 	},
 	TestItemFa{
 		"oggi pibao è andato a casa di pbababao ed ha trovato pibabababao",
 		r"(pi?(ba)+o)",
 		[5, 10, 31, 39, 54, 65]
 	},
 ]
 )
 struct TestItemRe {
 	src string
 	q string
@ -174,7 +151,7 @@ struct TestItemRe {
 	r string
 }
 const (
-match_test_suite_re = [
+match_test_suite_replace = [
 	// replace tests
 	TestItemRe{
 		"oggi pibao è andato a casa di pbababao ed ha trovato pibabababao",
@ -241,12 +218,53 @@ cgroups_test_suite = [
 ]
 )
 // Test_find_all is one test case shared by the find_all and find_all_str checks
 struct Test_find_all {
 	src string       // text to search in
 	q string         // regex query
 	res []int        // expected find_all result, start,end index pairs: [0,4,5,6...]
 	res_str []string // expected find_all_str result: ['find0','find1'...]
 }
 // test cases for find_all and find_all_str:
 // source text, query, expected index pairs, expected matched strings
 const (
 find_all_test_suite = [
 	// runs of digits
 	Test_find_all{
 		"abcd 1234 efgh 1234 ghkl1234 ab34546df",
 		r"\d+",
 		[5, 9, 15, 19, 24, 28, 31, 36],
 		['1234', '1234', '1234', '34546']
 	},
 	// same text, query \a+ : the expected matches are the runs of lowercase letters
 	Test_find_all{
 		"abcd 1234 efgh 1234 ghkl1234 ab34546df",
 		r"\a+",
 		[0, 4, 10, 14, 20, 24, 29, 31, 36, 38],
 		['abcd', 'efgh', 'ghkl', 'ab', 'df']
 	},
 	// char class query on a text that contains a non ASCII char (è)
 	Test_find_all{
 		"oggi pippo è andato a casa di pluto ed ha trovato pippo",
 		r"p[iplut]+o",
 		[5, 10, 31, 36, 51, 56],
 		['pippo', 'pluto', 'pippo']
 	},
 	// query with nested groups
 	Test_find_all{
 		"oggi pibao è andato a casa di pbababao ed ha trovato pibabababao",
 		r"(pi?(ba)+o)",
 		[5, 10, 31, 39, 54, 65],
 		['pibao', 'pbababao', 'pibabababao']
 	},
 	// char class followed by \w+
 	Test_find_all{
 		"Today is a good day and tomorrow will be for sure.",
 		r"[Tt]o\w+",
 		[0, 5, 24, 32],
 		['Today', 'tomorrow']
 	}
 ]
 )
 // debug switches on the extra println output of the tests
 const (
 	debug = false // true for debug println 
 )
 fn test_regex(){
 	// check capturing groups
 	for c,to in cgroups_test_suite {
 		// debug print
@ -275,8 +293,8 @@ fn test_regex(){
 		if start != to.s || end != to.e {
 			//println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
-			println("ERROR!")
+			eprintln("ERROR!")
-			C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
+			//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
 			assert false
 			continue
 		}	
@ -284,7 +302,7 @@ fn test_regex(){
 		// check cgroups
 		if to.cgn.len > 0 {
 			if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] {
-				println("Capturing group len error! found: ${re.group_csave[0]} true ground: ${to.cg[0]}")
+				eprintln("Capturing group len error! found: ${re.group_csave[0]} true ground: ${to.cg[0]}")
 				assert false
 				continue
 			}
@ -293,7 +311,7 @@ fn test_regex(){
 			mut ln := re.group_csave[0]*3
 			for ln > 0 {
 				if re.group_csave[ln] != to.cg[ln] {
-					println("Capturing group failed on $ln item!")
+					eprintln("Capturing group failed on $ln item!")
 					assert false
 				}
 				ln--
@ -302,7 +320,7 @@ fn test_regex(){
 			// check named captured groups
 			for k in to.cgn.keys() {
 				if to.cgn[k] != (re.group_map[k]-1) { // we have -1 because the map not found is 0, in groups we start from 0 and we store using +1
-					println("Named capturing group error! [$k]")
+					eprintln("Named capturing group error! [$k]")
 					assert false
 					continue
 				}
@ -314,9 +332,9 @@ fn test_regex(){
 			}
 			for ln:=0; ln < re.groups.len; ln++ {
 				if re.groups[ln] != to.cg[ln] {
-					println("Capture group doesn't match:")
+					eprintln("Capture group doesn't match:")
-					println("true ground: [${to.cg}]")
+					eprintln("true ground: [${to.cg}]")
-					println("elaborated : [${re.groups}]")
+					eprintln("elaborated : [${re.groups}]")
 					assert false
 				}
 			} 
@ -324,9 +342,9 @@ fn test_regex(){
 	}
 	// check find_all
-	for c,to in match_test_suite_fa{
+	for c,to in find_all_test_suite {
 		// debug print
-		if debug { println("#$c [$to.src] q[$to.q] $to.r") }
+		if debug { println("#$c [$to.src] q[$to.q] ($to.res, $to.res_str)") }
 		mut re := regex.regex_opt(to.q) or {
 			eprintln('err: $err')
@ -334,25 +352,24 @@ fn test_regex(){
 			continue
 		}
 		re.reset()
 		res := re.find_all(to.src)
-		if res.len != to.r.len {
+		if res != to.res {
-			println("ERROR: find_all, array of different size.")
+			eprintln('err: find_all !!')
 			if debug { println("#$c exp: $to.res calculated: $res") }
 			assert false
 			continue
 		}
-		for c1,i in res {
+		res_str := re.find_all_str(to.src)
-			if i != to.r[c1] {
+		if res_str != to.res_str {
-				println("ERROR: find_all, different indexes.")
+			eprintln('err: find_all_str !!')
-				assert false
+			if debug { println("#$c exp: $to.res_str calculated: $res_str") }
-				continue
+			assert false
 			}
 		}
 	}
 	// check replace
-	for c,to in match_test_suite_re{
+	for c,to in match_test_suite_replace{
 		// debug print
 		if debug { println("#$c [$to.src] q[$to.q] $to.r") }
@ -364,7 +381,7 @@ fn test_regex(){
 		res := re.replace(to.src,to.rep)
 		if res != to.r {
-			println("ERROR: replace.")
+			eprintln("ERROR: replace.")
 			assert false
 			continue
 		}
@ -383,12 +400,12 @@ fn test_regex(){
 				continue
 			}
 			// q_str := re.get_query()
-			// println("Query: $q_str")
+			// eprintln("Query: $q_str")
 			start,end := re.find(to.src)
 			if start != to.s || end != to.e {
 				err_str := re.get_parse_error_string(start)
-				println("ERROR : $err_str start: ${start} end: ${end}")
+				eprintln("ERROR : $err_str start: ${start} end: ${end}")
 				assert false
 			} else {
 				//tmp_str := text[start..end]
@ -416,8 +433,8 @@ fn test_regex(){
 		}
 		if start != to.s || end != to.e {
-			println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
+			eprintln("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
-			println("ERROR!")
+			eprintln("ERROR!")
 			//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
 			assert false
 			continue
@ -427,7 +444,7 @@ fn test_regex(){
 		tmp_str1 := to.src.clone()
 		start1, end1 := re.match_string(tmp_str1)
 		if start1 != start || end1 != end {
-			println("two run ERROR!!")
+			eprintln("two run ERROR!!")
 			assert false
 			continue
 		}
--- a/vlib/regex/regex_util.v
+++ b/vlib/regex/regex_util.v
@ -117,6 +117,7 @@ pub fn (re RE) get_group_list() []Re_group {
 	mut res := []Re_group{len: re.groups.len >> 1}
 	mut gi := 0
 	//println("len: ${re.groups.len} groups: ${re.groups}")
 	for gi < re.groups.len {
 		if re.groups[gi] >= 0 {
 			txt_st := re.groups[gi]
@ -136,3 +137,143 @@ pub fn (re RE) get_group_list() []Re_group {
 	return res
 }
 /******************************************************************************
 *
 * Finders
 *
 ******************************************************************************/
 // find searches in_txt for the first occurrence of the compiled pattern.
 // It returns the start and end index of the match; when nothing is found
 // it returns no_match_found as start and 0 as end.
 [direct_array_access]
 pub fn (mut re RE) find(in_txt string) (int,int) {
 	// search mode is switched on only for this call, the caller flags are restored below
 	saved_flag := re.flag
 	re.flag |= f_src
 	// the length passed is in_txt.len + 1, so the reported end can be one past the text
 	start, raw_end := re.match_base(in_txt.str, in_txt.len + 1)
 	re.flag = saved_flag
 	// clamp the end index to the real length of the text
 	end := if raw_end > in_txt.len { in_txt.len } else { raw_end }
 	// a negative start or an empty span is reported as "not found"
 	if start < 0 || end <= start {
 		return no_match_found, 0
 	}
 	return start, end
 }
 // find_all collects every non overlapping occurrence of the pattern in in_txt.
 // The result is a flat list of index pairs: [start0, end0, start1, end1, ...]
 [direct_array_access]
 pub fn (mut re RE) find_all(in_txt string) []int {
 	mut matches := []int{}
 	mut pos := 0          // offset in in_txt where the next search begins
 	mut last_start := -1  // absolute start index of the last stored match
 	for pos < in_txt.len {
 		// find works on the tail of the text, its indexes are relative to pos
 		s, e := re.find(in_txt[pos..])
 		if s < 0 || e <= s || pos + s <= last_start {
 			// nothing usable from this offset, retry one position further
 			pos++
 			continue
 		}
 		last_start = pos + s
 		matches << last_start
 		matches << pos + e
 		// restart right after the match, this keeps the results non overlapping
 		pos += e
 	}
 	return matches
 }
 // find_all_str finds all the non overlapping occurrences of the match pattern
 // and returns them as a list of strings, in the order they appear in in_txt.
 // The search itself is delegated to find_all, so both functions always agree
 // on which matches are reported.
 [direct_array_access]
 pub fn (mut re RE) find_all_str(in_txt string) []string {
 	// bounds is a flat list of index pairs: [start0, end0, start1, end1, ...]
 	bounds := re.find_all(in_txt)
 	mut res := []string{}
 	for k := 0; k < bounds.len; k += 2 {
 		res << in_txt[bounds[k]..bounds[k+1]]
 	}
 	return res
 }
 /******************************************************************************
 *
 * Replacers
 *
 ******************************************************************************/
 // replace builds a copy of in_txt where every non overlapping match of the
 // pattern is substituted with repl; without matches in_txt is returned as it is
 pub fn (mut re RE) replace(in_txt string, repl string) string {
 	// flat list of [start, end] index pairs of the matches
 	pos := re.find_all(in_txt)
 	if pos.len == 0 {
 		return in_txt
 	}
 	mut res := ""
 	mut tail := 0 // start of the text not copied yet
 	for k := 0; k < pos.len; k += 2 {
 		// text between the previous match and this one, then the replacement
 		res += in_txt[tail..pos[k]] + repl
 		tail = pos[k+1]
 	}
 	// what is left after the last match
 	res += in_txt[tail..]
 	return res
 }
 // FnReplace is the type of function used for custom replace
 // re      regex that found the match
 // in_txt  source text
 // start   index of the start of the match in in_txt
 // end     index of the end   of the match in in_txt
 // the match is in in_txt[start..end]
 // the returned string is what replace_by_fn puts in place of the match
 pub type FnReplace = fn (re RE, in_txt string, start int, end int) string 
 // replace_by_fn returns a copy of in_txt where every non overlapping match is
 // replaced with the string returned by the repl_fn callback function.
 // repl_fn receives the regex, the whole in_txt and the start/end index of the
 // match in in_txt; before the call the indexes stored in re.groups are shifted
 // so that they refer to in_txt too.
 pub fn (mut re RE) replace_by_fn(in_txt string, repl_fn FnReplace) string {
 	mut i   := 0   // offset in in_txt where the next search begins
 	mut res := ""
 	mut ls  := -1  // absolute start index of the last replaced match
 	mut s1  := 0   // start of the text not copied to res yet
 	for i < in_txt.len {
 		// find works on the tail in_txt[i..], so s and e are relative to i
 		s,e := re.find(in_txt[i..])
 		if s >= 0 && e > s && i+s > ls {
 			start := i + s
 			end   := i + e
 			// the group indexes are relative to in_txt[i..]: shift them by i.
 			// Negative entries are left alone: get_group_list treats them as
 			// "group not set", shifting them would make them look like valid indexes
 			mut gi := 0
 			for gi < re.groups.len {
 				if re.groups[gi] >= 0 {
 					re.groups[gi] += i
 				}
 				gi++
 			}
 			repl  := repl_fn(re, in_txt, start, end)
 			// copy the text before the match, then the replacement
 			res += in_txt[s1..start] + repl
 			s1 = end
 			ls = start
 			i  = end
 			continue
 		} else {
 			// no usable match from this offset, retry one position further
 			i++
 		}
 	}
 	// copy what is left after the last match
 	res += in_txt[s1..]
 	return res
 }