regex: code cleaning, add more tests (#7402)

* added Crazywulf tests to the suite, removed deprecated repeat for init arrays * some cleanup in the code, fix typo in README.md
2020-12-19 00:32:57 +01:00 · 2020-12-19 00:32:57 +01:00 · 1e4b3a7b27
parent ff2cfd4f38
commit 1e4b3a7b27
3 changed files with 25 additions and 20 deletions
--- a/vlib/regex/README.md
+++ b/vlib/regex/README.md
@ -413,7 +413,7 @@ These functions are helpers to query the captured groups
 // get_group_bounds_by_name get a group boundaries by its name
 pub fn (re RE) get_group_bounds_by_name(group_name string) (int, int) 

-// get_group_by_name get a group boundaries by its name
+// get_group_by_name get a group string by its name
 pub fn (re RE) get_group_by_name(group_name string) string

 // get_group_by_id get a group boundaries by its id
--- a/vlib/regex/regex.v
+++ b/vlib/regex/regex.v
@ -1,6 +1,6 @@
 /*

-regex 0.9h
+regex 1.0 alpha

 Copyright (c) 2019-2020 Dario Deledda. All rights reserved.
 Use of this source code is governed by an MIT license
@ -280,7 +280,6 @@ pub const (
 	f_bin = 0x00000200  // work only on bytes, ignore utf-8

 	// behaviour modifier flags
-	//f_or  = 0x00010000  // the OR work with concatenation like PCRE
 	f_src = 0x00020000  // search mode enabled
 )

@ -334,7 +333,11 @@ fn (mut re RE) reset(){
 		re.prog[i].rep       = 0 // clear repetition of the token
 		i++
 	}
-	re.groups = [-1].repeat(re.group_count*2)
+
+	// init groups array
+	if re.group_count > 0 {
+		re.groups = []int{len: re.group_count*2, init: -1}
+	}

 	// reset group_csave
 	re.group_csave = []int{}
@ -723,7 +726,6 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
 		// single value {4}
 		if status == .min_parse && ch == `}` {
 			q_max = q_min
-
 			status = .greedy
 			continue
 		}
@ -731,7 +733,6 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
 		// end without max
 		if status == .comma_checked && ch == `}` {
 			q_max = max_quantifier
-
 			status = .greedy
 			continue
 		}
@ -900,8 +901,8 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {

 	// group management variables
 	mut group_count           := -1
-	mut group_stack           := [0 ].repeat(re.group_max_nested)
-	mut group_stack_txt_index := [-1].repeat(re.group_max_nested)
+	mut group_stack           := []int{len: re.group_max_nested, init: 0}
+	mut group_stack_txt_index := []int{len: re.group_max_nested, init: -1}
 	mut group_stack_index     := -1

 	re.query = in_txt      // save the query string
@ -987,7 +988,6 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {

 			pc = pc + 1
 			continue
-
 		}

 		// ist_group_end
@ -1566,8 +1566,6 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {

 	mut state_list := []StateObj{}

-	//mut group_stack      := [-1].repeat(re.group_max)
-	//mut group_data       := [-1].repeat(re.group_max)
 	mut group_stack := []int{len: re.group_max, init: -1}
 	mut group_data  := []int{len: re.group_max, init: -1}

@ -1677,7 +1675,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 		//******************************************
 		
 		if ist == ist_prog_end {
-			//println("HERE")
+			//println("HERE we end!")
 			break
 		}

@ -1719,9 +1717,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {

 						// continuous save, save until we have space
 						re.group_continuous_save(g_index)
-						
 					}
-
 					state.group_index--
 				}
 			}
@ -1968,7 +1964,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 						//println("Check [ist_simple_char] [${re.prog[chk_pc].ch}]==[${ch_t:c}] => $next_check_flag")
 					}

-					// char class IST 
+					// char_class
 					else if re.prog[chk_pc].ist == ist_char_class_pos || re.prog[chk_pc].ist == ist_char_class_neg {
 						mut cc_neg := false
 						if re.prog[chk_pc].ist == ist_char_class_neg {
@ -1993,7 +1989,6 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {

 				// check if we must continue or pass to the next IST
 				if next_check_flag == true {
-//				if	re.prog[state.pc].rep >= re.prog[state.pc].rep_max {
 					//println("save the state!!")
 					state_list << StateObj {
 						group_index: state.group_index
@ -2382,8 +2377,8 @@ Public functions
 [deprecated]
 pub fn regex(in_query string) (RE,int,int){
 	mut re := RE{}
-	re.prog = [Token{}].repeat(in_query.len+1)
-	re.cc = [CharClass{}].repeat(in_query.len+1)
+	re.prog = []Token    {len: in_query.len+1}
+	re.cc   = []CharClass{len: in_query.len+1}
 	re.group_max_nested = 8

 	re_err,err_pos := re.compile(in_query)
@ -2403,8 +2398,8 @@ pub fn new_regex_by_size(mult int) RE {
 }
 fn impl_new_regex_by_size(mult int) RE {
 	mut re := RE{}
-	re.prog = [Token{}].repeat(max_code_len*mult)       // max program length, default 256 istructions
-	re.cc = [CharClass{}].repeat(max_code_len*mult)     // char class list
+	re.prog = []Token    {len: max_code_len*mult}       // max program length, default 256 instructions
+	re.cc   = []CharClass{len: max_code_len*mult}       // char class list
 	re.group_max_nested = 3*mult                        // max nested group

 	return re
--- a/vlib/regex/regex_test.v
+++ b/vlib/regex/regex_test.v
@ -128,6 +128,16 @@ match_test_suite = [
 	TestItem{"[ an s. s! ]( wi4ki:something )", r"\[.*\]\( *(\w*:*\w+) *\)",0,31},
 	TestItem{"[ an s. s! ](wiki:something)", r"\[.*\]\( *(\w*:*\w+) *\)",0,28},
 	
+	// Crazywulf tests (?:^|[()])(\d+)(*)(\d+)(?:$|[()])
+    TestItem{"1*1", r"(\d+)([*])(\d+)",0,3},
+    TestItem{"+1*1", r"^(\d+)([*])(\d+)",-1,0},
+    TestItem{"*1*1", r"(?:^|[*])(\d+)([*])(\d+)",0,4},
+    TestItem{"*1*1", r"(?:^|[*()])(\d+)([*])(\d+)",0,4},
+    TestItem{")1*1", r"(?:^|[*()])(\d+)([*])(\d+)",0,4},
+    TestItem{"(1*1", r"(?:^|[*()])(\d+)([*])(\d+)",0,4},
+    TestItem{"*1*1(", r"(?:^|[*()])(\d+)([*])(\d+)(?:$|[*()])",0,5},
+    TestItem{" 1*1(", r"(?:^|[*()])(\d+)([*])(\d+)(?:$|[*()])",-1,0},
+    TestItem{"1*1 ", r"(?:^|[*()])(\d+)([*])(\d+)(?:$|[*()])",-1,0},
 ]
 )