regex: implement negation groups, more flexibility for bsls, small fixes (#12981)

* removed memory allocations in cleaning during clear calls * first test implementation of negative groups, more flexibility for bsls * fixed bsls failed tests * fmt * added \n to regex tests
2021-12-27 21:18:48 +01:00 · 2021-12-27 21:18:48 +01:00 · dadc965082
parent 14648fa41e
commit dadc965082
2 changed files with 248 additions and 38 deletions
--- a/vlib/regex/regex.v
+++ b/vlib/regex/regex.v
@ -17,30 +17,31 @@ module regex
 import strings
 pub const (
-	v_regex_version        = '1.0 alpha' // regex module version
+	v_regex_version          = '1.0 alpha' // regex module version
-	max_code_len           = 256 // default small base code len for the regex programs
+	max_code_len             = 256 // default small base code len for the regex programs
-	max_quantifier         = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
+	max_quantifier           = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
 	// spaces chars (here only westerns!!) TODO: manage all the spaces from unicode
-	spaces                 = [` `, `\t`, `\n`, `\r`, `\v`, `\f`]
+	spaces                   = [` `, `\t`, `\n`, `\r`, `\v`, `\f`]
 	// new line chars for now only '\n'
-	new_line_list          = [`\n`, `\r`]
+	new_line_list            = [`\n`, `\r`]
 	// Results
-	no_match_found         = -1
+	no_match_found           = -1
 	// Errors
-	compile_ok             = 0 // the regex string compiled, all ok
+	compile_ok               = 0 // the regex string compiled, all ok
-	err_char_unknown       = -2 // the char used is unknow to the system
+	err_char_unknown         = -2 // the char used is unknow to the system
-	err_undefined          = -3 // the compiler symbol is undefined
+	err_undefined            = -3 // the compiler symbol is undefined
-	err_internal_error     = -4 // Bug in the regex system!!
+	err_internal_error       = -4 // Bug in the regex system!!
-	err_cc_alloc_overflow  = -5 // memory for char class full!!
+	err_cc_alloc_overflow    = -5 // memory for char class full!!
-	err_syntax_error       = -6 // syntax error in regex compiling
+	err_syntax_error         = -6 // syntax error in regex compiling
-	err_groups_overflow    = -7 // max number of groups reached
+	err_groups_overflow      = -7 // max number of groups reached
-	err_groups_max_nested  = -8 // max number of nested group reached
+	err_groups_max_nested    = -8 // max number of nested group reached
-	err_group_not_balanced = -9 // group not balanced
+	err_group_not_balanced   = -9 // group not balanced
-	err_group_qm_notation  = -10 // group invalid notation
+	err_group_qm_notation    = -10 // group invalid notation
-	err_invalid_or_with_cc = -11 // invalid or on two consecutive char class
+	err_invalid_or_with_cc   = -11 // invalid or on two consecutive char class
 	err_neg_group_quantifier = -12 // negation groups can not have quantifier
 )
 const (
@ -198,6 +199,7 @@ pub fn (re RE) get_parse_error_string(err int) string {
 		regex.err_group_not_balanced { return 'err_group_not_balanced' }
 		regex.err_group_qm_notation { return 'err_group_qm_notation' }
 		regex.err_invalid_or_with_cc { return 'err_invalid_or_with_cc' }
 		regex.err_neg_group_quantifier { return 'err_neg_group_quantifier' }
 		else { return 'err_unknown' }
 	}
 }
@ -246,13 +248,15 @@ mut:
 	// validator function pointer
 	validator FnValidator
 	// groups variables
-	group_rep int // repetition of the group
+	group_neg bool // negation flag for the group, 0 => no negation > 0 => negataion
 	group_rep int  // repetition of the group
 	group_id  int = -1 // id of the group
 	goto_pc   int = -1 // jump to this PC if is needed
 	// OR flag for the token
 	next_is_or bool // true if the next token is an OR
 	// dot_char token variables
-	dot_check_pc  int = -1 // pc of the next token to check
+	dot_check_pc  int = -1 // pc of the next token to check for dots
 	bsls_check_pc int = -1 // pc of the next token to check for bsls
 	last_dot_flag bool // if true indicate that is the last dot_char in the regex
 	// debug fields
 	source_index int
@ -333,7 +337,17 @@ fn (mut re RE) reset() {
 	// init groups array
 	if re.group_count > 0 {
-		re.groups = []int{len: re.group_count * 2, init: -1}
+		if re.groups.len == 0 {
 			// first run alloc memory
 			re.groups = []int{len: re.group_count * 2, init: -1}
 		} else {
 			// subsequent executions, only clean up the memory
 			i = 0
 			for i < re.groups.len {
 				re.groups[i] = -1
 				i++
 			}
 		}
 	}
 	// reset group_csave
@ -811,8 +825,8 @@ enum Group_parse_state {
 	finish
 }
-// parse_groups parse a group for ? (question mark) syntax, if found, return (error, capture_flag, name_of_the_group, next_index)
+// parse_groups parse a group for ? (question mark) syntax, if found, return (error, capture_flag, negate_flag, name_of_the_group, next_index)
-fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
+fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, bool, string, int) {
 	mut status := Group_parse_state.start
 	mut i := in_i
 	mut name := ''
@ -836,10 +850,16 @@ fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
 			continue
 		}
 		// negate group
 		if status == .q_mark1 && ch == `!` {
 			i += char_len
 			return 0, false, true, name, i
 		}
 		// non capturing group
 		if status == .q_mark1 && ch == `:` {
 			i += char_len
-			return 0, false, name, i
+			return 0, false, false, name, i
 		}
 		// enter in P section
@ -852,7 +872,7 @@ fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
 		// not a valid q mark found
 		if status == .q_mark1 {
 			// println("NO VALID Q MARK")
-			return -2, true, name, i
+			return -2, true, false, name, i
 		}
 		if status == .p_status && ch == `<` {
@ -878,20 +898,20 @@ fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
 		// end name
 		if status == .p_in_name && ch == `>` {
 			i += char_len
-			return 0, true, name, i
+			return 0, true, false, name, i
 		}
 		// error on name group
 		if status == .p_in_name {
-			return -2, true, name, i
+			return -2, true, false, name, i
 		}
 		// normal group, nothig to do, exit
-		return 0, true, name, i
+		return 0, true, false, name, i
 	}
 	// UNREACHABLE
 	// println("ERROR!! NOT MEANT TO BE HERE!!1")
-	return -2, true, name, i
+	return -2, true, false, name, i
 }
 const (
@ -949,7 +969,8 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
 				return regex.err_groups_max_nested, i + 1
 			}
-			tmp_res, cgroup_flag, cgroup_name, next_i := re.parse_groups(in_txt, i)
+			tmp_res, cgroup_flag, negate_flag, cgroup_name, next_i := re.parse_groups(in_txt,
 				i)
 			// manage question mark format error
 			if tmp_res < -1 {
@ -984,6 +1005,12 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
 			re.prog[pc].rep_min = 1
 			re.prog[pc].rep_max = 1
 			// manage negation groups
 			if negate_flag == true {
 				re.prog[pc].group_neg = true
 				re.prog[pc].rep_min = 0 // may be not catched, but it is ok
 			}
 			// set the group id
 			if cgroup_flag == false {
 				// println("NO CAPTURE GROUP")
@ -1015,6 +1042,11 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
 			re.prog[goto_pc].goto_pc = pc // start goto point to the end group pc
 			// re.prog[goto_pc].group_id = group_count         // id of this group, used for storing data
 			if re.prog[goto_pc].group_neg == true {
 				re.prog[pc].group_neg = re.prog[goto_pc].group_neg
 				re.prog[pc].rep_min = re.prog[goto_pc].rep_min
 			}
 			pc = pc + 1
 			i = i + char_len
 			continue
@ -1050,6 +1082,12 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
 				char_next, char_next_len = re.get_char(in_txt, i + char_len)
 			}
 			mut quant_flag := true
 			// negation groups can not have quantifiers
 			if re.prog[pc - 1].group_neg == true && char_tmp in [`?`, `+`, `*`, `{`] {
 				return regex.err_neg_group_quantifier, i
 			}
 			match byte(char_tmp) {
 				`?` {
 					// println("q: ${char_tmp:c}")
@ -1215,6 +1253,7 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
 			dot_char_count++
 			mut pc2 := pc1 + 1
 			for pc2 < pc {
 				// consecutive dot chars is an error
 				if re.prog[pc2].ist == regex.ist_dot_char {
 					return regex.err_syntax_error, 0
 				}
@ -1246,6 +1285,49 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
 		}
 	}
 	//
 	// manage bsls_char
 	//
 	// find the checks for bsls, if any...
 	pc1 = 0
 	mut bsls_char_count := 0
 	mut last_bsls_char_pc := -1
 	for pc1 < pc {
 		if re.prog[pc1].ist == regex.ist_bsls_char {
 			// println("bsls_char pc: $pc1")
 			last_bsls_char_pc = pc1
 			bsls_char_count++
 			mut pc2 := pc1 + 1
 			for pc2 < pc {
 				if re.prog[pc2].ist !in [rune(regex.ist_prog_end), regex.ist_group_end,
 					regex.ist_group_start] {
 					// println("Next bsls check is PC: ${pc2}")
 					re.prog[pc1].bsls_check_pc = pc2
 					break
 				}
 				pc2++
 			}
 		}
 		pc1++
 	}
 	// println("last_bsls_char_pc: $last_bsls_char_pc")
 	if last_bsls_char_pc >= 0 {
 		pc1 = last_bsls_char_pc + 1
 		mut is_last_bsls := true
 		for pc1 < pc {
 			if re.prog[pc1].ist !in [rune(regex.ist_prog_end), regex.ist_group_end] {
 				is_last_bsls = false
 				break
 			}
 			pc1++
 		}
 		if is_last_bsls {
 			re.prog[last_bsls_char_pc].last_dot_flag = true
 		}
 	}
 	//******************************************
 	// OR branch
@ -1405,14 +1487,15 @@ pub fn (re RE) get_query() string {
 		// GROUP start
 		if ch == regex.ist_group_start {
-			if re.debug == 0 {
+			if re.debug > 0 {
-				res.write_string('(')
+				res.write_string('#$tk.group_id')
-			} else {
+			}
-				if tk.group_id == -1 {
+			res.write_string('(')
-					res.write_string('(?:') // non capturing group
+
-				} else {
+			if tk.group_neg == true {
-					res.write_string('#${tk.group_id}(')
+				res.write_string('?!') // negation group
-				}
+			} else if tk.group_id == -1 {
 				res.write_string('?:') // non capturing group
 			}
 			for x in re.group_map.keys() {
@ -1470,7 +1553,7 @@ pub fn (re RE) get_query() string {
 		}
 		// quantifier
-		if !(tk.rep_min == 1 && tk.rep_max == 1) {
+		if !(tk.rep_min == 1 && tk.rep_max == 1) && tk.group_neg == false {
 			if tk.rep_min == 0 && tk.rep_max == 1 {
 				res.write_string('?')
 			} else if tk.rep_min == 1 && tk.rep_max == regex.max_quantifier {
@ -2081,6 +2164,7 @@ pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) {
 				continue
 			}
 			// check bsls
 			/*
 			else if ist == regex.ist_bsls_char {
 				state.match_flag = false
 				tmp_res := re.prog[state.pc].validator(byte(ch))
@ -2103,6 +2187,101 @@ pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) {
 				m_state = .ist_quant_n
 				continue
 			}
 			*/
 			else if ist == regex.ist_bsls_char {
 				// println("ist_bsls_char rep: ${re.prog[state.pc].rep}")
 				// check next token to be false
 				mut next_check_flag := false
 				// if we are done with max go on dot char are dedicated case!!
 				if re.prog[state.pc].rep >= re.prog[state.pc].rep_max {
 					re.state_list.pop()
 					m_state = .ist_next
 					continue
 				}
 				if re.prog[state.pc].bsls_check_pc >= 0
 					&& re.prog[state.pc].rep >= re.prog[state.pc].rep_min {
 					// load the char
 					// ch_t, _ := re.get_charb(in_txt, state.i+char_len)
 					ch_t := ch
 					chk_pc := re.prog[state.pc].bsls_check_pc
 					// simple char
 					if re.prog[chk_pc].ist == regex.ist_simple_char {
 						if re.prog[chk_pc].ch == ch_t {
 							next_check_flag = true
 						}
 						// println("Check [ist_simple_char] [${re.prog[chk_pc].ch}]==[${ch_t:c}] => $next_check_flag")
 					}
 					// char char_class
 					else if re.prog[chk_pc].ist == regex.ist_char_class_pos
 						|| re.prog[chk_pc].ist == regex.ist_char_class_neg {
 						mut cc_neg := false
 						if re.prog[chk_pc].ist == regex.ist_char_class_neg {
 							cc_neg = true
 						}
 						mut cc_res := re.check_char_class(chk_pc, ch_t)
 						if cc_neg {
 							cc_res = !cc_res
 						}
 						next_check_flag = cc_res
 						// println("Check [ist_char_class] => $next_check_flag")
 					}
 					// check bsls
 					else if re.prog[chk_pc].ist == regex.ist_bsls_char {
 						next_check_flag = re.prog[chk_pc].validator(byte(ch_t))
 						// println("Check [ist_bsls_char] => $next_check_flag")
 					}
 				}
 				// check if we must continue or pass to the next IST
 				if next_check_flag == true && re.prog[state.pc + 1].ist != regex.ist_prog_end {
 					// println("save the state!!")
 					mut dot_state := StateObj{
 						group_index: state.group_index
 						match_flag: state.match_flag
 						match_index: state.match_index
 						first_match: state.first_match
 						pc: state.pc
 						i: state.i + char_len
 						char_len: char_len
 						last_dot_pc: state.pc
 					}
 					// if we are mananging a .* stay on the same char on return
 					if re.prog[state.pc].rep_min == 0 {
 						dot_state.i -= char_len
 					}
 					re.state_list << dot_state
 					m_state = .ist_quant_n
 					// println("dot_char stack len: ${re.state_list.len}")
 					continue
 				}
 				tmp_res := re.prog[state.pc].validator(byte(ch))
 				if tmp_res == false {
 					m_state = .ist_quant_n
 					continue
 				}
 				// println("${ch} => ${tmp_res}")
 				state.match_flag = true
 				l_ist = u32(regex.ist_dot_char)
 				if state.first_match < 0 {
 					state.first_match = state.i
 				}
 				state.match_index = state.i
 				re.prog[state.pc].rep++ // increase repetitions
 				state.i += char_len
 				m_state = .ist_quant_p
 				continue
 			}
 			// simple char IST
 			else if ist == regex.ist_simple_char {
 				// println("ist_simple_char")
@ -2213,6 +2392,13 @@ pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) {
 				tmp_pc = re.group_data[state.group_index]
 			}
 			if re.prog[tmp_pc].group_neg == true {
 				// println("***** Negation of the group")
 				result = regex.no_match_found
 				m_state = .stop
 				continue
 			}
 			rep := re.prog[tmp_pc].group_rep
 			if rep < re.prog[tmp_pc].rep_min {
--- a/vlib/regex/regex_test.v
+++ b/vlib/regex/regex_test.v
@ -760,3 +760,27 @@ fn test_long_query() {
    //println("$start, $end")
    assert start >= 0 && end == base_string.len
 }
 struct Test_negation_group {
 	src string
 	res bool 
 }
 const(
 	negation_groups = [
        Test_negation_group{'automobile',false},
        Test_negation_group{'botomobile',true},
        Test_negation_group{'auto_caravan',false},
        Test_negation_group{'moto_mobile',true},
        Test_negation_group{'pippole',true},
        Test_negation_group{'boring test',false},
    ]
 )
 fn test_negation_groups() {
 	mut query := r"(?!auto)\w+le"
    mut re := regex.regex_opt(query) or { panic(err) }
 	for test in negation_groups {
        start, end := re.match_string(test.src)
        assert (start >= 0) ==  test.res
    }
 }