regex: implement negation groups, more flexibility for bsls, small fixes (#12981)

* removed memory allocations in cleaning during clear calls * first test implementation of negative groups, more flexibility for bsls * fixed bsls failed tests * fmt * added \n to regex tests
2021-12-27 21:18:48 +01:00 · 2021-12-27 21:18:48 +01:00 · dadc965082
parent 14648fa41e
commit dadc965082
2 changed files with 248 additions and 38 deletions
--- a/vlib/regex/regex.v
+++ b/vlib/regex/regex.v
@ -17,30 +17,31 @@ module regex
 import strings

 pub const (
-	v_regex_version        = '1.0 alpha' // regex module version
+	v_regex_version          = '1.0 alpha' // regex module version

-	max_code_len           = 256 // default small base code len for the regex programs
-	max_quantifier         = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
+	max_code_len             = 256 // default small base code len for the regex programs
+	max_quantifier           = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
 	// spaces chars (here only westerns!!) TODO: manage all the spaces from unicode
-	spaces                 = [` `, `\t`, `\n`, `\r`, `\v`, `\f`]
+	spaces                   = [` `, `\t`, `\n`, `\r`, `\v`, `\f`]
 	// new line chars for now only '\n'
-	new_line_list          = [`\n`, `\r`]
+	new_line_list            = [`\n`, `\r`]

 	// Results
-	no_match_found         = -1
+	no_match_found           = -1

 	// Errors
-	compile_ok             = 0 // the regex string compiled, all ok
-	err_char_unknown       = -2 // the char used is unknow to the system
-	err_undefined          = -3 // the compiler symbol is undefined
-	err_internal_error     = -4 // Bug in the regex system!!
-	err_cc_alloc_overflow  = -5 // memory for char class full!!
-	err_syntax_error       = -6 // syntax error in regex compiling
-	err_groups_overflow    = -7 // max number of groups reached
-	err_groups_max_nested  = -8 // max number of nested group reached
-	err_group_not_balanced = -9 // group not balanced
-	err_group_qm_notation  = -10 // group invalid notation
-	err_invalid_or_with_cc = -11 // invalid or on two consecutive char class
+	compile_ok               = 0 // the regex string compiled, all ok
+	err_char_unknown         = -2 // the char used is unknown to the system
+	err_undefined            = -3 // the compiler symbol is undefined
+	err_internal_error       = -4 // Bug in the regex system!!
+	err_cc_alloc_overflow    = -5 // memory for char class full!!
+	err_syntax_error         = -6 // syntax error in regex compiling
+	err_groups_overflow      = -7 // max number of groups reached
+	err_groups_max_nested    = -8 // max number of nested group reached
+	err_group_not_balanced   = -9 // group not balanced
+	err_group_qm_notation    = -10 // group invalid notation
+	err_invalid_or_with_cc   = -11 // invalid or on two consecutive char class
+	err_neg_group_quantifier = -12 // negation groups can not have quantifier
 )

 const (
@ -198,6 +199,7 @@ pub fn (re RE) get_parse_error_string(err int) string {
 		regex.err_group_not_balanced { return 'err_group_not_balanced' }
 		regex.err_group_qm_notation { return 'err_group_qm_notation' }
 		regex.err_invalid_or_with_cc { return 'err_invalid_or_with_cc' }
+		regex.err_neg_group_quantifier { return 'err_neg_group_quantifier' }
 		else { return 'err_unknown' }
 	}
 }
@ -246,13 +248,15 @@ mut:
 	// validator function pointer
 	validator FnValidator
 	// groups variables
-	group_rep int // repetition of the group
+	group_neg bool // negation flag for the group: false => no negation, true => negation
+	group_rep int  // repetition of the group
 	group_id  int = -1 // id of the group
 	goto_pc   int = -1 // jump to this PC if is needed
 	// OR flag for the token
 	next_is_or bool // true if the next token is an OR
 	// dot_char token variables
-	dot_check_pc  int = -1 // pc of the next token to check
+	dot_check_pc  int = -1 // pc of the next token to check for dots
+	bsls_check_pc int = -1 // pc of the next token to check for bsls
 	last_dot_flag bool // if true indicate that is the last dot_char in the regex
 	// debug fields
 	source_index int
@ -333,7 +337,17 @@ fn (mut re RE) reset() {

 	// init groups array
 	if re.group_count > 0 {
-		re.groups = []int{len: re.group_count * 2, init: -1}
+		if re.groups.len == 0 {
+			// first run alloc memory
+			re.groups = []int{len: re.group_count * 2, init: -1}
+		} else {
+			// subsequent executions, only clean up the memory
+			i = 0
+			for i < re.groups.len {
+				re.groups[i] = -1
+				i++
+			}
+		}
 	}

 	// reset group_csave
@ -811,8 +825,8 @@ enum Group_parse_state {
 	finish
 }

-// parse_groups parse a group for ? (question mark) syntax, if found, return (error, capture_flag, name_of_the_group, next_index)
-fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
+// parse_groups parses a group for the ? (question mark) syntax and returns (error, capture_flag, negate_flag, name_of_the_group, next_index)
+fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, bool, string, int) {
 	mut status := Group_parse_state.start
 	mut i := in_i
 	mut name := ''
@ -836,10 +850,16 @@ fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
 			continue
 		}

+		// negate group
+		if status == .q_mark1 && ch == `!` {
+			i += char_len
+			return 0, false, true, name, i
+		}
+
 		// non capturing group
 		if status == .q_mark1 && ch == `:` {
 			i += char_len
-			return 0, false, name, i
+			return 0, false, false, name, i
 		}

 		// enter in P section
@ -852,7 +872,7 @@ fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
 		// not a valid q mark found
 		if status == .q_mark1 {
 			// println("NO VALID Q MARK")
-			return -2, true, name, i
+			return -2, true, false, name, i
 		}

 		if status == .p_status && ch == `<` {
@ -878,20 +898,20 @@ fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
 		// end name
 		if status == .p_in_name && ch == `>` {
 			i += char_len
-			return 0, true, name, i
+			return 0, true, false, name, i
 		}

 		// error on name group
 		if status == .p_in_name {
-			return -2, true, name, i
+			return -2, true, false, name, i
 		}

 		// normal group, nothig to do, exit
-		return 0, true, name, i
+		return 0, true, false, name, i
 	}
 	// UNREACHABLE
 	// println("ERROR!! NOT MEANT TO BE HERE!!1")
-	return -2, true, name, i
+	return -2, true, false, name, i
 }

 const (
@ -949,7 +969,8 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
 				return regex.err_groups_max_nested, i + 1
 			}

-			tmp_res, cgroup_flag, cgroup_name, next_i := re.parse_groups(in_txt, i)
+			tmp_res, cgroup_flag, negate_flag, cgroup_name, next_i := re.parse_groups(in_txt,
+				i)

 			// manage question mark format error
 			if tmp_res < -1 {
@ -984,6 +1005,12 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
 			re.prog[pc].rep_min = 1
 			re.prog[pc].rep_max = 1

+			// manage negation groups
+			if negate_flag == true {
+				re.prog[pc].group_neg = true
+				re.prog[pc].rep_min = 0 // the group may not be caught, but that is ok
+			}
+
 			// set the group id
 			if cgroup_flag == false {
 				// println("NO CAPTURE GROUP")
@ -1015,6 +1042,11 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
 			re.prog[goto_pc].goto_pc = pc // start goto point to the end group pc
 			// re.prog[goto_pc].group_id = group_count         // id of this group, used for storing data

+			if re.prog[goto_pc].group_neg == true {
+				re.prog[pc].group_neg = re.prog[goto_pc].group_neg
+				re.prog[pc].rep_min = re.prog[goto_pc].rep_min
+			}
+
 			pc = pc + 1
 			i = i + char_len
 			continue
@ -1050,6 +1082,12 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
 				char_next, char_next_len = re.get_char(in_txt, i + char_len)
 			}
 			mut quant_flag := true
+
+			// negation groups can not have quantifiers
+			if re.prog[pc - 1].group_neg == true && char_tmp in [`?`, `+`, `*`, `{`] {
+				return regex.err_neg_group_quantifier, i
+			}
+
 			match byte(char_tmp) {
 				`?` {
 					// println("q: ${char_tmp:c}")
@ -1215,6 +1253,7 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
 			dot_char_count++
 			mut pc2 := pc1 + 1
 			for pc2 < pc {
+				// consecutive dot chars are an error
 				if re.prog[pc2].ist == regex.ist_dot_char {
 					return regex.err_syntax_error, 0
 				}
@ -1246,6 +1285,49 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
 		}
 	}

+	//
+	// manage bsls_char
+	//
+
+	// find the checks for bsls, if any...
+	pc1 = 0
+	mut bsls_char_count := 0
+	mut last_bsls_char_pc := -1
+	for pc1 < pc {
+		if re.prog[pc1].ist == regex.ist_bsls_char {
+			// println("bsls_char pc: $pc1")
+			last_bsls_char_pc = pc1
+			bsls_char_count++
+			mut pc2 := pc1 + 1
+			for pc2 < pc {
+				if re.prog[pc2].ist !in [rune(regex.ist_prog_end), regex.ist_group_end,
+					regex.ist_group_start] {
+					// println("Next bsls check is PC: ${pc2}")
+					re.prog[pc1].bsls_check_pc = pc2
+					break
+				}
+				pc2++
+			}
+		}
+		pc1++
+	}
+
+	// println("last_bsls_char_pc: $last_bsls_char_pc")
+	if last_bsls_char_pc >= 0 {
+		pc1 = last_bsls_char_pc + 1
+		mut is_last_bsls := true
+		for pc1 < pc {
+			if re.prog[pc1].ist !in [rune(regex.ist_prog_end), regex.ist_group_end] {
+				is_last_bsls = false
+				break
+			}
+			pc1++
+		}
+		if is_last_bsls {
+			re.prog[last_bsls_char_pc].last_dot_flag = true
+		}
+	}
+
 	//******************************************

 	// OR branch
@ -1405,14 +1487,15 @@ pub fn (re RE) get_query() string {

 		// GROUP start
 		if ch == regex.ist_group_start {
-			if re.debug == 0 {
-				res.write_string('(')
-			} else {
-				if tk.group_id == -1 {
-					res.write_string('(?:') // non capturing group
-				} else {
-					res.write_string('#${tk.group_id}(')
-				}
+			if re.debug > 0 {
+				res.write_string('#$tk.group_id')
+			}
+			res.write_string('(')
+
+			if tk.group_neg == true {
+				res.write_string('?!') // negation group
+			} else if tk.group_id == -1 {
+				res.write_string('?:') // non capturing group
 			}

 			for x in re.group_map.keys() {
@ -1470,7 +1553,7 @@ pub fn (re RE) get_query() string {
 		}

 		// quantifier
-		if !(tk.rep_min == 1 && tk.rep_max == 1) {
+		if !(tk.rep_min == 1 && tk.rep_max == 1) && tk.group_neg == false {
 			if tk.rep_min == 0 && tk.rep_max == 1 {
 				res.write_string('?')
 			} else if tk.rep_min == 1 && tk.rep_max == regex.max_quantifier {
@ -2081,6 +2164,7 @@ pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) {
 				continue
 			}
 			// check bsls
+			/*
 			else if ist == regex.ist_bsls_char {
 				state.match_flag = false
 				tmp_res := re.prog[state.pc].validator(byte(ch))
@ -2103,6 +2187,101 @@ pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) {
 				m_state = .ist_quant_n
 				continue
 			}
+			*/
+			else if ist == regex.ist_bsls_char {
+				// println("ist_bsls_char rep: ${re.prog[state.pc].rep}")
+
+				// check next token to be false
+				mut next_check_flag := false
+
+				// if we have reached the max repetitions, drop the saved state and go on to the next token; like dot_char this is a dedicated case!!
+				if re.prog[state.pc].rep >= re.prog[state.pc].rep_max {
+					re.state_list.pop()
+					m_state = .ist_next
+					continue
+				}
+
+				if re.prog[state.pc].bsls_check_pc >= 0
+					&& re.prog[state.pc].rep >= re.prog[state.pc].rep_min {
+					// load the char
+					// ch_t, _ := re.get_charb(in_txt, state.i+char_len)
+					ch_t := ch
+					chk_pc := re.prog[state.pc].bsls_check_pc
+
+					// simple char
+					if re.prog[chk_pc].ist == regex.ist_simple_char {
+						if re.prog[chk_pc].ch == ch_t {
+							next_check_flag = true
+						}
+						// println("Check [ist_simple_char] [${re.prog[chk_pc].ch}]==[${ch_t:c}] => $next_check_flag")
+					}
+					// char char_class
+					else if re.prog[chk_pc].ist == regex.ist_char_class_pos
+						|| re.prog[chk_pc].ist == regex.ist_char_class_neg {
+						mut cc_neg := false
+						if re.prog[chk_pc].ist == regex.ist_char_class_neg {
+							cc_neg = true
+						}
+						mut cc_res := re.check_char_class(chk_pc, ch_t)
+
+						if cc_neg {
+							cc_res = !cc_res
+						}
+						next_check_flag = cc_res
+						// println("Check [ist_char_class] => $next_check_flag")
+					}
+					// check bsls
+					else if re.prog[chk_pc].ist == regex.ist_bsls_char {
+						next_check_flag = re.prog[chk_pc].validator(byte(ch_t))
+						// println("Check [ist_bsls_char] => $next_check_flag")
+					}
+				}
+
+				// check if we must continue or pass to the next IST
+				if next_check_flag == true && re.prog[state.pc + 1].ist != regex.ist_prog_end {
+					// println("save the state!!")
+					mut dot_state := StateObj{
+						group_index: state.group_index
+						match_flag: state.match_flag
+						match_index: state.match_index
+						first_match: state.first_match
+						pc: state.pc
+						i: state.i + char_len
+						char_len: char_len
+						last_dot_pc: state.pc
+					}
+					// if we are managing a .* stay on the same char on return
+					if re.prog[state.pc].rep_min == 0 {
+						dot_state.i -= char_len
+					}
+
+					re.state_list << dot_state
+
+					m_state = .ist_quant_n
+					// println("dot_char stack len: ${re.state_list.len}")
+					continue
+				}
+
+				tmp_res := re.prog[state.pc].validator(byte(ch))
+				if tmp_res == false {
+					m_state = .ist_quant_n
+					continue
+				}
+				// println("${ch} => ${tmp_res}")
+
+				state.match_flag = true
+				l_ist = u32(regex.ist_dot_char)
+
+				if state.first_match < 0 {
+					state.first_match = state.i
+				}
+				state.match_index = state.i
+				re.prog[state.pc].rep++ // increase repetitions
+
+				state.i += char_len
+				m_state = .ist_quant_p
+				continue
+			}
 			// simple char IST
 			else if ist == regex.ist_simple_char {
 				// println("ist_simple_char")
@ -2213,6 +2392,13 @@ pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) {
 				tmp_pc = re.group_data[state.group_index]
 			}

+			if re.prog[tmp_pc].group_neg == true {
+				// println("***** Negation of the group")
+				result = regex.no_match_found
+				m_state = .stop
+				continue
+			}
+
 			rep := re.prog[tmp_pc].group_rep

 			if rep < re.prog[tmp_pc].rep_min {
--- a/vlib/regex/regex_test.v
+++ b/vlib/regex/regex_test.v
@ -760,3 +760,27 @@ fn test_long_query() {
    //println("$start, $end")
    assert start >= 0 && end == base_string.len
 }
+
+
+struct Test_negation_group {
+	src string
+	res bool 
+}
+const(
+	negation_groups = [
+        Test_negation_group{'automobile',false},
+        Test_negation_group{'botomobile',true},
+        Test_negation_group{'auto_caravan',false},
+        Test_negation_group{'moto_mobile',true},
+        Test_negation_group{'pippole',true},
+        Test_negation_group{'boring test',false},
+    ]
+)
+fn test_negation_groups() {
+	mut query := r"(?!auto)\w+le"
+    mut re := regex.regex_opt(query) or { panic(err) }
+	for test in negation_groups {
+        start, end := re.match_string(test.src)
+        assert (start >= 0) ==  test.res
+    }
+}