regex: bug fixes (#7137)

2020-12-05 01:51:48 +01:00 · 2020-12-05 01:51:48 +01:00 · 4fb37e81b2
parent 89952edd25
commit 4fb37e81b2
4 changed files with 153 additions and 57 deletions
--- a/vlib/regex/regex.v
+++ b/vlib/regex/regex.v
@ -616,7 +616,7 @@ fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, rune) {
 				}
 			}
 			if status == .in_bsls {
-				println("CC bsls not found [${ch:c}]")
+				//println("CC bsls not found [${ch:c}]")
 				status = .in_char
 			}else {
 				continue
@ -1212,6 +1212,7 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {
 	// set the jump in the right places
 	pc1 = 0
 	for pc1 < pc-2 {
 		//println("Here $pc1 ${pc-2}")
 		// two consecutive OR are a syntax error
 		if re.prog[pc1+1].ist == ist_or_branch && re.prog[pc1+2].ist == ist_or_branch {
 			return err_syntax_error, i
@ -1238,8 +1239,13 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {
 					re.prog[pc1+1].rep_max = pc2 + 1
 					break
 				}
 				pc2++
 			}
 			// special case query of few chars, teh true can't go on the first instruction
 			if re.prog[pc1+1].rep_max == pc1 {
 				re.prog[pc1+1].rep_max = 3
 			}
 			//println("Compile OR postproc. [$pc1,OR ${pc1+1},$pc2]")
 			pc1 = pc2
 			continue
@ -1490,6 +1496,8 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 	mut ch       := rune(0)           // examinated char
 	mut char_len := 0                 // utf8 examinated char len
 	mut m_state  := Match_state.start // start point for the matcher FSM
 	mut src_end  := false
 	mut last_fnd_pc := -1
 	mut pc    := -1                   // program counter
 	mut state := StateObj{}           // actual state
@ -1599,9 +1607,14 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 			dbg_line++
 		}
 		//******************************************
-
+/*		if ist == ist_prog_end {
 			//println("HERE")
 			break
 		}
 */
 		// we're out of text, manage it
 		if i >= in_txt_len || m_state == .new_line {
 			src_end = true
 			// manage groups
 			if group_index >= 0 && state.match_index >= 0 {
@ -1644,11 +1657,29 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 				}
 			}
-			// manage ist_dot_char
+			if pc == -1 {
 				pc = last_fnd_pc
 			}
 			//println("Finished text!!")
 			//println("Instruction: ${ist:08x} pc: $pc")
 			//println("min_rep: ${re.prog[pc].rep_min} max_rep: ${re.prog[pc].rep_max} rep: ${re.prog[pc].rep}")
-			m_state = .end
+			// program end
-			break
+			if ist == ist_prog_end {
-			//return no_match_found,0
+				//println("Program end on end of text!")
 				return first_match,i
 			}
 			// if we go out of text and we are the last instruction .* check
 			if (re.prog[pc+1].ist == ist_prog_end) && 
 			(re.prog[pc].rep >= re.prog[pc].rep_min && re.prog[pc].rep <= re.prog[pc].rep_max) {
 				//println("Ok .* rep match!")
 				return first_match,i
 			}		
 			//m_state = .end
 			//break
 			return no_match_found,0
 		}
 		// starting and init
@ -1697,7 +1728,8 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 		if m_state == .stop {
 			// we are in search mode, don't exit until the end
-			if re.flag & f_src != 0 && ist != ist_prog_end {
+			if ((re.flag & f_src) != 0) && (ist != ist_prog_end) {
 				last_fnd_pc = pc
 				pc = -1
 				i += char_len
 				m_state = .ist_next
@ -1741,6 +1773,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 				// we have a DOT MATCH on going
 				//println("ist_prog_end l_ist: ${l_ist:08x}", l_ist)
 				if re.state_stack_index>=0 && l_ist == ist_dot_char {
 					i = in_txt_len // dario
 					m_state = .stop
 					continue
 				}				
@ -1832,7 +1865,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 					first_match = i
 				}
 				state.match_index = i
-				re.prog[pc].rep++
+				re.prog[pc].rep++ // increase repetitions
 				//if re.prog[pc].rep >= re.prog[pc].rep_min && re.prog[pc].rep <= re.prog[pc].rep_max {
 				if re.prog[pc].rep >= 0 && re.prog[pc].rep <= re.prog[pc].rep_max {
@ -1857,12 +1890,15 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 					re.state_stack[re.state_stack_index].i  = i + char_len
 				}
 				//i += char_len // next char
 /*
 				// manage * and {0,} quantifier
-				if re.prog[pc].rep_min > 0 {
+				if re.prog[pc].rep_max == max_quantifier {
-					i += char_len // next char
+					//println("manage .*")
-					l_ist = u32(ist_dot_char)
+					m_state = .ist_load
 					continue
 				}
-
+*/
 				m_state = .ist_next
 				continue
@ -2163,14 +2199,48 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 		return err_internal_error, i
 	}
 	//println("Check end of text!")
 	// Check the results
 	if state.match_index >= 0 {
 		if group_index < 0 {
-			//println("OK match,natural end [$first_match,$i]")
+			
-			return first_match, i
+			if re.prog[pc].ist == ist_prog_end {
 				//println("program ended!!")
 				if (re.flag & f_src) != 0 {
 					//println("find return")
 					return first_match, i
 				} else {
 					return 0, i
 				}
 			}
 			//println("No Group here, natural end [$first_match,$i] state: ${state_str(m_state)} ist: $ist pgr_end: $re.prog.len")
 			if re.prog[pc+1].ist == ist_prog_end || re.prog[pc].ist == ist_prog_end{
 				rep := re.prog[pc].rep
 				//println("rep: $rep re.prog[pc].rep_min: ${re.prog[pc].rep_min} re.prog[pc].rep_max: ${re.prog[pc].rep_max}")
 				if rep >= re.prog[pc].rep_min && rep <= re.prog[pc].rep_max {
 					return first_match, i
 				}
 				//println("Program not finished! ")
 				return no_match_found, 0
 			}
 			if src_end {
 				//println("program end")
 				return first_match, i
 			}
 			//print("No match found!!")
 			return no_match_found, 0
 		} else {
 			//println("Group match! OK")
 			//println("first_match: $first_match, i: $i")
 			//println("Skip last group")
-			return first_match,group_stack[group_index--]
+			return first_match,i
 			//return first_match,group_stack[group_index--]
 		}
 	}
 	//println("no_match_found, natural end")
@ -2224,7 +2294,12 @@ fn impl_new_regex_by_size(mult int) RE {
 //
 pub fn (mut re RE) match_string(in_txt string) (int,int) {
-	start, end := re.match_base(in_txt.str,in_txt.len)
+
 	start, mut end := re.match_base(in_txt.str, in_txt.len + 1)
 	if end > in_txt.len {
 		end = in_txt.len
 	}
 	if start >= 0 && end > start {
 		if (re.flag & f_ms) != 0 && start > 0 {
 			return no_match_found, 0
@ -2247,9 +2322,15 @@ pub fn (mut re RE) match_string(in_txt string) (int,int) {
 // find try to find the first match in the input string
 pub fn (mut re RE) find(in_txt string) (int,int) {
 	old_flag := re.flag
 	re.flag |= f_src  // enable search mode
-	start, end := re.match_base(in_txt.str, in_txt.len)
+	start, mut end := re.match_base(in_txt.str, in_txt.len + 1)
 	//print("Find [$start,$end] '${in_txt[start..end]}'")
 	if end > in_txt.len {
 		end = in_txt.len
 	}
 	re.flag = old_flag
 	if start >= 0 && end > start {
 		return start, end
 	}
--- a/vlib/regex/regex_opt.v
+++ b/vlib/regex/regex_opt.v
@ -7,11 +7,11 @@ pub fn (mut re RE) compile_opt(pattern string) ? {
 	if re_err != compile_ok {
 		mut err_msg := strings.new_builder(300)
-		err_str := re.get_parse_error_string(re_err)
+		err_msg.write("query: $pattern\n")
 		err_msg.write("$err_str\n")
 		err_msg.write("      query: $pattern\n")
 		line := "-".repeat(err_pos)
-		err_msg.write("    err pos: ${line}^")
+		err_msg.write("err  : ${line}^\n")
 		err_str := re.get_parse_error_string(re_err)
 		err_msg.write("ERROR: $err_str\n")
 		return error_with_code(err_msg.str(), re_err)
 	}
 }
--- a/vlib/regex/regex_test.v
+++ b/vlib/regex/regex_test.v
@ -14,6 +14,12 @@ struct TestItem {
 const(
 match_test_suite = [
 	// base OR
 	TestItem{"a",r"a|b",0,1},
 	TestItem{"a",r"b|a",0,1},
 	TestItem{"b",r"a|b",0,1},
 	TestItem{"b",r"b|a",0,1},
 	TestItem{"c",r"b|a",-1,0},
 	// positive
 	TestItem{"this is a good.",r"this",0,4},
@ -38,7 +44,7 @@ match_test_suite = [
 	TestItem{"this these those ",r"(th[eio]se? ?)+",0,17},
 	TestItem{"this these those ",r"(th[eio]se? )+",0,17},
 	TestItem{"this,these,those. over",r"(th[eio]se?[,. ])+",0,17},
-	TestItem{"soday,this,these,those. over",r"(th[eio]se?[,. ])+",6,23},
+	TestItem{"soday,this,these,those. over",r".+(th[eio]se?[,. ])+",0,23},
 	TestItem{"cpapaz",r"(c(pa)+z)",0,6},
 	TestItem{"this is a cpapaz over",r"(c(pa)+z)",10,16},
@ -60,7 +66,7 @@ match_test_suite = [
 	TestItem{"soday,this,these,those. over",r".*,(th[eio]se?[,. ])+",0,23},
 	TestItem{"soday,this,these,thesa.thesi over",r".*,(th[ei]se?[,. ])+(thes[ai][,. ])+",0,29},
 	TestItem{"cpapaz ole. pippo,",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,18},
-	TestItem{"cpapaz ole. pippo",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,17},
+	TestItem{"cpapaz ole. pippo",r"(c(pa)+z)(\s+\a+[\.,]?)+",0,17},
 	TestItem{"cpapaz ole. pippo, 852",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,18},
 	TestItem{"123cpapaz ole. pippo",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,20},
 	TestItem{"...cpapaz ole. pippo",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,20},
@ -74,7 +80,6 @@ match_test_suite = [
 	TestItem{"/home/us_er/pippo/info-01.txt", r"(/?[-\w_]+)*\.txt$",0,29}
 	// negative
 	TestItem{"zthis ciao",r"((t[hieo]+se?)\s*)+",-1,0},
 	TestItem{"this is a good.",r"thes",-1,0},
@ -88,6 +93,25 @@ match_test_suite = [
 	// check unicode
 	TestItem{"this is a Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ test",r".*a [Ⅰ-Ⅵ ]+",0,34},
 	TestItem{"123Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ test",r"[Ⅰ-Ⅴ\s]+",3,23},
 	// new edge cases
 	TestItem{"12345678", r"[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]",-1,0},
 	TestItem{"12345678", r"[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]",0,8},
 	TestItem{"123456789", r"^[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$",0,9}
 	TestItem{"12345678", r"^\d{8}$",0,8},
 	TestItem{"12345678", r"^\d{7}$",-1,0},
 	TestItem{"12345678", r"^\d{9}$",-1,0},
 	TestItem{"eth", r"(oth)|(eth)",0,3},
 	TestItem{"et", r"(oth)|(eth)",-1,0},
 	TestItem{"et", r".*(oth)|(eth)",-1,0},
 	TestItem{"peoth", r".*(ith)|(eth)",-1,0},
 	TestItem{"poth", r"(eth)|(oth)",1,4},
 	TestItem{"poth", r"(oth)|(eth)",1,4},
 	TestItem{"poth", r".(oth)|(eth)$",0,4},
 	TestItem{"poth", r"^.(oth)|(eth)$",0,4},
 	TestItem{"poth", r"^\w+$",0,4},
 ]
 )
@ -150,30 +174,35 @@ const (
 cgroups_test_suite = [
 	TestItemCGroup{
 		"http://www.ciao.mondo/hello/pippo12_/pera.html",
-		r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+",0,46,
+		r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+[\.|/])+",0,42,
-		[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46],
+		[7, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42],
 		{'format':int(0),'token':1}
 	},
 	TestItemCGroup{
 		"http://www.ciao.mondo/hello/pippo12_/pera.html",
 		r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+",0,46,
-		[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46],
+		[2, 0, 0, 4, 1, 7, 10],
 		{'format':int(0),'token':1}
 	},
 	TestItemCGroup{
 		"http://www.ciao.mondo/hello/pippo12_/pera.html",
-		r"(?P<format>https?)|(?P<format>ftps?)://([\w_]+.)+",0,46,
+		r"(?P<format>https?)|(?P<format>ftps?)://([\w_]+\.)+",0,16,
-		[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46],
+		[3, 0, 0, 4, 1, 7, 11, 1, 11, 16],
 		{'format':int(0)}
 	},
 ]
 )
 const (
 	debug = false // true for debug println 
 )
 fn test_regex(){
 	// check capturing groups
 	for c,to in cgroups_test_suite {
 		// debug print
-		//println("#$c [$to.src] q[$to.q] ($to.s, $to.e)")
+		if debug { println("#$c [$to.src] q[$to.q] ($to.s, $to.e)") }
 		mut re := regex.regex_opt(to.q) or {
 			eprintln('err: $err')
@ -191,16 +220,16 @@ fn test_regex(){
 		}
 		if start != to.s || end != to.e {
-			println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
+			//println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
 			println("ERROR!")
-			//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
+			C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
 			assert false
 			continue
 		}
 		// check cgroups
 		if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] {
-			println("Capturing group len error!")
+			println("Capturing group len error! ${re.group_csave[0]}")
 			assert false
 			continue
 		}
@ -225,9 +254,9 @@ fn test_regex(){
 	}
 	// check find_all
-	for _,to in match_test_suite_fa{
+	for c,to in match_test_suite_fa{
 		// debug print
-		//println("#$c [$to.src] q[$to.q] $to.r")
+		if debug { println("#$c [$to.src] q[$to.q] $to.r") }
 		mut re := regex.regex_opt(to.q) or {
 			eprintln('err: $err')
@ -253,9 +282,9 @@ fn test_regex(){
 	}
 	// check replace
-	for _,to in match_test_suite_re{
+	for c,to in match_test_suite_re{
 		// debug print
-		//println("#$c [$to.src] q[$to.q] $to.r")
+		if debug { println("#$c [$to.src] q[$to.q] $to.r") }
 		mut re := regex.regex_opt(to.q) or {
 			eprintln('err: $err')
@ -274,7 +303,7 @@ fn test_regex(){
 	// check match and find
 	for c,to in match_test_suite {
 		// debug print
-		println("#$c [$to.src] q[$to.q] $to.s $to.e")
+		if debug { println("#$c [$to.src] q[$to.q] $to.s $to.e") }
 		// test the find
 		if to.s > 0 {
@ -289,7 +318,7 @@ fn test_regex(){
 			if start != to.s || end != to.e {
 				err_str := re.get_parse_error_string(start)
-				println("ERROR : $err_str")
+				println("ERROR : $err_str start: ${start} end: ${end}")
 				assert false
 			} else {
 				//tmp_str := text[start..end]
@ -334,4 +363,7 @@ fn test_regex(){
 		}
 	}
 	if debug { println("DONE!") }
 }
--- a/vlib/regex/simple_const_regex_test.v
+++ b/vlib/regex/simple_const_regex_test.v
@ -1,17 +0,0 @@
 import regex
 const (
 	a_or_b = regex.regex_opt('a|b') ?
 )
 fn f(s string) bool {
 	mut re := a_or_b
 	start, _ := re.match_string(s)
 	return start != -1
 }
 fn test_const_regex_works() {
 	assert f('a') == true
 	assert f('b') == true
 	assert f('c') == false
 }