From 4324be9bd8ec5788f6ca1c241167a2c1b55e623d Mon Sep 17 00:00:00 2001 From: penguindark <57967770+penguindark@users.noreply.github.com> Date: Sun, 13 Dec 2020 02:04:53 +0100 Subject: [PATCH] regex: fix for issue #7227, more tests (#7296) --- vlib/regex/regex.v | 79 +++++++++++++++++++++++++++++++++++------ vlib/regex/regex_test.v | 14 ++++++-- 2 files changed, 80 insertions(+), 13 deletions(-) diff --git a/vlib/regex/regex.v b/vlib/regex/regex.v index 44ea32420b..c54156094f 100644 --- a/vlib/regex/regex.v +++ b/vlib/regex/regex.v @@ -256,6 +256,9 @@ mut: // OR flag for the token next_is_or bool // true if the next token is an OR + + // last_dot flag + last_dot bool } [inline] @@ -1192,16 +1195,44 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) { // Post processing //****************************************** + // + // manage ist_dot_char + // // count ist_dot_char to set the size of the state stack mut pc1 := 0 mut tmp_count := 0 + mut last_dot_pc := -1 for pc1 < pc { if re.prog[pc1].ist == ist_dot_char { tmp_count++ + last_dot_pc = pc1 + //println("Found dot_char pc:[${last_dot_pc}]") } pc1++ } + // if exist set the last dot_char token to manage the last .* + if last_dot_pc >= 0 { + re.prog[last_dot_pc].last_dot = true + + mut last_dot_flag := true + mut tmp_pc := last_dot_pc + 1 + for tmp_pc < pc { + if re.prog[tmp_pc].ist !in [rune(ist_prog_end),ist_group_end] { + last_dot_flag = false + break + } + tmp_pc++ + } + re.prog[last_dot_pc].last_dot = last_dot_flag + //println("Our last dot flag pc: ${last_dot_pc} flag: ${last_dot_flag}") + } + + + //****************************************** + + + // init the state stack re.state_stack = []StateDotObj{len: tmp_count+1, init: StateDotObj{}} @@ -1325,6 +1356,12 @@ pub fn (re RE) get_code() string { res.write("?") } } + + // last dot char flag + if tk.last_dot == true { + res.write(" Last dot_char!") + } + res.write("\n") if stop_flag { break @@ -1502,7 +1539,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { mut pc := -1 // program counter mut state := StateObj{} // actual state mut ist := rune(0) // actual instruction - mut l_ist :=rune(0) // last matched instruction + mut l_ist := rune(0) // last matched instruction //mut group_stack := [-1].repeat(re.group_max) //mut group_data := [-1].repeat(re.group_max) @@ -1581,6 +1618,9 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { buf2.write("CHAR_CLASS_NEG[${re.get_char_class(pc)}]") } else if ist == ist_dot_char { buf2.write("DOT_CHAR") + if re.prog[pc].last_dot == true { + buf2.write(" Last dot_char!") + } } else if ist == ist_group_start { tmp_gi :=re.prog[pc].group_id tmp_gr := re.prog[re.prog[pc].goto_pc].group_rep @@ -1638,6 +1678,11 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { } re.groups[g_index+1] = i + // manage last dot_char + if l_ist == ist_dot_char && re.prog[pc].last_dot == true { + re.groups[g_index+1]-- + } + // continuous save, save until we have space if re.group_csave_index > 0 { // check if we have space to save the record @@ -1676,6 +1721,11 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { //println("Ok .* rep match!") return first_match,i } + + // manage last dot_char + if first_match >= 0 && l_ist == ist_dot_char && re.prog[pc].last_dot == true { + return first_match,i + } //m_state = .end //break @@ -1798,6 +1848,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { // check GROUP end else if ist == ist_group_end { // we are in matching streak + //println("Group END!! last ist: ${l_ist:08x}") if state.match_index >= 0 { // restore txt index stack and save the group data @@ -1816,9 +1867,14 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { re.groups[g_index+1] = i // if a group end with a dot, manage the not increased char index +/* if i == re.groups[g_index] { re.groups[g_index+1] = i+1 } +*/ + if l_ist == ist_dot_char { + re.groups[g_index+1] = i+1 + } //println("GROUP ${re.prog[pc].group_id} END [${re.groups[g_index]}, ${re.groups[g_index+1]}]") @@ -1873,8 +1929,8 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { state.match_index = i re.prog[pc].rep++ // increase repetitions - //if re.prog[pc].rep >= re.prog[pc].rep_min && re.prog[pc].rep <= re.prog[pc].rep_max { - if re.prog[pc].rep >= 0 && re.prog[pc].rep <= re.prog[pc].rep_max { + if re.prog[pc].rep >= re.prog[pc].rep_min && re.prog[pc].rep <= re.prog[pc].rep_max { + //if re.prog[pc].rep >= 0 && re.prog[pc].rep <= re.prog[pc].rep_max { //println("DOT CHAR save state : ${re.state_stack_index}") // save the state @@ -1893,18 +1949,21 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { } if re.prog[pc].rep >= 1 && re.state_stack_index >= 0 { + //println("Save state char index.") re.state_stack[re.state_stack_index].i = i + char_len } - //i += char_len // next char -/* - // manage * and {0,} quantifier - if re.prog[pc].rep_max == max_quantifier { - //println("manage .*") + // manage last dot char + if re.prog[pc].last_dot == true + && re.prog[pc].rep >= re.prog[pc].rep_min + && re.prog[pc].rep <= re.prog[pc].rep_max + { + //println("We are the last dot_char in the query") + i += char_len m_state = .ist_load continue } -*/ + m_state = .ist_next continue @@ -2078,7 +2137,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { } // ist_quant_pg => quantifier positive test on group - else if m_state == .ist_quant_pg { + else if m_state == .ist_quant_pg { //println(".ist_quant_pg") mut tmp_pc := pc if group_index >= 0 { diff --git a/vlib/regex/regex_test.v b/vlib/regex/regex_test.v index aa5d334bbf..25aea6a47e 100644 --- a/vlib/regex/regex_test.v +++ b/vlib/regex/regex_test.v @@ -112,6 +112,13 @@ match_test_suite = [ TestItem{"poth", r".(oth)|(eth)$",0,4}, TestItem{"poth", r"^.(oth)|(eth)$",0,4}, TestItem{"poth", r"^\w+$",0,4}, + + // test dot_char + TestItem{"8-11 l: qllllqllklhlvtl", r"^(\d+)-(\d+) ([a-z]): (.*)$",0,23}, + TestItem{"accccb deer", r"^a(.*)b d(.+)r",0,11}, + TestItem{"accccb deer", r"^a(.*)b d(.+)",0,11}, + TestItem{"accccb deer", r"^(.*)$",0,11}, + TestItem{"accccb deer", r"^a(.*)b d(.+)p",-1,0}, ] ) @@ -167,7 +174,7 @@ struct TestItemCGroup { q string s int e int - cg []int + cg []int // [number of items (3*# item), id_group_0, start_0, end_0, id_group_1, start1, start2,... ] cgn map[string]int } const ( @@ -181,7 +188,7 @@ cgroups_test_suite = [ TestItemCGroup{ "http://www.ciao.mondo/hello/pippo12_/pera.html", r"(?Phttps?)|(?Pftps?)://(?P[\w_]+.)+",0,46, - [2, 0, 0, 4, 1, 7, 10], + [8, 0, 0, 4, 1, 7, 12, 1, 11, 17, 1, 16, 23, 1, 22, 29, 1, 28, 38, 1, 37, 43, 1, 42, 46], {'format':int(0),'token':1} }, TestItemCGroup{ @@ -247,7 +254,7 @@ fn test_regex(){ // check cgroups if to.cgn.len > 0 { if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] { - println("Capturing group len error! ${re.group_csave[0]}") + println("Capturing group len error! found: ${re.group_csave[0]} true ground: ${to.cg[0]}") assert false continue } @@ -256,6 +263,7 @@ fn test_regex(){ mut ln := re.group_csave[0]*3 for ln > 0 { if re.group_csave[ln] != to.cg[ln] { + println("Capturing group failed on $ln item!") assert false } ln--