From 4fb37e81b2b9664695a3a373ccacf6dc68cd466e Mon Sep 17 00:00:00 2001 From: penguindark <57967770+penguindark@users.noreply.github.com> Date: Sat, 5 Dec 2020 01:51:48 +0100 Subject: [PATCH] regex: bug fixes (#7137) --- vlib/regex/regex.v | 117 ++++++++++++++++++++++----- vlib/regex/regex_opt.v | 8 +- vlib/regex/regex_test.v | 68 +++++++++++----- vlib/regex/simple_const_regex_test.v | 17 ---- 4 files changed, 153 insertions(+), 57 deletions(-) delete mode 100644 vlib/regex/simple_const_regex_test.v diff --git a/vlib/regex/regex.v b/vlib/regex/regex.v index 1be0eda102..1294cdf342 100644 --- a/vlib/regex/regex.v +++ b/vlib/regex/regex.v @@ -616,7 +616,7 @@ fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, rune) { } } if status == .in_bsls { - println("CC bsls not found [${ch:c}]") + //println("CC bsls not found [${ch:c}]") status = .in_char }else { continue @@ -1212,6 +1212,7 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) { // set the jump in the right places pc1 = 0 for pc1 < pc-2 { + //println("Here $pc1 ${pc-2}") // two consecutive OR are a syntax error if re.prog[pc1+1].ist == ist_or_branch && re.prog[pc1+2].ist == ist_or_branch { return err_syntax_error, i @@ -1238,8 +1239,13 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) { re.prog[pc1+1].rep_max = pc2 + 1 break } + pc2++ } + // special case query of few chars, the true can't go on the first instruction + if re.prog[pc1+1].rep_max == pc1 { + re.prog[pc1+1].rep_max = 3 + } //println("Compile OR postproc. 
[$pc1,OR ${pc1+1},$pc2]") pc1 = pc2 continue @@ -1490,6 +1496,8 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { mut ch := rune(0) // examinated char mut char_len := 0 // utf8 examinated char len mut m_state := Match_state.start // start point for the matcher FSM + mut src_end := false + mut last_fnd_pc := -1 mut pc := -1 // program counter mut state := StateObj{} // actual state @@ -1599,9 +1607,14 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { dbg_line++ } //****************************************** - +/* if ist == ist_prog_end { + //println("HERE") + break + } +*/ // we're out of text, manage it if i >= in_txt_len || m_state == .new_line { + src_end = true // manage groups if group_index >= 0 && state.match_index >= 0 { @@ -1644,11 +1657,29 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { } } - // manage ist_dot_char + if pc == -1 { + pc = last_fnd_pc + } + //println("Finished text!!") + //println("Instruction: ${ist:08x} pc: $pc") + //println("min_rep: ${re.prog[pc].rep_min} max_rep: ${re.prog[pc].rep_max} rep: ${re.prog[pc].rep}") + + // program end + if ist == ist_prog_end { + //println("Program end on end of text!") + return first_match,i + } - m_state = .end - break - //return no_match_found,0 + // if we go out of text and we are the last instruction .* check + if (re.prog[pc+1].ist == ist_prog_end) && + (re.prog[pc].rep >= re.prog[pc].rep_min && re.prog[pc].rep <= re.prog[pc].rep_max) { + //println("Ok .* rep match!") + return first_match,i + } + + //m_state = .end + //break + return no_match_found,0 } // starting and init @@ -1697,7 +1728,8 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { if m_state == .stop { // we are in search mode, don't exit until the end - if re.flag & f_src != 0 && ist != ist_prog_end { + if ((re.flag & f_src) != 0) && (ist != ist_prog_end) { + last_fnd_pc = pc pc = -1 i += char_len m_state = .ist_next @@ -1741,9 
+1773,10 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { // we have a DOT MATCH on going //println("ist_prog_end l_ist: ${l_ist:08x}", l_ist) if re.state_stack_index>=0 && l_ist == ist_dot_char { + i = in_txt_len // dario m_state = .stop continue - } + } re.state_stack_index = -1 m_state = .stop @@ -1832,7 +1865,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { first_match = i } state.match_index = i - re.prog[pc].rep++ + re.prog[pc].rep++ // increase repetitions //if re.prog[pc].rep >= re.prog[pc].rep_min && re.prog[pc].rep <= re.prog[pc].rep_max { if re.prog[pc].rep >= 0 && re.prog[pc].rep <= re.prog[pc].rep_max { @@ -1857,12 +1890,15 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { re.state_stack[re.state_stack_index].i = i + char_len } + //i += char_len // next char +/* // manage * and {0,} quantifier - if re.prog[pc].rep_min > 0 { - i += char_len // next char - l_ist = u32(ist_dot_char) + if re.prog[pc].rep_max == max_quantifier { + //println("manage .*") + m_state = .ist_load + continue } - +*/ m_state = .ist_next continue @@ -2163,14 +2199,48 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { return err_internal_error, i } + //println("Check end of text!") // Check the results if state.match_index >= 0 { if group_index < 0 { - //println("OK match,natural end [$first_match,$i]") - return first_match, i + + if re.prog[pc].ist == ist_prog_end { + //println("program ended!!") + + if (re.flag & f_src) != 0 { + //println("find return") + return first_match, i + } else { + return 0, i + } + } + + //println("No Group here, natural end [$first_match,$i] state: ${state_str(m_state)} ist: $ist pgr_end: $re.prog.len") + + if re.prog[pc+1].ist == ist_prog_end || re.prog[pc].ist == ist_prog_end{ + rep := re.prog[pc].rep + //println("rep: $rep re.prog[pc].rep_min: ${re.prog[pc].rep_min} re.prog[pc].rep_max: ${re.prog[pc].rep_max}") + if rep >= 
re.prog[pc].rep_min && rep <= re.prog[pc].rep_max { + return first_match, i + } + //println("Program not finished! ") + return no_match_found, 0 + } + if src_end { + //println("program end") + return first_match, i + } + //print("No match found!!") + return no_match_found, 0 + + } else { + //println("Group match! OK") + //println("first_match: $first_match, i: $i") + //println("Skip last group") - return first_match,group_stack[group_index--] + return first_match,i + //return first_match,group_stack[group_index--] } } //println("no_match_found, natural end") @@ -2224,7 +2294,12 @@ fn impl_new_regex_by_size(mult int) RE { // pub fn (mut re RE) match_string(in_txt string) (int,int) { - start, end := re.match_base(in_txt.str,in_txt.len) + + start, mut end := re.match_base(in_txt.str, in_txt.len + 1) + if end > in_txt.len { + end = in_txt.len + } + if start >= 0 && end > start { if (re.flag & f_ms) != 0 && start > 0 { return no_match_found, 0 @@ -2247,9 +2322,15 @@ pub fn (mut re RE) match_string(in_txt string) (int,int) { // find try to find the first match in the input string pub fn (mut re RE) find(in_txt string) (int,int) { old_flag := re.flag + re.flag |= f_src // enable search mode - start, end := re.match_base(in_txt.str, in_txt.len) + start, mut end := re.match_base(in_txt.str, in_txt.len + 1) + //print("Find [$start,$end] '${in_txt[start..end]}'") + if end > in_txt.len { + end = in_txt.len + } re.flag = old_flag + if start >= 0 && end > start { return start, end } diff --git a/vlib/regex/regex_opt.v b/vlib/regex/regex_opt.v index 4c756b8ee4..7af98491c8 100644 --- a/vlib/regex/regex_opt.v +++ b/vlib/regex/regex_opt.v @@ -7,11 +7,11 @@ pub fn (mut re RE) compile_opt(pattern string) ? 
{ if re_err != compile_ok { mut err_msg := strings.new_builder(300) - err_str := re.get_parse_error_string(re_err) - err_msg.write("$err_str\n") - err_msg.write(" query: $pattern\n") + err_msg.write("query: $pattern\n") line := "-".repeat(err_pos) - err_msg.write(" err pos: ${line}^") + err_msg.write("err : ${line}^\n") + err_str := re.get_parse_error_string(re_err) + err_msg.write("ERROR: $err_str\n") return error_with_code(err_msg.str(), re_err) } } diff --git a/vlib/regex/regex_test.v b/vlib/regex/regex_test.v index a175fc37b8..e074268eea 100644 --- a/vlib/regex/regex_test.v +++ b/vlib/regex/regex_test.v @@ -14,6 +14,12 @@ struct TestItem { const( match_test_suite = [ + // base OR + TestItem{"a",r"a|b",0,1}, + TestItem{"a",r"b|a",0,1}, + TestItem{"b",r"a|b",0,1}, + TestItem{"b",r"b|a",0,1}, + TestItem{"c",r"b|a",-1,0}, // positive TestItem{"this is a good.",r"this",0,4}, @@ -38,7 +44,7 @@ match_test_suite = [ TestItem{"this these those ",r"(th[eio]se? ?)+",0,17}, TestItem{"this these those ",r"(th[eio]se? )+",0,17}, TestItem{"this,these,those. over",r"(th[eio]se?[,. ])+",0,17}, - TestItem{"soday,this,these,those. over",r"(th[eio]se?[,. ])+",6,23}, + TestItem{"soday,this,these,those. over",r".+(th[eio]se?[,. ])+",0,23}, TestItem{"cpapaz",r"(c(pa)+z)",0,6}, TestItem{"this is a cpapaz over",r"(c(pa)+z)",10,16}, @@ -60,7 +66,7 @@ match_test_suite = [ TestItem{"soday,this,these,those. over",r".*,(th[eio]se?[,. ])+",0,23}, TestItem{"soday,this,these,thesa.thesi over",r".*,(th[ei]se?[,. ])+(thes[ai][,. ])+",0,29}, TestItem{"cpapaz ole. pippo,",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,18}, - TestItem{"cpapaz ole. pippo",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,17}, + TestItem{"cpapaz ole. pippo",r"(c(pa)+z)(\s+\a+[\.,]?)+",0,17}, TestItem{"cpapaz ole. pippo, 852",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,18}, TestItem{"123cpapaz ole. pippo",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,20}, TestItem{"...cpapaz ole. 
pippo",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,20}, @@ -74,7 +80,6 @@ match_test_suite = [ TestItem{"/home/us_er/pippo/info-01.txt", r"(/?[-\w_]+)*\.txt$",0,29} - // negative TestItem{"zthis ciao",r"((t[hieo]+se?)\s*)+",-1,0}, TestItem{"this is a good.",r"thes",-1,0}, @@ -88,6 +93,25 @@ match_test_suite = [ // check unicode TestItem{"this is a Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ test",r".*a [Ⅰ-Ⅵ ]+",0,34}, TestItem{"123Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ test",r"[Ⅰ-Ⅴ\s]+",3,23}, + + // new edge cases + TestItem{"12345678", r"[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]",-1,0}, + TestItem{"12345678", r"[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]",0,8}, + TestItem{"123456789", r"^[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$",0,9} + TestItem{"12345678", r"^\d{8}$",0,8}, + TestItem{"12345678", r"^\d{7}$",-1,0}, + TestItem{"12345678", r"^\d{9}$",-1,0}, + + TestItem{"eth", r"(oth)|(eth)",0,3}, + TestItem{"et", r"(oth)|(eth)",-1,0}, + TestItem{"et", r".*(oth)|(eth)",-1,0}, + TestItem{"peoth", r".*(ith)|(eth)",-1,0}, + + TestItem{"poth", r"(eth)|(oth)",1,4}, + TestItem{"poth", r"(oth)|(eth)",1,4}, + TestItem{"poth", r".(oth)|(eth)$",0,4}, + TestItem{"poth", r"^.(oth)|(eth)$",0,4}, + TestItem{"poth", r"^\w+$",0,4}, ] ) @@ -150,30 +174,35 @@ const ( cgroups_test_suite = [ TestItemCGroup{ "http://www.ciao.mondo/hello/pippo12_/pera.html", - r"(?Phttps?)|(?:ftps?)://(?P[\w_]+.)+",0,46, - [8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46], + r"(?Phttps?)|(?:ftps?)://(?P[\w_]+[\.|/])+",0,42, + [7, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42], {'format':int(0),'token':1} }, TestItemCGroup{ "http://www.ciao.mondo/hello/pippo12_/pera.html", r"(?Phttps?)|(?Pftps?)://(?P[\w_]+.)+",0,46, - [8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46], + [2, 0, 0, 4, 1, 7, 10], {'format':int(0),'token':1} }, TestItemCGroup{ "http://www.ciao.mondo/hello/pippo12_/pera.html", - r"(?Phttps?)|(?Pftps?)://([\w_]+.)+",0,46, - [8, 0, 0, 4, 1, 7, 11, 1, 11, 
16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46], + r"(?Phttps?)|(?Pftps?)://([\w_]+\.)+",0,16, + [3, 0, 0, 4, 1, 7, 11, 1, 11, 16], {'format':int(0)} }, ] ) +const ( + debug = false // true for debug println +) + fn test_regex(){ + // check capturing groups for c,to in cgroups_test_suite { // debug print - //println("#$c [$to.src] q[$to.q] ($to.s, $to.e)") + if debug { println("#$c [$to.src] q[$to.q] ($to.s, $to.e)") } mut re := regex.regex_opt(to.q) or { eprintln('err: $err') @@ -191,16 +220,16 @@ fn test_regex(){ } if start != to.s || end != to.e { - println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end") + //println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end") println("ERROR!") - //C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e) + C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e) assert false continue } // check cgroups if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] { - println("Capturing group len error!") + println("Capturing group len error! 
${re.group_csave[0]}") assert false continue } @@ -225,9 +254,9 @@ fn test_regex(){ } // check find_all - for _,to in match_test_suite_fa{ + for c,to in match_test_suite_fa{ // debug print - //println("#$c [$to.src] q[$to.q] $to.r") + if debug { println("#$c [$to.src] q[$to.q] $to.r") } mut re := regex.regex_opt(to.q) or { eprintln('err: $err') @@ -253,9 +282,9 @@ fn test_regex(){ } // check replace - for _,to in match_test_suite_re{ + for c,to in match_test_suite_re{ // debug print - //println("#$c [$to.src] q[$to.q] $to.r") + if debug { println("#$c [$to.src] q[$to.q] $to.r") } mut re := regex.regex_opt(to.q) or { eprintln('err: $err') @@ -274,7 +303,7 @@ fn test_regex(){ // check match and find for c,to in match_test_suite { // debug print - println("#$c [$to.src] q[$to.q] $to.s $to.e") + if debug { println("#$c [$to.src] q[$to.q] $to.s $to.e") } // test the find if to.s > 0 { @@ -289,7 +318,7 @@ fn test_regex(){ if start != to.s || end != to.e { err_str := re.get_parse_error_string(start) - println("ERROR : $err_str") + println("ERROR : $err_str start: ${start} end: ${end}") assert false } else { //tmp_str := text[start..end] @@ -334,4 +363,7 @@ fn test_regex(){ } } + if debug { println("DONE!") } + } + diff --git a/vlib/regex/simple_const_regex_test.v b/vlib/regex/simple_const_regex_test.v deleted file mode 100644 index 7ffe31c7e8..0000000000 --- a/vlib/regex/simple_const_regex_test.v +++ /dev/null @@ -1,17 +0,0 @@ -import regex - -const ( - a_or_b = regex.regex_opt('a|b') ? -) - -fn f(s string) bool { - mut re := a_or_b - start, _ := re.match_string(s) - return start != -1 -} - -fn test_const_regex_works() { - assert f('a') == true - assert f('b') == true - assert f('c') == false -}