regex: bug fixes (#7137)

pull/7138/head
penguindark 2020-12-05 01:51:48 +01:00 committed by GitHub
parent 89952edd25
commit 4fb37e81b2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 153 additions and 57 deletions

View File

@ -616,7 +616,7 @@ fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, rune) {
} }
} }
if status == .in_bsls { if status == .in_bsls {
println("CC bsls not found [${ch:c}]") //println("CC bsls not found [${ch:c}]")
status = .in_char status = .in_char
}else { }else {
continue continue
@ -1212,6 +1212,7 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {
// set the jump in the right places // set the jump in the right places
pc1 = 0 pc1 = 0
for pc1 < pc-2 { for pc1 < pc-2 {
//println("Here $pc1 ${pc-2}")
// two consecutive OR are a syntax error // two consecutive OR are a syntax error
if re.prog[pc1+1].ist == ist_or_branch && re.prog[pc1+2].ist == ist_or_branch { if re.prog[pc1+1].ist == ist_or_branch && re.prog[pc1+2].ist == ist_or_branch {
return err_syntax_error, i return err_syntax_error, i
@ -1238,8 +1239,13 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {
re.prog[pc1+1].rep_max = pc2 + 1 re.prog[pc1+1].rep_max = pc2 + 1
break break
} }
pc2++ pc2++
} }
// special case query of few chars, the true can't go on the first instruction
if re.prog[pc1+1].rep_max == pc1 {
re.prog[pc1+1].rep_max = 3
}
//println("Compile OR postproc. [$pc1,OR ${pc1+1},$pc2]") //println("Compile OR postproc. [$pc1,OR ${pc1+1},$pc2]")
pc1 = pc2 pc1 = pc2
continue continue
@ -1490,6 +1496,8 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
mut ch := rune(0) // examined char mut ch := rune(0) // examined char
mut char_len := 0 // utf8 examined char len mut char_len := 0 // utf8 examined char len
mut m_state := Match_state.start // start point for the matcher FSM mut m_state := Match_state.start // start point for the matcher FSM
mut src_end := false
mut last_fnd_pc := -1
mut pc := -1 // program counter mut pc := -1 // program counter
mut state := StateObj{} // actual state mut state := StateObj{} // actual state
@ -1599,9 +1607,14 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
dbg_line++ dbg_line++
} }
//****************************************** //******************************************
/* if ist == ist_prog_end {
//println("HERE")
break
}
*/
// we're out of text, manage it // we're out of text, manage it
if i >= in_txt_len || m_state == .new_line { if i >= in_txt_len || m_state == .new_line {
src_end = true
// manage groups // manage groups
if group_index >= 0 && state.match_index >= 0 { if group_index >= 0 && state.match_index >= 0 {
@ -1644,11 +1657,29 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
} }
} }
// manage ist_dot_char if pc == -1 {
pc = last_fnd_pc
}
//println("Finished text!!")
//println("Instruction: ${ist:08x} pc: $pc")
//println("min_rep: ${re.prog[pc].rep_min} max_rep: ${re.prog[pc].rep_max} rep: ${re.prog[pc].rep}")
m_state = .end // program end
break if ist == ist_prog_end {
//return no_match_found,0 //println("Program end on end of text!")
return first_match,i
}
// if we go out of text and we are the last instruction .* check
if (re.prog[pc+1].ist == ist_prog_end) &&
(re.prog[pc].rep >= re.prog[pc].rep_min && re.prog[pc].rep <= re.prog[pc].rep_max) {
//println("Ok .* rep match!")
return first_match,i
}
//m_state = .end
//break
return no_match_found,0
} }
// starting and init // starting and init
@ -1697,7 +1728,8 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
if m_state == .stop { if m_state == .stop {
// we are in search mode, don't exit until the end // we are in search mode, don't exit until the end
if re.flag & f_src != 0 && ist != ist_prog_end { if ((re.flag & f_src) != 0) && (ist != ist_prog_end) {
last_fnd_pc = pc
pc = -1 pc = -1
i += char_len i += char_len
m_state = .ist_next m_state = .ist_next
@ -1741,6 +1773,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
// we have a DOT MATCH on going // we have a DOT MATCH on going
//println("ist_prog_end l_ist: ${l_ist:08x}", l_ist) //println("ist_prog_end l_ist: ${l_ist:08x}", l_ist)
if re.state_stack_index>=0 && l_ist == ist_dot_char { if re.state_stack_index>=0 && l_ist == ist_dot_char {
i = in_txt_len // dario
m_state = .stop m_state = .stop
continue continue
} }
@ -1832,7 +1865,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
first_match = i first_match = i
} }
state.match_index = i state.match_index = i
re.prog[pc].rep++ re.prog[pc].rep++ // increase repetitions
//if re.prog[pc].rep >= re.prog[pc].rep_min && re.prog[pc].rep <= re.prog[pc].rep_max { //if re.prog[pc].rep >= re.prog[pc].rep_min && re.prog[pc].rep <= re.prog[pc].rep_max {
if re.prog[pc].rep >= 0 && re.prog[pc].rep <= re.prog[pc].rep_max { if re.prog[pc].rep >= 0 && re.prog[pc].rep <= re.prog[pc].rep_max {
@ -1857,12 +1890,15 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
re.state_stack[re.state_stack_index].i = i + char_len re.state_stack[re.state_stack_index].i = i + char_len
} }
//i += char_len // next char
/*
// manage * and {0,} quantifier // manage * and {0,} quantifier
if re.prog[pc].rep_min > 0 { if re.prog[pc].rep_max == max_quantifier {
i += char_len // next char //println("manage .*")
l_ist = u32(ist_dot_char) m_state = .ist_load
continue
} }
*/
m_state = .ist_next m_state = .ist_next
continue continue
@ -2163,14 +2199,48 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
return err_internal_error, i return err_internal_error, i
} }
//println("Check end of text!")
// Check the results // Check the results
if state.match_index >= 0 { if state.match_index >= 0 {
if group_index < 0 { if group_index < 0 {
//println("OK match,natural end [$first_match,$i]")
return first_match, i if re.prog[pc].ist == ist_prog_end {
//println("program ended!!")
if (re.flag & f_src) != 0 {
//println("find return")
return first_match, i
} else {
return 0, i
}
}
//println("No Group here, natural end [$first_match,$i] state: ${state_str(m_state)} ist: $ist pgr_end: $re.prog.len")
if re.prog[pc+1].ist == ist_prog_end || re.prog[pc].ist == ist_prog_end{
rep := re.prog[pc].rep
//println("rep: $rep re.prog[pc].rep_min: ${re.prog[pc].rep_min} re.prog[pc].rep_max: ${re.prog[pc].rep_max}")
if rep >= re.prog[pc].rep_min && rep <= re.prog[pc].rep_max {
return first_match, i
}
//println("Program not finished! ")
return no_match_found, 0
}
if src_end {
//println("program end")
return first_match, i
}
//print("No match found!!")
return no_match_found, 0
} else { } else {
//println("Group match! OK")
//println("first_match: $first_match, i: $i")
//println("Skip last group") //println("Skip last group")
return first_match,group_stack[group_index--] return first_match,i
//return first_match,group_stack[group_index--]
} }
} }
//println("no_match_found, natural end") //println("no_match_found, natural end")
@ -2224,7 +2294,12 @@ fn impl_new_regex_by_size(mult int) RE {
// //
pub fn (mut re RE) match_string(in_txt string) (int,int) { pub fn (mut re RE) match_string(in_txt string) (int,int) {
start, end := re.match_base(in_txt.str,in_txt.len)
start, mut end := re.match_base(in_txt.str, in_txt.len + 1)
if end > in_txt.len {
end = in_txt.len
}
if start >= 0 && end > start { if start >= 0 && end > start {
if (re.flag & f_ms) != 0 && start > 0 { if (re.flag & f_ms) != 0 && start > 0 {
return no_match_found, 0 return no_match_found, 0
@ -2247,9 +2322,15 @@ pub fn (mut re RE) match_string(in_txt string) (int,int) {
// find try to find the first match in the input string // find try to find the first match in the input string
pub fn (mut re RE) find(in_txt string) (int,int) { pub fn (mut re RE) find(in_txt string) (int,int) {
old_flag := re.flag old_flag := re.flag
re.flag |= f_src // enable search mode re.flag |= f_src // enable search mode
start, end := re.match_base(in_txt.str, in_txt.len) start, mut end := re.match_base(in_txt.str, in_txt.len + 1)
//print("Find [$start,$end] '${in_txt[start..end]}'")
if end > in_txt.len {
end = in_txt.len
}
re.flag = old_flag re.flag = old_flag
if start >= 0 && end > start { if start >= 0 && end > start {
return start, end return start, end
} }

View File

@ -7,11 +7,11 @@ pub fn (mut re RE) compile_opt(pattern string) ? {
if re_err != compile_ok { if re_err != compile_ok {
mut err_msg := strings.new_builder(300) mut err_msg := strings.new_builder(300)
err_str := re.get_parse_error_string(re_err) err_msg.write("query: $pattern\n")
err_msg.write("$err_str\n")
err_msg.write(" query: $pattern\n")
line := "-".repeat(err_pos) line := "-".repeat(err_pos)
err_msg.write(" err pos: ${line}^") err_msg.write("err : ${line}^\n")
err_str := re.get_parse_error_string(re_err)
err_msg.write("ERROR: $err_str\n")
return error_with_code(err_msg.str(), re_err) return error_with_code(err_msg.str(), re_err)
} }
} }

View File

@ -14,6 +14,12 @@ struct TestItem {
const( const(
match_test_suite = [ match_test_suite = [
// base OR
TestItem{"a",r"a|b",0,1},
TestItem{"a",r"b|a",0,1},
TestItem{"b",r"a|b",0,1},
TestItem{"b",r"b|a",0,1},
TestItem{"c",r"b|a",-1,0},
// positive // positive
TestItem{"this is a good.",r"this",0,4}, TestItem{"this is a good.",r"this",0,4},
@ -38,7 +44,7 @@ match_test_suite = [
TestItem{"this these those ",r"(th[eio]se? ?)+",0,17}, TestItem{"this these those ",r"(th[eio]se? ?)+",0,17},
TestItem{"this these those ",r"(th[eio]se? )+",0,17}, TestItem{"this these those ",r"(th[eio]se? )+",0,17},
TestItem{"this,these,those. over",r"(th[eio]se?[,. ])+",0,17}, TestItem{"this,these,those. over",r"(th[eio]se?[,. ])+",0,17},
TestItem{"soday,this,these,those. over",r"(th[eio]se?[,. ])+",6,23}, TestItem{"soday,this,these,those. over",r".+(th[eio]se?[,. ])+",0,23},
TestItem{"cpapaz",r"(c(pa)+z)",0,6}, TestItem{"cpapaz",r"(c(pa)+z)",0,6},
TestItem{"this is a cpapaz over",r"(c(pa)+z)",10,16}, TestItem{"this is a cpapaz over",r"(c(pa)+z)",10,16},
@ -60,7 +66,7 @@ match_test_suite = [
TestItem{"soday,this,these,those. over",r".*,(th[eio]se?[,. ])+",0,23}, TestItem{"soday,this,these,those. over",r".*,(th[eio]se?[,. ])+",0,23},
TestItem{"soday,this,these,thesa.thesi over",r".*,(th[ei]se?[,. ])+(thes[ai][,. ])+",0,29}, TestItem{"soday,this,these,thesa.thesi over",r".*,(th[ei]se?[,. ])+(thes[ai][,. ])+",0,29},
TestItem{"cpapaz ole. pippo,",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,18}, TestItem{"cpapaz ole. pippo,",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,18},
TestItem{"cpapaz ole. pippo",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,17}, TestItem{"cpapaz ole. pippo",r"(c(pa)+z)(\s+\a+[\.,]?)+",0,17},
TestItem{"cpapaz ole. pippo, 852",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,18}, TestItem{"cpapaz ole. pippo, 852",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,18},
TestItem{"123cpapaz ole. pippo",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,20}, TestItem{"123cpapaz ole. pippo",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,20},
TestItem{"...cpapaz ole. pippo",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,20}, TestItem{"...cpapaz ole. pippo",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,20},
@ -74,7 +80,6 @@ match_test_suite = [
TestItem{"/home/us_er/pippo/info-01.txt", r"(/?[-\w_]+)*\.txt$",0,29} TestItem{"/home/us_er/pippo/info-01.txt", r"(/?[-\w_]+)*\.txt$",0,29}
// negative // negative
TestItem{"zthis ciao",r"((t[hieo]+se?)\s*)+",-1,0}, TestItem{"zthis ciao",r"((t[hieo]+se?)\s*)+",-1,0},
TestItem{"this is a good.",r"thes",-1,0}, TestItem{"this is a good.",r"thes",-1,0},
@ -88,6 +93,25 @@ match_test_suite = [
// check unicode // check unicode
TestItem{"this is a test",r".*a [-Ⅵ ]+",0,34}, TestItem{"this is a test",r".*a [-Ⅵ ]+",0,34},
TestItem{"123 test",r"[-\s]+",3,23}, TestItem{"123 test",r"[-\s]+",3,23},
// new edge cases
TestItem{"12345678", r"[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]",-1,0},
TestItem{"12345678", r"[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]",0,8},
TestItem{"123456789", r"^[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$",0,9}
TestItem{"12345678", r"^\d{8}$",0,8},
TestItem{"12345678", r"^\d{7}$",-1,0},
TestItem{"12345678", r"^\d{9}$",-1,0},
TestItem{"eth", r"(oth)|(eth)",0,3},
TestItem{"et", r"(oth)|(eth)",-1,0},
TestItem{"et", r".*(oth)|(eth)",-1,0},
TestItem{"peoth", r".*(ith)|(eth)",-1,0},
TestItem{"poth", r"(eth)|(oth)",1,4},
TestItem{"poth", r"(oth)|(eth)",1,4},
TestItem{"poth", r".(oth)|(eth)$",0,4},
TestItem{"poth", r"^.(oth)|(eth)$",0,4},
TestItem{"poth", r"^\w+$",0,4},
] ]
) )
@ -150,30 +174,35 @@ const (
cgroups_test_suite = [ cgroups_test_suite = [
TestItemCGroup{ TestItemCGroup{
"http://www.ciao.mondo/hello/pippo12_/pera.html", "http://www.ciao.mondo/hello/pippo12_/pera.html",
r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+",0,46, r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+[\.|/])+",0,42,
[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46], [7, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42],
{'format':int(0),'token':1} {'format':int(0),'token':1}
}, },
TestItemCGroup{ TestItemCGroup{
"http://www.ciao.mondo/hello/pippo12_/pera.html", "http://www.ciao.mondo/hello/pippo12_/pera.html",
r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+",0,46, r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+",0,46,
[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46], [2, 0, 0, 4, 1, 7, 10],
{'format':int(0),'token':1} {'format':int(0),'token':1}
}, },
TestItemCGroup{ TestItemCGroup{
"http://www.ciao.mondo/hello/pippo12_/pera.html", "http://www.ciao.mondo/hello/pippo12_/pera.html",
r"(?P<format>https?)|(?P<format>ftps?)://([\w_]+.)+",0,46, r"(?P<format>https?)|(?P<format>ftps?)://([\w_]+\.)+",0,16,
[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46], [3, 0, 0, 4, 1, 7, 11, 1, 11, 16],
{'format':int(0)} {'format':int(0)}
}, },
] ]
) )
const (
debug = false // true for debug println
)
fn test_regex(){ fn test_regex(){
// check capturing groups // check capturing groups
for c,to in cgroups_test_suite { for c,to in cgroups_test_suite {
// debug print // debug print
//println("#$c [$to.src] q[$to.q] ($to.s, $to.e)") if debug { println("#$c [$to.src] q[$to.q] ($to.s, $to.e)") }
mut re := regex.regex_opt(to.q) or { mut re := regex.regex_opt(to.q) or {
eprintln('err: $err') eprintln('err: $err')
@ -191,16 +220,16 @@ fn test_regex(){
} }
if start != to.s || end != to.e { if start != to.s || end != to.e {
println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end") //println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
println("ERROR!") println("ERROR!")
//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e) C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
assert false assert false
continue continue
} }
// check cgroups // check cgroups
if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] { if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] {
println("Capturing group len error!") println("Capturing group len error! ${re.group_csave[0]}")
assert false assert false
continue continue
} }
@ -225,9 +254,9 @@ fn test_regex(){
} }
// check find_all // check find_all
for _,to in match_test_suite_fa{ for c,to in match_test_suite_fa{
// debug print // debug print
//println("#$c [$to.src] q[$to.q] $to.r") if debug { println("#$c [$to.src] q[$to.q] $to.r") }
mut re := regex.regex_opt(to.q) or { mut re := regex.regex_opt(to.q) or {
eprintln('err: $err') eprintln('err: $err')
@ -253,9 +282,9 @@ fn test_regex(){
} }
// check replace // check replace
for _,to in match_test_suite_re{ for c,to in match_test_suite_re{
// debug print // debug print
//println("#$c [$to.src] q[$to.q] $to.r") if debug { println("#$c [$to.src] q[$to.q] $to.r") }
mut re := regex.regex_opt(to.q) or { mut re := regex.regex_opt(to.q) or {
eprintln('err: $err') eprintln('err: $err')
@ -274,7 +303,7 @@ fn test_regex(){
// check match and find // check match and find
for c,to in match_test_suite { for c,to in match_test_suite {
// debug print // debug print
println("#$c [$to.src] q[$to.q] $to.s $to.e") if debug { println("#$c [$to.src] q[$to.q] $to.s $to.e") }
// test the find // test the find
if to.s > 0 { if to.s > 0 {
@ -289,7 +318,7 @@ fn test_regex(){
if start != to.s || end != to.e { if start != to.s || end != to.e {
err_str := re.get_parse_error_string(start) err_str := re.get_parse_error_string(start)
println("ERROR : $err_str") println("ERROR : $err_str start: ${start} end: ${end}")
assert false assert false
} else { } else {
//tmp_str := text[start..end] //tmp_str := text[start..end]
@ -334,4 +363,7 @@ fn test_regex(){
} }
} }
if debug { println("DONE!") }
} }

View File

@ -1,17 +0,0 @@
import regex
// Module-level constant holding the regex built from the pattern 'a|b'
// via regex.regex_opt; it is shared by the helper `f` below.
const (
// NOTE(review): the trailing `?` presumably propagates a compile error from regex_opt — confirm.
a_or_b = regex.regex_opt('a|b') ?
)
// f reports whether match_string, run on `s` with the module-level
// `a_or_b` regex, returns a start index other than -1.
fn f(s string) bool {
	// work on a mutable local taken from the constant, since match_string needs a mutable receiver
	mut matcher := a_or_b
	match_start, _ := matcher.match_string(s)
	// a start index of -1 is treated as "no match"
	if match_start == -1 {
		return false
	}
	return true
}
// test_const_regex_works checks that a regex stored in a module-level
// constant can be used for matching: 'a' and 'b' must match the
// 'a|b' pattern, while 'c' must not.
fn test_const_regex_works() {
assert f('a') == true
assert f('b') == true
// 'c' is on neither side of the OR, so f must report no match
assert f('c') == false
}