regex: bug fixes (#7137)
parent
89952edd25
commit
4fb37e81b2
|
@ -616,7 +616,7 @@ fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, rune) {
|
|||
}
|
||||
}
|
||||
if status == .in_bsls {
|
||||
println("CC bsls not found [${ch:c}]")
|
||||
//println("CC bsls not found [${ch:c}]")
|
||||
status = .in_char
|
||||
}else {
|
||||
continue
|
||||
|
@ -1212,6 +1212,7 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {
|
|||
// set the jump in the right places
|
||||
pc1 = 0
|
||||
for pc1 < pc-2 {
|
||||
//println("Here $pc1 ${pc-2}")
|
||||
// two consecutive OR are a syntax error
|
||||
if re.prog[pc1+1].ist == ist_or_branch && re.prog[pc1+2].ist == ist_or_branch {
|
||||
return err_syntax_error, i
|
||||
|
@ -1238,8 +1239,13 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {
|
|||
re.prog[pc1+1].rep_max = pc2 + 1
|
||||
break
|
||||
}
|
||||
|
||||
pc2++
|
||||
}
|
||||
// special case query of few chars, the true can't go on the first instruction
|
||||
if re.prog[pc1+1].rep_max == pc1 {
|
||||
re.prog[pc1+1].rep_max = 3
|
||||
}
|
||||
//println("Compile OR postproc. [$pc1,OR ${pc1+1},$pc2]")
|
||||
pc1 = pc2
|
||||
continue
|
||||
|
@ -1490,6 +1496,8 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
|||
mut ch := rune(0) // examinated char
|
||||
mut char_len := 0 // utf8 examinated char len
|
||||
mut m_state := Match_state.start // start point for the matcher FSM
|
||||
mut src_end := false
|
||||
mut last_fnd_pc := -1
|
||||
|
||||
mut pc := -1 // program counter
|
||||
mut state := StateObj{} // actual state
|
||||
|
@ -1599,9 +1607,14 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
|||
dbg_line++
|
||||
}
|
||||
//******************************************
|
||||
|
||||
/* if ist == ist_prog_end {
|
||||
//println("HERE")
|
||||
break
|
||||
}
|
||||
*/
|
||||
// we're out of text, manage it
|
||||
if i >= in_txt_len || m_state == .new_line {
|
||||
src_end = true
|
||||
|
||||
// manage groups
|
||||
if group_index >= 0 && state.match_index >= 0 {
|
||||
|
@ -1644,11 +1657,29 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
|||
}
|
||||
}
|
||||
|
||||
// manage ist_dot_char
|
||||
if pc == -1 {
|
||||
pc = last_fnd_pc
|
||||
}
|
||||
//println("Finished text!!")
|
||||
//println("Instruction: ${ist:08x} pc: $pc")
|
||||
//println("min_rep: ${re.prog[pc].rep_min} max_rep: ${re.prog[pc].rep_max} rep: ${re.prog[pc].rep}")
|
||||
|
||||
// program end
|
||||
if ist == ist_prog_end {
|
||||
//println("Program end on end of text!")
|
||||
return first_match,i
|
||||
}
|
||||
|
||||
m_state = .end
|
||||
break
|
||||
//return no_match_found,0
|
||||
// if we go out of text and we are the last instruction .* check
|
||||
if (re.prog[pc+1].ist == ist_prog_end) &&
|
||||
(re.prog[pc].rep >= re.prog[pc].rep_min && re.prog[pc].rep <= re.prog[pc].rep_max) {
|
||||
//println("Ok .* rep match!")
|
||||
return first_match,i
|
||||
}
|
||||
|
||||
//m_state = .end
|
||||
//break
|
||||
return no_match_found,0
|
||||
}
|
||||
|
||||
// starting and init
|
||||
|
@ -1697,7 +1728,8 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
|||
if m_state == .stop {
|
||||
|
||||
// we are in search mode, don't exit until the end
|
||||
if re.flag & f_src != 0 && ist != ist_prog_end {
|
||||
if ((re.flag & f_src) != 0) && (ist != ist_prog_end) {
|
||||
last_fnd_pc = pc
|
||||
pc = -1
|
||||
i += char_len
|
||||
m_state = .ist_next
|
||||
|
@ -1741,9 +1773,10 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
|||
// we have a DOT MATCH on going
|
||||
//println("ist_prog_end l_ist: ${l_ist:08x}", l_ist)
|
||||
if re.state_stack_index>=0 && l_ist == ist_dot_char {
|
||||
i = in_txt_len // dario
|
||||
m_state = .stop
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
re.state_stack_index = -1
|
||||
m_state = .stop
|
||||
|
@ -1832,7 +1865,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
|||
first_match = i
|
||||
}
|
||||
state.match_index = i
|
||||
re.prog[pc].rep++
|
||||
re.prog[pc].rep++ // increase repetitions
|
||||
|
||||
//if re.prog[pc].rep >= re.prog[pc].rep_min && re.prog[pc].rep <= re.prog[pc].rep_max {
|
||||
if re.prog[pc].rep >= 0 && re.prog[pc].rep <= re.prog[pc].rep_max {
|
||||
|
@ -1857,12 +1890,15 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
|||
re.state_stack[re.state_stack_index].i = i + char_len
|
||||
}
|
||||
|
||||
//i += char_len // next char
|
||||
/*
|
||||
// manage * and {0,} quantifier
|
||||
if re.prog[pc].rep_min > 0 {
|
||||
i += char_len // next char
|
||||
l_ist = u32(ist_dot_char)
|
||||
if re.prog[pc].rep_max == max_quantifier {
|
||||
//println("manage .*")
|
||||
m_state = .ist_load
|
||||
continue
|
||||
}
|
||||
|
||||
*/
|
||||
m_state = .ist_next
|
||||
continue
|
||||
|
||||
|
@ -2163,14 +2199,48 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
|||
return err_internal_error, i
|
||||
}
|
||||
|
||||
//println("Check end of text!")
|
||||
// Check the results
|
||||
if state.match_index >= 0 {
|
||||
if group_index < 0 {
|
||||
//println("OK match,natural end [$first_match,$i]")
|
||||
return first_match, i
|
||||
|
||||
if re.prog[pc].ist == ist_prog_end {
|
||||
//println("program ended!!")
|
||||
|
||||
if (re.flag & f_src) != 0 {
|
||||
//println("find return")
|
||||
return first_match, i
|
||||
} else {
|
||||
return 0, i
|
||||
}
|
||||
}
|
||||
|
||||
//println("No Group here, natural end [$first_match,$i] state: ${state_str(m_state)} ist: $ist pgr_end: $re.prog.len")
|
||||
|
||||
if re.prog[pc+1].ist == ist_prog_end || re.prog[pc].ist == ist_prog_end{
|
||||
rep := re.prog[pc].rep
|
||||
//println("rep: $rep re.prog[pc].rep_min: ${re.prog[pc].rep_min} re.prog[pc].rep_max: ${re.prog[pc].rep_max}")
|
||||
if rep >= re.prog[pc].rep_min && rep <= re.prog[pc].rep_max {
|
||||
return first_match, i
|
||||
}
|
||||
//println("Program not finished! ")
|
||||
return no_match_found, 0
|
||||
}
|
||||
if src_end {
|
||||
//println("program end")
|
||||
return first_match, i
|
||||
}
|
||||
//print("No match found!!")
|
||||
return no_match_found, 0
|
||||
|
||||
|
||||
} else {
|
||||
//println("Group match! OK")
|
||||
//println("first_match: $first_match, i: $i")
|
||||
|
||||
//println("Skip last group")
|
||||
return first_match,group_stack[group_index--]
|
||||
return first_match,i
|
||||
//return first_match,group_stack[group_index--]
|
||||
}
|
||||
}
|
||||
//println("no_match_found, natural end")
|
||||
|
@ -2224,7 +2294,12 @@ fn impl_new_regex_by_size(mult int) RE {
|
|||
//
|
||||
|
||||
pub fn (mut re RE) match_string(in_txt string) (int,int) {
|
||||
start, end := re.match_base(in_txt.str,in_txt.len)
|
||||
|
||||
start, mut end := re.match_base(in_txt.str, in_txt.len + 1)
|
||||
if end > in_txt.len {
|
||||
end = in_txt.len
|
||||
}
|
||||
|
||||
if start >= 0 && end > start {
|
||||
if (re.flag & f_ms) != 0 && start > 0 {
|
||||
return no_match_found, 0
|
||||
|
@ -2247,9 +2322,15 @@ pub fn (mut re RE) match_string(in_txt string) (int,int) {
|
|||
// find tries to find the first match in the input string
|
||||
pub fn (mut re RE) find(in_txt string) (int,int) {
|
||||
old_flag := re.flag
|
||||
|
||||
re.flag |= f_src // enable search mode
|
||||
start, end := re.match_base(in_txt.str, in_txt.len)
|
||||
start, mut end := re.match_base(in_txt.str, in_txt.len + 1)
|
||||
//print("Find [$start,$end] '${in_txt[start..end]}'")
|
||||
if end > in_txt.len {
|
||||
end = in_txt.len
|
||||
}
|
||||
re.flag = old_flag
|
||||
|
||||
if start >= 0 && end > start {
|
||||
return start, end
|
||||
}
|
||||
|
|
|
@ -7,11 +7,11 @@ pub fn (mut re RE) compile_opt(pattern string) ? {
|
|||
|
||||
if re_err != compile_ok {
|
||||
mut err_msg := strings.new_builder(300)
|
||||
err_str := re.get_parse_error_string(re_err)
|
||||
err_msg.write("$err_str\n")
|
||||
err_msg.write(" query: $pattern\n")
|
||||
err_msg.write("query: $pattern\n")
|
||||
line := "-".repeat(err_pos)
|
||||
err_msg.write(" err pos: ${line}^")
|
||||
err_msg.write("err : ${line}^\n")
|
||||
err_str := re.get_parse_error_string(re_err)
|
||||
err_msg.write("ERROR: $err_str\n")
|
||||
return error_with_code(err_msg.str(), re_err)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -14,6 +14,12 @@ struct TestItem {
|
|||
|
||||
const(
|
||||
match_test_suite = [
|
||||
// base OR
|
||||
TestItem{"a",r"a|b",0,1},
|
||||
TestItem{"a",r"b|a",0,1},
|
||||
TestItem{"b",r"a|b",0,1},
|
||||
TestItem{"b",r"b|a",0,1},
|
||||
TestItem{"c",r"b|a",-1,0},
|
||||
|
||||
// positive
|
||||
TestItem{"this is a good.",r"this",0,4},
|
||||
|
@ -38,7 +44,7 @@ match_test_suite = [
|
|||
TestItem{"this these those ",r"(th[eio]se? ?)+",0,17},
|
||||
TestItem{"this these those ",r"(th[eio]se? )+",0,17},
|
||||
TestItem{"this,these,those. over",r"(th[eio]se?[,. ])+",0,17},
|
||||
TestItem{"soday,this,these,those. over",r"(th[eio]se?[,. ])+",6,23},
|
||||
TestItem{"soday,this,these,those. over",r".+(th[eio]se?[,. ])+",0,23},
|
||||
|
||||
TestItem{"cpapaz",r"(c(pa)+z)",0,6},
|
||||
TestItem{"this is a cpapaz over",r"(c(pa)+z)",10,16},
|
||||
|
@ -60,7 +66,7 @@ match_test_suite = [
|
|||
TestItem{"soday,this,these,those. over",r".*,(th[eio]se?[,. ])+",0,23},
|
||||
TestItem{"soday,this,these,thesa.thesi over",r".*,(th[ei]se?[,. ])+(thes[ai][,. ])+",0,29},
|
||||
TestItem{"cpapaz ole. pippo,",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,18},
|
||||
TestItem{"cpapaz ole. pippo",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,17},
|
||||
TestItem{"cpapaz ole. pippo",r"(c(pa)+z)(\s+\a+[\.,]?)+",0,17},
|
||||
TestItem{"cpapaz ole. pippo, 852",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,18},
|
||||
TestItem{"123cpapaz ole. pippo",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,20},
|
||||
TestItem{"...cpapaz ole. pippo",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,20},
|
||||
|
@ -74,7 +80,6 @@ match_test_suite = [
|
|||
|
||||
TestItem{"/home/us_er/pippo/info-01.txt", r"(/?[-\w_]+)*\.txt$",0,29}
|
||||
|
||||
|
||||
// negative
|
||||
TestItem{"zthis ciao",r"((t[hieo]+se?)\s*)+",-1,0},
|
||||
TestItem{"this is a good.",r"thes",-1,0},
|
||||
|
@ -88,6 +93,25 @@ match_test_suite = [
|
|||
// check unicode
|
||||
TestItem{"this is a Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ test",r".*a [Ⅰ-Ⅵ ]+",0,34},
|
||||
TestItem{"123Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ test",r"[Ⅰ-Ⅴ\s]+",3,23},
|
||||
|
||||
// new edge cases
|
||||
TestItem{"12345678", r"[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]",-1,0},
|
||||
TestItem{"12345678", r"[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]",0,8},
|
||||
TestItem{"123456789", r"^[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$",0,9}
|
||||
TestItem{"12345678", r"^\d{8}$",0,8},
|
||||
TestItem{"12345678", r"^\d{7}$",-1,0},
|
||||
TestItem{"12345678", r"^\d{9}$",-1,0},
|
||||
|
||||
TestItem{"eth", r"(oth)|(eth)",0,3},
|
||||
TestItem{"et", r"(oth)|(eth)",-1,0},
|
||||
TestItem{"et", r".*(oth)|(eth)",-1,0},
|
||||
TestItem{"peoth", r".*(ith)|(eth)",-1,0},
|
||||
|
||||
TestItem{"poth", r"(eth)|(oth)",1,4},
|
||||
TestItem{"poth", r"(oth)|(eth)",1,4},
|
||||
TestItem{"poth", r".(oth)|(eth)$",0,4},
|
||||
TestItem{"poth", r"^.(oth)|(eth)$",0,4},
|
||||
TestItem{"poth", r"^\w+$",0,4},
|
||||
]
|
||||
)
|
||||
|
||||
|
@ -150,30 +174,35 @@ const (
|
|||
cgroups_test_suite = [
|
||||
TestItemCGroup{
|
||||
"http://www.ciao.mondo/hello/pippo12_/pera.html",
|
||||
r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+",0,46,
|
||||
[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46],
|
||||
r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+[\.|/])+",0,42,
|
||||
[7, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42],
|
||||
{'format':int(0),'token':1}
|
||||
},
|
||||
TestItemCGroup{
|
||||
"http://www.ciao.mondo/hello/pippo12_/pera.html",
|
||||
r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+",0,46,
|
||||
[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46],
|
||||
[2, 0, 0, 4, 1, 7, 10],
|
||||
{'format':int(0),'token':1}
|
||||
},
|
||||
TestItemCGroup{
|
||||
"http://www.ciao.mondo/hello/pippo12_/pera.html",
|
||||
r"(?P<format>https?)|(?P<format>ftps?)://([\w_]+.)+",0,46,
|
||||
[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46],
|
||||
r"(?P<format>https?)|(?P<format>ftps?)://([\w_]+\.)+",0,16,
|
||||
[3, 0, 0, 4, 1, 7, 11, 1, 11, 16],
|
||||
{'format':int(0)}
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
const (
|
||||
debug = false // true for debug println
|
||||
)
|
||||
|
||||
fn test_regex(){
|
||||
|
||||
// check capturing groups
|
||||
for c,to in cgroups_test_suite {
|
||||
// debug print
|
||||
//println("#$c [$to.src] q[$to.q] ($to.s, $to.e)")
|
||||
if debug { println("#$c [$to.src] q[$to.q] ($to.s, $to.e)") }
|
||||
|
||||
mut re := regex.regex_opt(to.q) or {
|
||||
eprintln('err: $err')
|
||||
|
@ -191,16 +220,16 @@ fn test_regex(){
|
|||
}
|
||||
|
||||
if start != to.s || end != to.e {
|
||||
println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
|
||||
//println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
|
||||
println("ERROR!")
|
||||
//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
|
||||
C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
|
||||
assert false
|
||||
continue
|
||||
}
|
||||
|
||||
// check cgroups
|
||||
if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] {
|
||||
println("Capturing group len error!")
|
||||
println("Capturing group len error! ${re.group_csave[0]}")
|
||||
assert false
|
||||
continue
|
||||
}
|
||||
|
@ -225,9 +254,9 @@ fn test_regex(){
|
|||
}
|
||||
|
||||
// check find_all
|
||||
for _,to in match_test_suite_fa{
|
||||
for c,to in match_test_suite_fa{
|
||||
// debug print
|
||||
//println("#$c [$to.src] q[$to.q] $to.r")
|
||||
if debug { println("#$c [$to.src] q[$to.q] $to.r") }
|
||||
|
||||
mut re := regex.regex_opt(to.q) or {
|
||||
eprintln('err: $err')
|
||||
|
@ -253,9 +282,9 @@ fn test_regex(){
|
|||
}
|
||||
|
||||
// check replace
|
||||
for _,to in match_test_suite_re{
|
||||
for c,to in match_test_suite_re{
|
||||
// debug print
|
||||
//println("#$c [$to.src] q[$to.q] $to.r")
|
||||
if debug { println("#$c [$to.src] q[$to.q] $to.r") }
|
||||
|
||||
mut re := regex.regex_opt(to.q) or {
|
||||
eprintln('err: $err')
|
||||
|
@ -274,7 +303,7 @@ fn test_regex(){
|
|||
// check match and find
|
||||
for c,to in match_test_suite {
|
||||
// debug print
|
||||
println("#$c [$to.src] q[$to.q] $to.s $to.e")
|
||||
if debug { println("#$c [$to.src] q[$to.q] $to.s $to.e") }
|
||||
|
||||
// test the find
|
||||
if to.s > 0 {
|
||||
|
@ -289,7 +318,7 @@ fn test_regex(){
|
|||
|
||||
if start != to.s || end != to.e {
|
||||
err_str := re.get_parse_error_string(start)
|
||||
println("ERROR : $err_str")
|
||||
println("ERROR : $err_str start: ${start} end: ${end}")
|
||||
assert false
|
||||
} else {
|
||||
//tmp_str := text[start..end]
|
||||
|
@ -334,4 +363,7 @@ fn test_regex(){
|
|||
}
|
||||
|
||||
}
|
||||
if debug { println("DONE!") }
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -1,17 +0,0 @@
|
|||
import regex
|
||||
|
||||
const (
|
||||
a_or_b = regex.regex_opt('a|b') ?
|
||||
)
|
||||
|
||||
fn f(s string) bool {
|
||||
mut re := a_or_b
|
||||
start, _ := re.match_string(s)
|
||||
return start != -1
|
||||
}
|
||||
|
||||
fn test_const_regex_works() {
|
||||
assert f('a') == true
|
||||
assert f('b') == true
|
||||
assert f('c') == false
|
||||
}
|
Loading…
Reference in New Issue