regex: improve errors for edge cases (#13008)

* code cleanup, added clearer errors for dots and ORs

* added failed match index for better find functions, updated tests

* added the input index to the failed-match result, updated tests

* test cleanup

* test check
pull/13010/head
penguindark 2022-01-01 08:21:27 +01:00 committed by GitHub
parent 7b4ba66720
commit 908296cdfb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 41 additions and 52 deletions

View File

@ -42,6 +42,7 @@ pub const (
err_group_qm_notation = -10 // group invalid notation err_group_qm_notation = -10 // group invalid notation
err_invalid_or_with_cc = -11 // invalid or on two consecutive char class err_invalid_or_with_cc = -11 // invalid or on two consecutive char class
err_neg_group_quantifier = -12 // negation groups can not have quantifier err_neg_group_quantifier = -12 // negation groups can not have quantifier
err_consecutive_dots = -13 // two consecutive dots is an error
) )
const ( const (
@ -200,6 +201,7 @@ pub fn (re RE) get_parse_error_string(err int) string {
regex.err_group_qm_notation { return 'err_group_qm_notation' } regex.err_group_qm_notation { return 'err_group_qm_notation' }
regex.err_invalid_or_with_cc { return 'err_invalid_or_with_cc' } regex.err_invalid_or_with_cc { return 'err_invalid_or_with_cc' }
regex.err_neg_group_quantifier { return 'err_neg_group_quantifier' } regex.err_neg_group_quantifier { return 'err_neg_group_quantifier' }
regex.err_consecutive_dots { return 'err_consecutive_dots' }
else { return 'err_unknown' } else { return 'err_unknown' }
} }
} }
@ -283,14 +285,7 @@ pub const (
f_src = 0x00020000 // search mode enabled f_src = 0x00020000 // search mode enabled
) )
struct StateDotObj { // Log function prototype
mut:
i int = -1 // char index in the input buffer
pc int = -1 // program counter saved
mi int = -1 // match_index saved
group_stack_index int = -1 // continuous save on capturing groups
}
pub type FnLog = fn (string) pub type FnLog = fn (string)
pub struct RE { pub struct RE {
@ -1042,6 +1037,7 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
re.prog[goto_pc].goto_pc = pc // start goto point to the end group pc re.prog[goto_pc].goto_pc = pc // start goto point to the end group pc
// re.prog[goto_pc].group_id = group_count // id of this group, used for storing data // re.prog[goto_pc].group_id = group_count // id of this group, used for storing data
// duplicate the negation group info and settings
if re.prog[goto_pc].group_neg == true { if re.prog[goto_pc].group_neg == true {
re.prog[pc].group_neg = re.prog[goto_pc].group_neg re.prog[pc].group_neg = re.prog[goto_pc].group_neg
re.prog[pc].rep_min = re.prog[goto_pc].rep_min re.prog[pc].rep_min = re.prog[goto_pc].rep_min
@ -1054,6 +1050,11 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
// ist_dot_char match any char except the following token // ist_dot_char match any char except the following token
if char_len == 1 && pc >= 0 && byte(char_tmp) == `.` { if char_len == 1 && pc >= 0 && byte(char_tmp) == `.` {
// consecutive ist_dot_char is a syntax error
if pc > 0 && re.prog[pc - 1].ist == regex.ist_dot_char {
return regex.err_consecutive_dots, i
}
re.prog[pc].ist = u32(0) | regex.ist_dot_char re.prog[pc].ist = u32(0) | regex.ist_dot_char
re.prog[pc].rep_min = 1 re.prog[pc].rep_min = 1
re.prog[pc].rep_max = 1 re.prog[pc].rep_max = 1
@ -1228,7 +1229,7 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
// check for OR at the end of the program // check for OR at the end of the program
if pc > 0 && re.prog[pc - 1].ist == regex.ist_or_branch { if pc > 0 && re.prog[pc - 1].ist == regex.ist_or_branch {
return regex.err_syntax_error, in_txt.len return regex.err_syntax_error, in_txt.len - 1
} }
// store the number of groups in the query // store the number of groups in the query
@ -1873,7 +1874,7 @@ pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) {
} }
// print("No good exit!!") // print("No good exit!!")
return regex.no_match_found, 0 return regex.no_match_found, state.i
} }
// starting and init // starting and init
@ -1959,7 +1960,7 @@ pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) {
} }
// exit on no match // exit on no match
return result, 0 return result, state.i
} }
// ist_load // ist_load
else if m_state == .ist_load { else if m_state == .ist_load {
@ -2164,30 +2165,6 @@ pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) {
continue continue
} }
// check bsls // check bsls
/*
else if ist == regex.ist_bsls_char {
state.match_flag = false
tmp_res := re.prog[state.pc].validator(byte(ch))
// println("BSLS in_ch: ${ch:c} res: $tmp_res")
if tmp_res {
state.match_flag = true
l_ist = u32(regex.ist_bsls_char)
if state.first_match < 0 {
state.first_match = state.i
}
state.match_index = state.i
re.prog[state.pc].rep++ // increase repetitions
state.i += char_len // next char
m_state = .ist_quant_p
continue
}
m_state = .ist_quant_n
continue
}
*/
else if ist == regex.ist_bsls_char { else if ist == regex.ist_bsls_char {
// println("ist_bsls_char rep: ${re.prog[state.pc].rep}") // println("ist_bsls_char rep: ${re.prog[state.pc].rep}")
@ -2541,14 +2518,14 @@ pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) {
return state.first_match, state.i return state.first_match, state.i
} }
// println("Program not finished! ") // println("Program not finished! ")
return regex.no_match_found, 0 return regex.no_match_found, state.i
} }
if src_end { if src_end {
// println("program end") // println("program end")
return state.first_match, state.i return state.first_match, state.i
} }
// print("No match found!!") // print("No match found!!")
return regex.no_match_found, 0 return regex.no_match_found, state.i
} else { } else {
// println("Group match! OK") // println("Group match! OK")
// println("first_match: $state.first_match, i: $state.i") // println("first_match: $state.first_match, i: $state.i")
@ -2559,5 +2536,5 @@ pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) {
} }
} }
// println("no_match_found, natural end") // println("no_match_found, natural end")
return regex.no_match_found, 0 return regex.no_match_found, state.i
} }

View File

@ -18,7 +18,7 @@ const(
match_test_suite = [ match_test_suite = [
// minus in CC // minus in CC
TestItem{"d.def",r"abc.\.[\w\-]{,100}",-1,0}, TestItem{"d.def",r"abc.\.[\w\-]{,100}",-1,0},
TestItem{"abc12345.asd",r"abc.\.[\w\-]{,100}",-1,0}, TestItem{"abc12345.asd",r"abc.\.[\w\-]{,100}",-1,4},
TestItem{"abca.exe",r"abc.\.[\w\-]{,100}",0,8}, TestItem{"abca.exe",r"abc.\.[\w\-]{,100}",0,8},
TestItem{"abc2.exe-test_12",r"abc.\.[\w\-]{,100}",0,16}, TestItem{"abc2.exe-test_12",r"abc.\.[\w\-]{,100}",0,16},
TestItem{"abcdefGHK",r"[a-f]+\A+",0,9}, TestItem{"abcdefGHK",r"[a-f]+\A+",0,9},
@ -96,30 +96,30 @@ match_test_suite = [
// negative // negative
TestItem{"zthis ciao",r"((t[hieo]+se?)\s*)+",-1,0}, TestItem{"zthis ciao",r"((t[hieo]+se?)\s*)+",-1,0},
TestItem{"this is a good.",r"thes",-1,0}, TestItem{"this is a good.",r"thes",-1,2},
TestItem{"test1post.pip.com, pera",r"[\w]+@([\w]+\.)+\w+",-1,0}, TestItem{"test1post.pip.com, pera",r"[\w]+@([\w]+\.)+\w+",-1,9},
TestItem{"this cpapaz adce",r"(c(pa)+z)(\s[\a]+){2}",-1,0}, TestItem{"this cpapaz adce",r"(c(pa)+z)(\s[\a]+){2}",-1,0},
TestItem{"this cpapaz adce aabe third",r"(c(pa)+z)(\s[\a]+){2}$",-1,0}, TestItem{"this cpapaz adce aabe third",r"(c(pa)+z)(\s[\a]+){2}$",-1,0},
TestItem{"1234this cpapaz adce aabe ter",r"(c(pa)+z)(\s[\a]+){2}$",-1,0}, TestItem{"1234this cpapaz adce aabe ter",r"(c(pa)+z)(\s[\a]+){2}$",-1,0},
TestItem{"cpapaz ole. pipipo,",r"^.*c.+ol?e.*p([ip])+o$",-1,0}, TestItem{"cpapaz ole. pipipo,",r"^.*c.+ol?e.*p([ip])+o$",-1,0},
TestItem{"/home/us_er/pippo/info-01.jpeg", r"(/?[-\w_]+)*\.txt$",-1,0} TestItem{"/home/us_er/pippo/info-01.jpeg", r"(/?[-\w_]+)*\.txt$",-1,26}
// check unicode // check unicode
TestItem{"this is a test",r".*a [-Ⅵ ]+",0,34}, TestItem{"this is a test",r".*a [-Ⅵ ]+",0,34},
TestItem{"123 test",r"[-\s]+",3,23}, TestItem{"123 test",r"[-\s]+",3,23},
// new edge cases // new edge cases
TestItem{"12345678", r"[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]",-1,0}, TestItem{"12345678", r"[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]",-1,8},
TestItem{"12345678", r"[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]",0,8}, TestItem{"12345678", r"[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]",0,8},
TestItem{"123456789", r"^[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$",0,9} TestItem{"123456789", r"^[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$",0,9}
TestItem{"12345678", r"^\d{8}$",0,8}, TestItem{"12345678", r"^\d{8}$",0,8},
TestItem{"12345678", r"^\d{7}$",-1,0}, TestItem{"12345678", r"^\d{7}$",-1,0},
TestItem{"12345678", r"^\d{9}$",-1,0}, TestItem{"12345678", r"^\d{9}$",-1,8},
TestItem{"eth", r"(oth)|(eth)",0,3}, TestItem{"eth", r"(oth)|(eth)",0,3},
TestItem{"et", r"(oth)|(eth)",-1,0}, TestItem{"et", r"(oth)|(eth)",-1,2},
TestItem{"et", r".*(oth)|(eth)",-1,0}, TestItem{"et", r".*(oth)|(eth)",-1,2},
TestItem{"peoth", r".*(ith)|(eth)",-1,0}, TestItem{"peoth", r".*(ith)|(eth)",-1,5},
TestItem{"poth", r"(eth)|(oth)",1,4}, TestItem{"poth", r"(eth)|(oth)",1,4},
TestItem{"poth", r"(oth)|(eth)",1,4}, TestItem{"poth", r"(oth)|(eth)",1,4},
@ -132,7 +132,7 @@ match_test_suite = [
TestItem{"accccb deer", r"^a(.*)b d(.+)r",0,11}, TestItem{"accccb deer", r"^a(.*)b d(.+)r",0,11},
TestItem{"accccb deer", r"^a(.*)b d(.+)",0,11}, TestItem{"accccb deer", r"^a(.*)b d(.+)",0,11},
TestItem{"accccb deer", r"^(.*)$",0,11}, TestItem{"accccb deer", r"^(.*)$",0,11},
TestItem{"accccb deer", r"^a(.*)b d(.+)p",-1,0}, TestItem{"accccb deer", r"^a(.*)b d(.+)p",-1,11},
TestItem{"##.#....#.##.####...#.##", r".{18}[.#]",0,19}, TestItem{"##.#....#.##.####...#.##", r".{18}[.#]",0,19},
TestItem{"#.#......##.#..#..##........##....###...##...######.......#.....#..#......#...#........###.#..#.", r'.*#[.#]{4}##[.#]{4}##[.#]{4}###',0,49}, TestItem{"#.#......##.#..#..##........##....###...##...######.......#.....#..#......#...#........###.#..#.", r'.*#[.#]{4}##[.#]{4}##[.#]{4}###',0,49},
@ -328,19 +328,19 @@ find_all_test_suite = [
[29, 49], [29, 49],
['#....###...##...####'] ['#....###...##...####']
}, },
Test_find_all{ Test_find_all{
"#.#......##.#..#..##........##....###...##...######.......#.....#..#......#...#........###.#..#.", "#.#......##.#..#..##........##....###...##...######.......#.....#..#......#...#........###.#..#.",
r".*#[.#]{4}##[.#]{4}##[.#]{4}###", r".*#[.#]{4}##[.#]{4}##[.#]{4}###",
[0, 49], [0, 49],
['#.#......##.#..#..##........##....###...##...####'] ['#.#......##.#..#..##........##....###...##...####']
}, },
Test_find_all{ Test_find_all{
"1234 Aa dddd Aaf 12334 Aa opopo Aaf", "1234 Aa dddd Aaf 12334 Aa opopo Aaf",
r"Aa.+Aaf", r"Aa.+Aaf",
[5, 16, 23, 35], [5, 16, 23, 35],
['Aa dddd Aaf', 'Aa opopo Aaf'] ['Aa dddd Aaf', 'Aa opopo Aaf']
}, },
Test_find_all{ Test_find_all{
"@for something @endfor @for something else @endfor altro testo @for body @endfor uno due @for senza dire più @endfor pippo", "@for something @endfor @for something else @endfor altro testo @for body @endfor uno due @for senza dire più @endfor pippo",
r"@for.+@endfor", r"@for.+@endfor",
[0, 22, 23, 50, 63, 80, 89, 117], [0, 22, 23, 50, 63, 80, 89, 117],

View File

@ -272,8 +272,14 @@ pub fn (mut re RE) find_all(in_txt string) []int {
i += e i += e
continue continue
} }
/*
if e > 0 {
i += e
continue
}
*/
i++
} }
i++
} }
// re.flag = old_flag // re.flag = old_flag
return res return res
@ -306,6 +312,12 @@ pub fn (mut re RE) find_all_str(in_txt string) []string {
continue continue
} }
} }
/*
if e > 0 {
i += e
continue
}
*/
i++ i++
} }
// re.flag = old_flag // re.flag = old_flag