regex: implement negation groups, more flexibility for bsls, small fixes (#12981)

* removed memory allocations in cleaning during clear calls

* first test implementation of negative groups, more flexibility for bsls

* fixed bsls failed tests

* fmt

* added \n to regex tests
pull/12992/head
penguindark 2021-12-27 21:18:48 +01:00 committed by GitHub
parent 14648fa41e
commit dadc965082
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 248 additions and 38 deletions

View File

@ -17,30 +17,31 @@ module regex
import strings
pub const (
v_regex_version = '1.0 alpha' // regex module version
v_regex_version = '1.0 alpha' // regex module version
max_code_len = 256 // default small base code len for the regex programs
max_quantifier = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
max_code_len = 256 // default small base code len for the regex programs
max_quantifier = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
// spaces chars (here only westerns!!) TODO: manage all the spaces from unicode
spaces = [` `, `\t`, `\n`, `\r`, `\v`, `\f`]
spaces = [` `, `\t`, `\n`, `\r`, `\v`, `\f`]
// new line chars, for now only '\n' and '\r'
new_line_list = [`\n`, `\r`]
new_line_list = [`\n`, `\r`]
// Results
no_match_found = -1
no_match_found = -1
// Errors
compile_ok = 0 // the regex string compiled, all ok
err_char_unknown = -2 // the char used is unknow to the system
err_undefined = -3 // the compiler symbol is undefined
err_internal_error = -4 // Bug in the regex system!!
err_cc_alloc_overflow = -5 // memory for char class full!!
err_syntax_error = -6 // syntax error in regex compiling
err_groups_overflow = -7 // max number of groups reached
err_groups_max_nested = -8 // max number of nested group reached
err_group_not_balanced = -9 // group not balanced
err_group_qm_notation = -10 // group invalid notation
err_invalid_or_with_cc = -11 // invalid or on two consecutive char class
compile_ok = 0 // the regex string compiled, all ok
err_char_unknown = -2 // the char used is unknow to the system
err_undefined = -3 // the compiler symbol is undefined
err_internal_error = -4 // Bug in the regex system!!
err_cc_alloc_overflow = -5 // memory for char class full!!
err_syntax_error = -6 // syntax error in regex compiling
err_groups_overflow = -7 // max number of groups reached
err_groups_max_nested = -8 // max number of nested group reached
err_group_not_balanced = -9 // group not balanced
err_group_qm_notation = -10 // group invalid notation
err_invalid_or_with_cc = -11 // invalid or on two consecutive char class
err_neg_group_quantifier = -12 // negation groups can not have quantifier
)
const (
@ -198,6 +199,7 @@ pub fn (re RE) get_parse_error_string(err int) string {
regex.err_group_not_balanced { return 'err_group_not_balanced' }
regex.err_group_qm_notation { return 'err_group_qm_notation' }
regex.err_invalid_or_with_cc { return 'err_invalid_or_with_cc' }
regex.err_neg_group_quantifier { return 'err_neg_group_quantifier' }
else { return 'err_unknown' }
}
}
@ -246,13 +248,15 @@ mut:
// validator function pointer
validator FnValidator
// groups variables
group_rep int // repetition of the group
group_neg bool // negation flag for the group, 0 => no negation > 0 => negataion
group_rep int // repetition of the group
group_id int = -1 // id of the group
goto_pc int = -1 // jump to this PC if is needed
// OR flag for the token
next_is_or bool // true if the next token is an OR
// dot_char token variables
dot_check_pc int = -1 // pc of the next token to check
dot_check_pc int = -1 // pc of the next token to check for dots
bsls_check_pc int = -1 // pc of the next token to check for bsls
last_dot_flag bool // if true indicate that is the last dot_char in the regex
// debug fields
source_index int
@ -333,7 +337,17 @@ fn (mut re RE) reset() {
// init groups array
if re.group_count > 0 {
re.groups = []int{len: re.group_count * 2, init: -1}
if re.groups.len == 0 {
// first run alloc memory
re.groups = []int{len: re.group_count * 2, init: -1}
} else {
// subsequent executions, only clean up the memory
i = 0
for i < re.groups.len {
re.groups[i] = -1
i++
}
}
}
// reset group_csave
@ -811,8 +825,8 @@ enum Group_parse_state {
finish
}
// parse_groups parse a group for ? (question mark) syntax, if found, return (error, capture_flag, name_of_the_group, next_index)
fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
// parse_groups parses a group for the ? (question mark) syntax and returns (error, capture_flag, negate_flag, name_of_the_group, next_index)
fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, bool, string, int) {
mut status := Group_parse_state.start
mut i := in_i
mut name := ''
@ -836,10 +850,16 @@ fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
continue
}
// negate group
if status == .q_mark1 && ch == `!` {
i += char_len
return 0, false, true, name, i
}
// non capturing group
if status == .q_mark1 && ch == `:` {
i += char_len
return 0, false, name, i
return 0, false, false, name, i
}
// enter in P section
@ -852,7 +872,7 @@ fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
// not a valid q mark found
if status == .q_mark1 {
// println("NO VALID Q MARK")
return -2, true, name, i
return -2, true, false, name, i
}
if status == .p_status && ch == `<` {
@ -878,20 +898,20 @@ fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
// end name
if status == .p_in_name && ch == `>` {
i += char_len
return 0, true, name, i
return 0, true, false, name, i
}
// error on name group
if status == .p_in_name {
return -2, true, name, i
return -2, true, false, name, i
}
// normal group, nothing to do, exit
return 0, true, name, i
return 0, true, false, name, i
}
// UNREACHABLE
// println("ERROR!! NOT MEANT TO BE HERE!!1")
return -2, true, name, i
return -2, true, false, name, i
}
const (
@ -949,7 +969,8 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
return regex.err_groups_max_nested, i + 1
}
tmp_res, cgroup_flag, cgroup_name, next_i := re.parse_groups(in_txt, i)
tmp_res, cgroup_flag, negate_flag, cgroup_name, next_i := re.parse_groups(in_txt,
i)
// manage question mark format error
if tmp_res < -1 {
@ -984,6 +1005,12 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
re.prog[pc].rep_min = 1
re.prog[pc].rep_max = 1
// manage negation groups
if negate_flag == true {
re.prog[pc].group_neg = true
re.prog[pc].rep_min = 0 // may be not catched, but it is ok
}
// set the group id
if cgroup_flag == false {
// println("NO CAPTURE GROUP")
@ -1015,6 +1042,11 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
re.prog[goto_pc].goto_pc = pc // start goto point to the end group pc
// re.prog[goto_pc].group_id = group_count // id of this group, used for storing data
if re.prog[goto_pc].group_neg == true {
re.prog[pc].group_neg = re.prog[goto_pc].group_neg
re.prog[pc].rep_min = re.prog[goto_pc].rep_min
}
pc = pc + 1
i = i + char_len
continue
@ -1050,6 +1082,12 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
char_next, char_next_len = re.get_char(in_txt, i + char_len)
}
mut quant_flag := true
// negation groups can not have quantifiers
if re.prog[pc - 1].group_neg == true && char_tmp in [`?`, `+`, `*`, `{`] {
return regex.err_neg_group_quantifier, i
}
match byte(char_tmp) {
`?` {
// println("q: ${char_tmp:c}")
@ -1215,6 +1253,7 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
dot_char_count++
mut pc2 := pc1 + 1
for pc2 < pc {
// consecutive dot chars is an error
if re.prog[pc2].ist == regex.ist_dot_char {
return regex.err_syntax_error, 0
}
@ -1246,6 +1285,49 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
}
}
//
// manage bsls_char
//
// find the checks for bsls, if any...
pc1 = 0
mut bsls_char_count := 0
mut last_bsls_char_pc := -1
for pc1 < pc {
if re.prog[pc1].ist == regex.ist_bsls_char {
// println("bsls_char pc: $pc1")
last_bsls_char_pc = pc1
bsls_char_count++
mut pc2 := pc1 + 1
for pc2 < pc {
if re.prog[pc2].ist !in [rune(regex.ist_prog_end), regex.ist_group_end,
regex.ist_group_start] {
// println("Next bsls check is PC: ${pc2}")
re.prog[pc1].bsls_check_pc = pc2
break
}
pc2++
}
}
pc1++
}
// println("last_bsls_char_pc: $last_bsls_char_pc")
if last_bsls_char_pc >= 0 {
pc1 = last_bsls_char_pc + 1
mut is_last_bsls := true
for pc1 < pc {
if re.prog[pc1].ist !in [rune(regex.ist_prog_end), regex.ist_group_end] {
is_last_bsls = false
break
}
pc1++
}
if is_last_bsls {
re.prog[last_bsls_char_pc].last_dot_flag = true
}
}
//******************************************
// OR branch
@ -1405,14 +1487,15 @@ pub fn (re RE) get_query() string {
// GROUP start
if ch == regex.ist_group_start {
if re.debug == 0 {
res.write_string('(')
} else {
if tk.group_id == -1 {
res.write_string('(?:') // non capturing group
} else {
res.write_string('#${tk.group_id}(')
}
if re.debug > 0 {
res.write_string('#$tk.group_id')
}
res.write_string('(')
if tk.group_neg == true {
res.write_string('?!') // negation group
} else if tk.group_id == -1 {
res.write_string('?:') // non capturing group
}
for x in re.group_map.keys() {
@ -1470,7 +1553,7 @@ pub fn (re RE) get_query() string {
}
// quantifier
if !(tk.rep_min == 1 && tk.rep_max == 1) {
if !(tk.rep_min == 1 && tk.rep_max == 1) && tk.group_neg == false {
if tk.rep_min == 0 && tk.rep_max == 1 {
res.write_string('?')
} else if tk.rep_min == 1 && tk.rep_max == regex.max_quantifier {
@ -2081,6 +2164,7 @@ pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) {
continue
}
// check bsls
/*
else if ist == regex.ist_bsls_char {
state.match_flag = false
tmp_res := re.prog[state.pc].validator(byte(ch))
@ -2103,6 +2187,101 @@ pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) {
m_state = .ist_quant_n
continue
}
*/
else if ist == regex.ist_bsls_char {
// println("ist_bsls_char rep: ${re.prog[state.pc].rep}")
// check next token to be false
mut next_check_flag := false
// if we are done with max go on dot char are dedicated case!!
if re.prog[state.pc].rep >= re.prog[state.pc].rep_max {
re.state_list.pop()
m_state = .ist_next
continue
}
if re.prog[state.pc].bsls_check_pc >= 0
&& re.prog[state.pc].rep >= re.prog[state.pc].rep_min {
// load the char
// ch_t, _ := re.get_charb(in_txt, state.i+char_len)
ch_t := ch
chk_pc := re.prog[state.pc].bsls_check_pc
// simple char
if re.prog[chk_pc].ist == regex.ist_simple_char {
if re.prog[chk_pc].ch == ch_t {
next_check_flag = true
}
// println("Check [ist_simple_char] [${re.prog[chk_pc].ch}]==[${ch_t:c}] => $next_check_flag")
}
// char char_class
else if re.prog[chk_pc].ist == regex.ist_char_class_pos
|| re.prog[chk_pc].ist == regex.ist_char_class_neg {
mut cc_neg := false
if re.prog[chk_pc].ist == regex.ist_char_class_neg {
cc_neg = true
}
mut cc_res := re.check_char_class(chk_pc, ch_t)
if cc_neg {
cc_res = !cc_res
}
next_check_flag = cc_res
// println("Check [ist_char_class] => $next_check_flag")
}
// check bsls
else if re.prog[chk_pc].ist == regex.ist_bsls_char {
next_check_flag = re.prog[chk_pc].validator(byte(ch_t))
// println("Check [ist_bsls_char] => $next_check_flag")
}
}
// check if we must continue or pass to the next IST
if next_check_flag == true && re.prog[state.pc + 1].ist != regex.ist_prog_end {
// println("save the state!!")
mut dot_state := StateObj{
group_index: state.group_index
match_flag: state.match_flag
match_index: state.match_index
first_match: state.first_match
pc: state.pc
i: state.i + char_len
char_len: char_len
last_dot_pc: state.pc
}
// if we are managing a .* stay on the same char on return
if re.prog[state.pc].rep_min == 0 {
dot_state.i -= char_len
}
re.state_list << dot_state
m_state = .ist_quant_n
// println("dot_char stack len: ${re.state_list.len}")
continue
}
tmp_res := re.prog[state.pc].validator(byte(ch))
if tmp_res == false {
m_state = .ist_quant_n
continue
}
// println("${ch} => ${tmp_res}")
state.match_flag = true
l_ist = u32(regex.ist_dot_char)
if state.first_match < 0 {
state.first_match = state.i
}
state.match_index = state.i
re.prog[state.pc].rep++ // increase repetitions
state.i += char_len
m_state = .ist_quant_p
continue
}
// simple char IST
else if ist == regex.ist_simple_char {
// println("ist_simple_char")
@ -2213,6 +2392,13 @@ pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) {
tmp_pc = re.group_data[state.group_index]
}
if re.prog[tmp_pc].group_neg == true {
// println("***** Negation of the group")
result = regex.no_match_found
m_state = .stop
continue
}
rep := re.prog[tmp_pc].group_rep
if rep < re.prog[tmp_pc].rep_min {

View File

@ -760,3 +760,27 @@ fn test_long_query() {
//println("$start, $end")
assert start >= 0 && end == base_string.len
}
// Test_negation_group is a single case for the negation group test:
// an input text paired with the expected match outcome.
struct Test_negation_group {
src string // input text passed to match_string
res bool // expected outcome: true when a match must be found (start >= 0)
}
// negation_groups lists the inputs checked by test_negation_groups,
// each with the match outcome expected for it.
const (
	negation_groups = [
		Test_negation_group{
			src: 'automobile'
			res: false
		},
		Test_negation_group{
			src: 'botomobile'
			res: true
		},
		Test_negation_group{
			src: 'auto_caravan'
			res: false
		},
		Test_negation_group{
			src: 'moto_mobile'
			res: true
		},
		Test_negation_group{
			src: 'pippole'
			res: true
		},
		Test_negation_group{
			src: 'boring test'
			res: false
		},
	]
)
// test_negation_groups compiles a query that starts with a negation group
// `(?!auto)` and checks, for every entry of `negation_groups`, that
// `match_string` reports a match (start >= 0) exactly when `res` is true.
fn test_negation_groups() {
	// the query is never modified, so it does not need to be `mut`
	query := r"(?!auto)\w+le"
	mut re := regex.regex_opt(query) or { panic(err) }
	for test in negation_groups {
		// only the start index is checked; the end index is discarded
		start, _ := re.match_string(test.src)
		assert (start >= 0) == test.res
	}
}