regex: code cleaning, add more tests (#7402)

* added Crazywulf tests to the suite, removed deprecated `repeat` calls used to initialize arrays

* some cleanup in the code, fix typo in README.md
pull/7405/head^2
penguindark 2020-12-19 00:32:57 +01:00 committed by GitHub
parent ff2cfd4f38
commit 1e4b3a7b27
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 25 additions and 20 deletions

View File

@ -413,7 +413,7 @@ These functions are helpers to query the captured groups
// get_group_bounds_by_name get a group boundaries by its name // get_group_bounds_by_name get a group boundaries by its name
pub fn (re RE) get_group_bounds_by_name(group_name string) (int, int) pub fn (re RE) get_group_bounds_by_name(group_name string) (int, int)
// get_group_by_name get a group boundaries by its name // get_group_by_name get a group string by its name
pub fn (re RE) get_group_by_name(group_name string) string pub fn (re RE) get_group_by_name(group_name string) string
// get_group_by_id get a group boundaries by its id // get_group_by_id get a group boundaries by its id

View File

@ -1,6 +1,6 @@
/* /*
regex 0.9h regex 1.0 alpha
Copyright (c) 2019-2020 Dario Deledda. All rights reserved. Copyright (c) 2019-2020 Dario Deledda. All rights reserved.
Use of this source code is governed by an MIT license Use of this source code is governed by an MIT license
@ -280,7 +280,6 @@ pub const (
f_bin = 0x00000200 // work only on bytes, ignore utf-8 f_bin = 0x00000200 // work only on bytes, ignore utf-8
// behaviour modifier flags // behaviour modifier flags
//f_or = 0x00010000 // the OR work with concatenation like PCRE
f_src = 0x00020000 // search mode enabled f_src = 0x00020000 // search mode enabled
) )
@ -334,7 +333,11 @@ fn (mut re RE) reset(){
re.prog[i].rep = 0 // clear repetition of the token re.prog[i].rep = 0 // clear repetition of the token
i++ i++
} }
re.groups = [-1].repeat(re.group_count*2)
// init groups array
if re.group_count > 0 {
re.groups = []int{len: re.group_count*2, init: -1}
}
// reset group_csave // reset group_csave
re.group_csave = []int{} re.group_csave = []int{}
@ -723,7 +726,6 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
// single value {4} // single value {4}
if status == .min_parse && ch == `}` { if status == .min_parse && ch == `}` {
q_max = q_min q_max = q_min
status = .greedy status = .greedy
continue continue
} }
@ -731,7 +733,6 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
// end without max // end without max
if status == .comma_checked && ch == `}` { if status == .comma_checked && ch == `}` {
q_max = max_quantifier q_max = max_quantifier
status = .greedy status = .greedy
continue continue
} }
@ -900,8 +901,8 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {
// group management variables // group management variables
mut group_count := -1 mut group_count := -1
mut group_stack := [0 ].repeat(re.group_max_nested) mut group_stack := []int{len: re.group_max_nested, init: 0}
mut group_stack_txt_index := [-1].repeat(re.group_max_nested) mut group_stack_txt_index := []int{len: re.group_max_nested, init: -1}
mut group_stack_index := -1 mut group_stack_index := -1
re.query = in_txt // save the query string re.query = in_txt // save the query string
@ -987,7 +988,6 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {
pc = pc + 1 pc = pc + 1
continue continue
} }
// ist_group_end // ist_group_end
@ -1566,8 +1566,6 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
mut state_list := []StateObj{} mut state_list := []StateObj{}
//mut group_stack := [-1].repeat(re.group_max)
//mut group_data := [-1].repeat(re.group_max)
mut group_stack := []int{len: re.group_max, init: -1} mut group_stack := []int{len: re.group_max, init: -1}
mut group_data := []int{len: re.group_max, init: -1} mut group_data := []int{len: re.group_max, init: -1}
@ -1677,7 +1675,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
//****************************************** //******************************************
if ist == ist_prog_end { if ist == ist_prog_end {
//println("HERE") //println("HERE we end!")
break break
} }
@ -1719,9 +1717,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
// continuous save, save until we have space // continuous save, save until we have space
re.group_continuous_save(g_index) re.group_continuous_save(g_index)
} }
state.group_index-- state.group_index--
} }
} }
@ -1968,7 +1964,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
//println("Check [ist_simple_char] [${re.prog[chk_pc].ch}]==[${ch_t:c}] => $next_check_flag") //println("Check [ist_simple_char] [${re.prog[chk_pc].ch}]==[${ch_t:c}] => $next_check_flag")
} }
// char class IST // char char_class
else if re.prog[chk_pc].ist == ist_char_class_pos || re.prog[chk_pc].ist == ist_char_class_neg { else if re.prog[chk_pc].ist == ist_char_class_pos || re.prog[chk_pc].ist == ist_char_class_neg {
mut cc_neg := false mut cc_neg := false
if re.prog[chk_pc].ist == ist_char_class_neg { if re.prog[chk_pc].ist == ist_char_class_neg {
@ -1993,7 +1989,6 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
// check if we must continue or pass to the next IST // check if we must continue or pass to the next IST
if next_check_flag == true { if next_check_flag == true {
// if re.prog[state.pc].rep >= re.prog[state.pc].rep_max {
//println("save the state!!") //println("save the state!!")
state_list << StateObj { state_list << StateObj {
group_index: state.group_index group_index: state.group_index
@ -2382,8 +2377,8 @@ Public functions
[deprecated] [deprecated]
pub fn regex(in_query string) (RE,int,int){ pub fn regex(in_query string) (RE,int,int){
mut re := RE{} mut re := RE{}
re.prog = [Token{}].repeat(in_query.len+1) re.prog = []Token {len: in_query.len+1}
re.cc = [CharClass{}].repeat(in_query.len+1) re.cc = []CharClass{len: in_query.len+1}
re.group_max_nested = 8 re.group_max_nested = 8
re_err,err_pos := re.compile(in_query) re_err,err_pos := re.compile(in_query)
@ -2403,8 +2398,8 @@ pub fn new_regex_by_size(mult int) RE {
} }
fn impl_new_regex_by_size(mult int) RE { fn impl_new_regex_by_size(mult int) RE {
mut re := RE{} mut re := RE{}
re.prog = [Token{}].repeat(max_code_len*mult) // max program length, default 256 istructions re.prog = []Token {len: max_code_len*mult} // max program length, default 256 istructions
re.cc = [CharClass{}].repeat(max_code_len*mult) // char class list re.cc = []CharClass{len: max_code_len*mult} // char class list
re.group_max_nested = 3*mult // max nested group re.group_max_nested = 3*mult // max nested group
return re return re

View File

@ -128,6 +128,16 @@ match_test_suite = [
TestItem{"[ an s. s! ]( wi4ki:something )", r"\[.*\]\( *(\w*:*\w+) *\)",0,31}, TestItem{"[ an s. s! ]( wi4ki:something )", r"\[.*\]\( *(\w*:*\w+) *\)",0,31},
TestItem{"[ an s. s! ](wiki:something)", r"\[.*\]\( *(\w*:*\w+) *\)",0,28}, TestItem{"[ an s. s! ](wiki:something)", r"\[.*\]\( *(\w*:*\w+) *\)",0,28},
// Crazywulf tests (?:^|[()])(\d+)(*)(\d+)(?:$|[()])
TestItem{"1*1", r"(\d+)([*])(\d+)",0,3},
TestItem{"+1*1", r"^(\d+)([*])(\d+)",-1,0},
TestItem{"*1*1", r"(?:^|[*])(\d+)([*])(\d+)",0,4},
TestItem{"*1*1", r"(?:^|[*()])(\d+)([*])(\d+)",0,4},
TestItem{")1*1", r"(?:^|[*()])(\d+)([*])(\d+)",0,4},
TestItem{"(1*1", r"(?:^|[*()])(\d+)([*])(\d+)",0,4},
TestItem{"*1*1(", r"(?:^|[*()])(\d+)([*])(\d+)(?:$|[*()])",0,5},
TestItem{" 1*1(", r"(?:^|[*()])(\d+)([*])(\d+)(?:$|[*()])",-1,0},
TestItem{"1*1 ", r"(?:^|[*()])(\d+)([*])(\d+)(?:$|[*()])",-1,0},
] ]
) )