regex: speed optimization 2 (#7473)

pull/7481/head
penguindark 2020-12-22 21:34:46 +01:00 committed by GitHub
parent 1bc317acab
commit 4069a4c55d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 52 additions and 32 deletions

View File

@ -311,11 +311,16 @@ pub mut:
group_max_nested int = 3 // max nested group
group_max int = 8 // max allowed number of different groups
state_list []StateObj
group_csave_flag bool // flag to enable continuous saving
group_csave []int = []int{} // groups continuous save list
group_csave []int //= []int{} // groups continuous save list
group_map map[string]int // groups names map
group_stack []int
group_data []int
// flags
flag int // flag for optional parameters
@ -1574,10 +1579,10 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
mut ist := rune(0) // actual instruction
mut l_ist := rune(0) // last matched instruction
mut state_list := []StateObj{}
//mut state_list := []StateObj{}
mut group_stack := []int{len: re.group_max, init: -1}
mut group_data := []int{len: re.group_max, init: -1}
//mut group_stack := []int{len: re.group_max, init: -1}
//mut group_data := []int{len: re.group_max, init: -1}
//mut group_index := -1 // group id used to know how many groups are open
@ -1699,13 +1704,13 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
//println("End text with open groups!")
// close the groups
for state.group_index >= 0 {
tmp_pc := group_data[state.group_index]
tmp_pc := re.group_data[state.group_index]
re.prog[tmp_pc].group_rep++
//println("Closing group $state.group_index {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:${re.prog[tmp_pc].group_rep}")
if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min && re.prog[tmp_pc].group_id >= 0{
start_i := group_stack[state.group_index]
group_stack[state.group_index]=-1
start_i := re.group_stack[state.group_index]
re.group_stack[state.group_index]=-1
// save group results
g_index := re.prog[tmp_pc].group_id*2
@ -1836,9 +1841,9 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
// manage here dot char
if state_list.len > 0 {
//println("Here we are, with stop: state buffer: [${state_list.len}]")
state = state_list.pop()
if re.state_list.len > 0 {
//println("Here we are, with stop: state buffer: [${re.state_list.len}]")
state = re.state_list.pop()
state.match_flag = true
l_ist = u32(ist_dot_char)
@ -1877,8 +1882,8 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
// check GROUP start, no quantifier is checked for this token!!
else if ist == ist_group_start {
state.group_index++
group_data[state.group_index] = re.prog[state.pc].goto_pc // save where is ist_group_end, we will use it for escape
group_stack[state.group_index] = state.i // index where we start to manage
re.group_data[state.group_index] = re.prog[state.pc].goto_pc // save where is ist_group_end, we will use it for escape
re.group_stack[state.group_index] = state.i // index where we start to manage
//println("group_index $state.group_index rep ${re.prog[re.prog[state.pc].goto_pc].group_rep}")
m_state = .ist_next
@ -1894,8 +1899,8 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
//println("g.id: ${re.prog[state.pc].group_id} group_index: ${state.group_index}")
if state.group_index >= 0 && re.prog[state.pc].group_id >= 0 {
start_i := group_stack[state.group_index]
//group_stack[state.group_index]=-1
start_i := re.group_stack[state.group_index]
//re.group_stack[state.group_index]=-1
// save group results
g_index := re.prog[state.pc].group_id*2
@ -1960,7 +1965,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
// if we are done with max go on dot char are dedicated case!!
if re.prog[state.pc].rep >= re.prog[state.pc].rep_max
{
state_list.pop()
re.state_list.pop()
m_state = .ist_next
continue
}
@ -2005,7 +2010,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
// check if we must continue or pass to the next IST
if next_check_flag == true {
//println("save the state!!")
state_list << StateObj {
re.state_list << StateObj {
group_index: state.group_index
match_flag: state.match_flag
match_index: state.match_index
@ -2016,7 +2021,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
last_dot_pc: state.pc
}
m_state = .ist_quant_n
//println("dot_char stack len: $state_list.len")
//println("dot_char stack len: ${re.state_list.len}")
continue
}
@ -2136,7 +2141,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
continue
}
tmp_pc := group_data[state.group_index] // PC to the end of the group token
tmp_pc := re.group_data[state.group_index] // PC to the end of the group token
rep := re.prog[tmp_pc].group_rep // use a temp variable
re.prog[tmp_pc].group_rep = 0 // clear the repetitions
@ -2145,7 +2150,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
if rep >= re.prog[tmp_pc].rep_min {
//println("ist_quant_ng GROUP CLOSED OK group_index: $state.group_index")
state.i = group_stack[state.group_index]
state.i = re.group_stack[state.group_index]
state.pc = tmp_pc
state.group_index--
m_state = .ist_next
@ -2154,7 +2159,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
else if re.prog[tmp_pc].next_is_or {
//println("ist_quant_ng OR Negative branch")
state.i = group_stack[state.group_index]
state.i = re.group_stack[state.group_index]
state.pc = re.prog[tmp_pc+1].rep_min -1
state.group_index--
m_state = .ist_next
@ -2207,7 +2212,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
//println(".ist_quant_pg")
mut tmp_pc := state.pc
if state.group_index >= 0 {
tmp_pc = group_data[state.group_index]
tmp_pc = re.group_data[state.group_index]
}
rep := re.prog[tmp_pc].group_rep
@ -2372,7 +2377,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
//println("Skip last group")
return state.first_match,state.i
//return state.first_match,group_stack[state.group_index--]
//return state.first_match,re.group_stack[state.group_index--]
}
}
//println("no_match_found, natural end")

View File

@ -38,6 +38,9 @@ pub fn regex_opt(pattern string) ?RE {
re.group_max_nested = 128 // set max 128 group nested
re.group_max = pattern.len >> 1 // we can't have more groups than half of the pattern length
re.group_stack = []int{len: re.group_max, init: -1}
re.group_data = []int{len: re.group_max, init: -1}
// compile the pattern
re.compile_opt(pattern)?

View File

@ -16,13 +16,19 @@ module regex
******************************************************************************/
// regex create a regex object from the query string
[deprecated]
pub fn regex(in_query string) (RE,int,int){
mut re := RE{}
re.prog = []Token {len: in_query.len+1}
re.cc = []CharClass{len: in_query.len+1}
re.group_max_nested = 8
pub fn regex(pattern string) (RE,int,int){
// init regex
mut re := regex.RE{}
re.prog = []Token {len: pattern.len + 1} // max program length, cannot be longer than the pattern
re.cc = []CharClass{len: pattern.len} // there cannot be more char classes than the length of the pattern
re.group_csave_flag = false // continuous group saving disabled
re.group_max_nested = 128 // allow at most 128 nested groups
re.group_max = pattern.len >> 1 // we can't have more groups than half of the pattern length
re_err,err_pos := re.compile(in_query)
re.group_stack = []int{len: re.group_max, init: -1}
re.group_data = []int{len: re.group_max, init: -1}
re_err,err_pos := re.compile(pattern)
return re, re_err, err_pos
}
@ -38,10 +44,16 @@ pub fn new_regex_by_size(mult int) RE {
return impl_new_regex_by_size(mult)
}
fn impl_new_regex_by_size(mult int) RE {
mut re := RE{}
re.prog = []Token {len: max_code_len*mult} // max program length, default 256 istructions
re.cc = []CharClass{len: max_code_len*mult} // char class list
re.group_max_nested = 3*mult // max nested group
// init regex
mut re := regex.RE{}
re.prog = []Token {len: max_code_len*mult} // max program length, scaled by mult
re.cc = []CharClass{len: max_code_len*mult} // char class list, same length as the program
re.group_csave_flag = false // continuous group saving disabled
re.group_max_nested = 3*mult // max nested groups, scaled by mult
re.group_max = max_code_len*mult >> 1 // we can't have more groups than half of the max program length
re.group_stack = []int{len: re.group_max, init: -1}
re.group_data = []int{len: re.group_max, init: -1}
return re
}