regex: speed optimization 2 (#7473)

pull/7481/head
penguindark 2020-12-22 21:34:46 +01:00 committed by GitHub
parent 1bc317acab
commit 4069a4c55d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 52 additions and 32 deletions

View File

@ -311,11 +311,16 @@ pub mut:
group_max_nested int = 3 // max nested group group_max_nested int = 3 // max nested group
group_max int = 8 // max allowed number of different groups group_max int = 8 // max allowed number of different groups
state_list []StateObj
group_csave_flag bool // flag to enable continuous saving group_csave_flag bool // flag to enable continuous saving
group_csave []int = []int{} // groups continuous save list group_csave []int //= []int{} // groups continuous save list
group_map map[string]int // groups names map group_map map[string]int // groups names map
group_stack []int
group_data []int
// flags // flags
flag int // flag for optional parameters flag int // flag for optional parameters
@ -1574,10 +1579,10 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
mut ist := rune(0) // actual instruction mut ist := rune(0) // actual instruction
mut l_ist := rune(0) // last matched instruction mut l_ist := rune(0) // last matched instruction
mut state_list := []StateObj{} //mut state_list := []StateObj{}
mut group_stack := []int{len: re.group_max, init: -1} //mut group_stack := []int{len: re.group_max, init: -1}
mut group_data := []int{len: re.group_max, init: -1} //mut group_data := []int{len: re.group_max, init: -1}
//mut group_index := -1 // group id used to know how many groups are open //mut group_index := -1 // group id used to know how many groups are open
@ -1699,13 +1704,13 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
//println("End text with open groups!") //println("End text with open groups!")
// close the groups // close the groups
for state.group_index >= 0 { for state.group_index >= 0 {
tmp_pc := group_data[state.group_index] tmp_pc := re.group_data[state.group_index]
re.prog[tmp_pc].group_rep++ re.prog[tmp_pc].group_rep++
//println("Closing group $state.group_index {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:${re.prog[tmp_pc].group_rep}") //println("Closing group $state.group_index {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:${re.prog[tmp_pc].group_rep}")
if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min && re.prog[tmp_pc].group_id >= 0{ if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min && re.prog[tmp_pc].group_id >= 0{
start_i := group_stack[state.group_index] start_i := re.group_stack[state.group_index]
group_stack[state.group_index]=-1 re.group_stack[state.group_index]=-1
// save group results // save group results
g_index := re.prog[tmp_pc].group_id*2 g_index := re.prog[tmp_pc].group_id*2
@ -1836,9 +1841,9 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
// manage here dot char // manage here dot char
if state_list.len > 0 { if re.state_list.len > 0 {
//println("Here we are, with stop: state buffer: [${state_list.len}]") //println("Here we are, with stop: state buffer: [${re.state_list.len}]")
state = state_list.pop() state = re.state_list.pop()
state.match_flag = true state.match_flag = true
l_ist = u32(ist_dot_char) l_ist = u32(ist_dot_char)
@ -1877,8 +1882,8 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
// check GROUP start, no quantifier is checked for this token!! // check GROUP start, no quantifier is checked for this token!!
else if ist == ist_group_start { else if ist == ist_group_start {
state.group_index++ state.group_index++
group_data[state.group_index] = re.prog[state.pc].goto_pc // save where is ist_group_end, we will use it for escape re.group_data[state.group_index] = re.prog[state.pc].goto_pc // save where is ist_group_end, we will use it for escape
group_stack[state.group_index] = state.i // index where we start to manage re.group_stack[state.group_index] = state.i // index where we start to manage
//println("group_index $state.group_index rep ${re.prog[re.prog[state.pc].goto_pc].group_rep}") //println("group_index $state.group_index rep ${re.prog[re.prog[state.pc].goto_pc].group_rep}")
m_state = .ist_next m_state = .ist_next
@ -1894,8 +1899,8 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
//println("g.id: ${re.prog[state.pc].group_id} group_index: ${state.group_index}") //println("g.id: ${re.prog[state.pc].group_id} group_index: ${state.group_index}")
if state.group_index >= 0 && re.prog[state.pc].group_id >= 0 { if state.group_index >= 0 && re.prog[state.pc].group_id >= 0 {
start_i := group_stack[state.group_index] start_i := re.group_stack[state.group_index]
//group_stack[state.group_index]=-1 //re.group_stack[state.group_index]=-1
// save group results // save group results
g_index := re.prog[state.pc].group_id*2 g_index := re.prog[state.pc].group_id*2
@ -1960,7 +1965,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
// if we are done with max go on dot char are dedicated case!! // if we are done with max go on dot char are dedicated case!!
if re.prog[state.pc].rep >= re.prog[state.pc].rep_max if re.prog[state.pc].rep >= re.prog[state.pc].rep_max
{ {
state_list.pop() re.state_list.pop()
m_state = .ist_next m_state = .ist_next
continue continue
} }
@ -2005,7 +2010,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
// check if we must continue or pass to the next IST // check if we must continue or pass to the next IST
if next_check_flag == true { if next_check_flag == true {
//println("save the state!!") //println("save the state!!")
state_list << StateObj { re.state_list << StateObj {
group_index: state.group_index group_index: state.group_index
match_flag: state.match_flag match_flag: state.match_flag
match_index: state.match_index match_index: state.match_index
@ -2016,7 +2021,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
last_dot_pc: state.pc last_dot_pc: state.pc
} }
m_state = .ist_quant_n m_state = .ist_quant_n
//println("dot_char stack len: $state_list.len") //println("dot_char stack len: ${re.state_list.len}")
continue continue
} }
@ -2136,7 +2141,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
continue continue
} }
tmp_pc := group_data[state.group_index] // PC to the end of the group token tmp_pc := re.group_data[state.group_index] // PC to the end of the group token
rep := re.prog[tmp_pc].group_rep // use a temp variable rep := re.prog[tmp_pc].group_rep // use a temp variable
re.prog[tmp_pc].group_rep = 0 // clear the repetitions re.prog[tmp_pc].group_rep = 0 // clear the repetitions
@ -2145,7 +2150,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
if rep >= re.prog[tmp_pc].rep_min { if rep >= re.prog[tmp_pc].rep_min {
//println("ist_quant_ng GROUP CLOSED OK group_index: $state.group_index") //println("ist_quant_ng GROUP CLOSED OK group_index: $state.group_index")
state.i = group_stack[state.group_index] state.i = re.group_stack[state.group_index]
state.pc = tmp_pc state.pc = tmp_pc
state.group_index-- state.group_index--
m_state = .ist_next m_state = .ist_next
@ -2154,7 +2159,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
else if re.prog[tmp_pc].next_is_or { else if re.prog[tmp_pc].next_is_or {
//println("ist_quant_ng OR Negative branch") //println("ist_quant_ng OR Negative branch")
state.i = group_stack[state.group_index] state.i = re.group_stack[state.group_index]
state.pc = re.prog[tmp_pc+1].rep_min -1 state.pc = re.prog[tmp_pc+1].rep_min -1
state.group_index-- state.group_index--
m_state = .ist_next m_state = .ist_next
@ -2207,7 +2212,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
//println(".ist_quant_pg") //println(".ist_quant_pg")
mut tmp_pc := state.pc mut tmp_pc := state.pc
if state.group_index >= 0 { if state.group_index >= 0 {
tmp_pc = group_data[state.group_index] tmp_pc = re.group_data[state.group_index]
} }
rep := re.prog[tmp_pc].group_rep rep := re.prog[tmp_pc].group_rep
@ -2372,7 +2377,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
//println("Skip last group") //println("Skip last group")
return state.first_match,state.i return state.first_match,state.i
//return state.first_match,group_stack[state.group_index--] //return state.first_match,re.group_stack[state.group_index--]
} }
} }
//println("no_match_found, natural end") //println("no_match_found, natural end")

View File

@ -38,6 +38,9 @@ pub fn regex_opt(pattern string) ?RE {
re.group_max_nested = 128 // set max 128 group nested re.group_max_nested = 128 // set max 128 group nested
re.group_max = pattern.len >> 1 // we can't have more groups than the half of the pattern length re.group_max = pattern.len >> 1 // we can't have more groups than the half of the pattern length
re.group_stack = []int{len: re.group_max, init: -1}
re.group_data = []int{len: re.group_max, init: -1}
// compile the pattern // compile the pattern
re.compile_opt(pattern)? re.compile_opt(pattern)?

View File

@ -16,13 +16,19 @@ module regex
******************************************************************************/ ******************************************************************************/
// regex create a regex object from the query string // regex create a regex object from the query string
[deprecated] [deprecated]
pub fn regex(in_query string) (RE,int,int){ pub fn regex(pattern string) (RE,int,int){
mut re := RE{} // init regex
re.prog = []Token {len: in_query.len+1} mut re := regex.RE{}
re.cc = []CharClass{len: in_query.len+1} re.prog = []Token {len: pattern.len + 1} // max program length, can not be longer than the pattern
re.group_max_nested = 8 re.cc = []CharClass{len: pattern.len} // there can not be more char classes than the length of the pattern
re.group_csave_flag = false // enable continuous group saving
re.group_max_nested = 128 // set max 128 group nested
re.group_max = pattern.len >> 1 // we can't have more groups than the half of the pattern length
re_err,err_pos := re.compile(in_query) re.group_stack = []int{len: re.group_max, init: -1}
re.group_data = []int{len: re.group_max, init: -1}
re_err,err_pos := re.compile(pattern)
return re, re_err, err_pos return re, re_err, err_pos
} }
@ -38,10 +44,16 @@ pub fn new_regex_by_size(mult int) RE {
return impl_new_regex_by_size(mult) return impl_new_regex_by_size(mult)
} }
fn impl_new_regex_by_size(mult int) RE { fn impl_new_regex_by_size(mult int) RE {
mut re := RE{} // init regex
re.prog = []Token {len: max_code_len*mult} // max program length, default 256 instructions mut re := regex.RE{}
re.cc = []CharClass{len: max_code_len*mult} // char class list re.prog = []Token {len: max_code_len*mult} // max program length, can not be longer than the pattern
re.group_max_nested = 3*mult // max nested group re.cc = []CharClass{len: max_code_len*mult} // there can not be more char classes than the length of the pattern
re.group_csave_flag = false // enable continuous group saving
re.group_max_nested = 3*mult // set max nested groups to 3*mult
re.group_max = max_code_len*mult >> 1 // we can't have more groups than half of the max program length
re.group_stack = []int{len: re.group_max, init: -1}
re.group_data = []int{len: re.group_max, init: -1}
return re return re
} }