regex: speed optimization 2 (#7473)
parent
1bc317acab
commit
4069a4c55d
|
@ -311,11 +311,16 @@ pub mut:
|
||||||
group_max_nested int = 3 // max nested group
|
group_max_nested int = 3 // max nested group
|
||||||
group_max int = 8 // max allowed number of different groups
|
group_max int = 8 // max allowed number of different groups
|
||||||
|
|
||||||
|
state_list []StateObj
|
||||||
|
|
||||||
group_csave_flag bool // flag to enable continuous saving
|
group_csave_flag bool // flag to enable continuous saving
|
||||||
group_csave []int = []int{} // groups continuous save list
|
group_csave []int //= []int{} // groups continuous save list
|
||||||
|
|
||||||
group_map map[string]int // groups names map
|
group_map map[string]int // groups names map
|
||||||
|
|
||||||
|
group_stack []int
|
||||||
|
group_data []int
|
||||||
|
|
||||||
// flags
|
// flags
|
||||||
flag int // flag for optional parameters
|
flag int // flag for optional parameters
|
||||||
|
|
||||||
|
@ -1574,10 +1579,10 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
mut ist := rune(0) // actual instruction
|
mut ist := rune(0) // actual instruction
|
||||||
mut l_ist := rune(0) // last matched instruction
|
mut l_ist := rune(0) // last matched instruction
|
||||||
|
|
||||||
mut state_list := []StateObj{}
|
//mut state_list := []StateObj{}
|
||||||
|
|
||||||
mut group_stack := []int{len: re.group_max, init: -1}
|
//mut group_stack := []int{len: re.group_max, init: -1}
|
||||||
mut group_data := []int{len: re.group_max, init: -1}
|
//mut group_data := []int{len: re.group_max, init: -1}
|
||||||
|
|
||||||
//mut group_index := -1 // group id used to know how many groups are open
|
//mut group_index := -1 // group id used to know how many groups are open
|
||||||
|
|
||||||
|
@ -1699,13 +1704,13 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
//println("End text with open groups!")
|
//println("End text with open groups!")
|
||||||
// close the groups
|
// close the groups
|
||||||
for state.group_index >= 0 {
|
for state.group_index >= 0 {
|
||||||
tmp_pc := group_data[state.group_index]
|
tmp_pc := re.group_data[state.group_index]
|
||||||
re.prog[tmp_pc].group_rep++
|
re.prog[tmp_pc].group_rep++
|
||||||
//println("Closing group $state.group_index {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:${re.prog[tmp_pc].group_rep}")
|
//println("Closing group $state.group_index {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:${re.prog[tmp_pc].group_rep}")
|
||||||
|
|
||||||
if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min && re.prog[tmp_pc].group_id >= 0{
|
if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min && re.prog[tmp_pc].group_id >= 0{
|
||||||
start_i := group_stack[state.group_index]
|
start_i := re.group_stack[state.group_index]
|
||||||
group_stack[state.group_index]=-1
|
re.group_stack[state.group_index]=-1
|
||||||
|
|
||||||
// save group results
|
// save group results
|
||||||
g_index := re.prog[tmp_pc].group_id*2
|
g_index := re.prog[tmp_pc].group_id*2
|
||||||
|
@ -1836,9 +1841,9 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
|
|
||||||
// manage here dot char
|
// manage here dot char
|
||||||
|
|
||||||
if state_list.len > 0 {
|
if re.state_list.len > 0 {
|
||||||
//println("Here we are, with stop: state buffer: [${state_list.len}]")
|
//println("Here we are, with stop: state buffer: [${re.state_list.len}]")
|
||||||
state = state_list.pop()
|
state = re.state_list.pop()
|
||||||
|
|
||||||
state.match_flag = true
|
state.match_flag = true
|
||||||
l_ist = u32(ist_dot_char)
|
l_ist = u32(ist_dot_char)
|
||||||
|
@ -1877,8 +1882,8 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
// check GROUP start, no quantifier is checkd for this token!!
|
// check GROUP start, no quantifier is checkd for this token!!
|
||||||
else if ist == ist_group_start {
|
else if ist == ist_group_start {
|
||||||
state.group_index++
|
state.group_index++
|
||||||
group_data[state.group_index] = re.prog[state.pc].goto_pc // save where is ist_group_end, we will use it for escape
|
re.group_data[state.group_index] = re.prog[state.pc].goto_pc // save where is ist_group_end, we will use it for escape
|
||||||
group_stack[state.group_index] = state.i // index where we start to manage
|
re.group_stack[state.group_index] = state.i // index where we start to manage
|
||||||
//println("group_index $state.group_index rep ${re.prog[re.prog[state.pc].goto_pc].group_rep}")
|
//println("group_index $state.group_index rep ${re.prog[re.prog[state.pc].goto_pc].group_rep}")
|
||||||
|
|
||||||
m_state = .ist_next
|
m_state = .ist_next
|
||||||
|
@ -1894,8 +1899,8 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
|
|
||||||
//println("g.id: ${re.prog[state.pc].group_id} group_index: ${state.group_index}")
|
//println("g.id: ${re.prog[state.pc].group_id} group_index: ${state.group_index}")
|
||||||
if state.group_index >= 0 && re.prog[state.pc].group_id >= 0 {
|
if state.group_index >= 0 && re.prog[state.pc].group_id >= 0 {
|
||||||
start_i := group_stack[state.group_index]
|
start_i := re.group_stack[state.group_index]
|
||||||
//group_stack[state.group_index]=-1
|
//re.group_stack[state.group_index]=-1
|
||||||
|
|
||||||
// save group results
|
// save group results
|
||||||
g_index := re.prog[state.pc].group_id*2
|
g_index := re.prog[state.pc].group_id*2
|
||||||
|
@ -1960,7 +1965,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
// if we are done with max go on dot char are dedicated case!!
|
// if we are done with max go on dot char are dedicated case!!
|
||||||
if re.prog[state.pc].rep >= re.prog[state.pc].rep_max
|
if re.prog[state.pc].rep >= re.prog[state.pc].rep_max
|
||||||
{
|
{
|
||||||
state_list.pop()
|
re.state_list.pop()
|
||||||
m_state = .ist_next
|
m_state = .ist_next
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
@ -2005,7 +2010,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
// check if we must continue or pass to the next IST
|
// check if we must continue or pass to the next IST
|
||||||
if next_check_flag == true {
|
if next_check_flag == true {
|
||||||
//println("save the state!!")
|
//println("save the state!!")
|
||||||
state_list << StateObj {
|
re.state_list << StateObj {
|
||||||
group_index: state.group_index
|
group_index: state.group_index
|
||||||
match_flag: state.match_flag
|
match_flag: state.match_flag
|
||||||
match_index: state.match_index
|
match_index: state.match_index
|
||||||
|
@ -2016,7 +2021,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
last_dot_pc: state.pc
|
last_dot_pc: state.pc
|
||||||
}
|
}
|
||||||
m_state = .ist_quant_n
|
m_state = .ist_quant_n
|
||||||
//println("dot_char stack len: $state_list.len")
|
//println("dot_char stack len: ${re.state_list.len}")
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2136,7 +2141,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
tmp_pc := group_data[state.group_index] // PC to the end of the group token
|
tmp_pc := re.group_data[state.group_index] // PC to the end of the group token
|
||||||
rep := re.prog[tmp_pc].group_rep // use a temp variable
|
rep := re.prog[tmp_pc].group_rep // use a temp variable
|
||||||
re.prog[tmp_pc].group_rep = 0 // clear the repetitions
|
re.prog[tmp_pc].group_rep = 0 // clear the repetitions
|
||||||
|
|
||||||
|
@ -2145,7 +2150,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
if rep >= re.prog[tmp_pc].rep_min {
|
if rep >= re.prog[tmp_pc].rep_min {
|
||||||
//println("ist_quant_ng GROUP CLOSED OK group_index: $state.group_index")
|
//println("ist_quant_ng GROUP CLOSED OK group_index: $state.group_index")
|
||||||
|
|
||||||
state.i = group_stack[state.group_index]
|
state.i = re.group_stack[state.group_index]
|
||||||
state.pc = tmp_pc
|
state.pc = tmp_pc
|
||||||
state.group_index--
|
state.group_index--
|
||||||
m_state = .ist_next
|
m_state = .ist_next
|
||||||
|
@ -2154,7 +2159,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
else if re.prog[tmp_pc].next_is_or {
|
else if re.prog[tmp_pc].next_is_or {
|
||||||
//println("ist_quant_ng OR Negative branch")
|
//println("ist_quant_ng OR Negative branch")
|
||||||
|
|
||||||
state.i = group_stack[state.group_index]
|
state.i = re.group_stack[state.group_index]
|
||||||
state.pc = re.prog[tmp_pc+1].rep_min -1
|
state.pc = re.prog[tmp_pc+1].rep_min -1
|
||||||
state.group_index--
|
state.group_index--
|
||||||
m_state = .ist_next
|
m_state = .ist_next
|
||||||
|
@ -2207,7 +2212,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
//println(".ist_quant_pg")
|
//println(".ist_quant_pg")
|
||||||
mut tmp_pc := state.pc
|
mut tmp_pc := state.pc
|
||||||
if state.group_index >= 0 {
|
if state.group_index >= 0 {
|
||||||
tmp_pc = group_data[state.group_index]
|
tmp_pc = re.group_data[state.group_index]
|
||||||
}
|
}
|
||||||
|
|
||||||
rep := re.prog[tmp_pc].group_rep
|
rep := re.prog[tmp_pc].group_rep
|
||||||
|
@ -2372,7 +2377,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
|
|
||||||
//println("Skip last group")
|
//println("Skip last group")
|
||||||
return state.first_match,state.i
|
return state.first_match,state.i
|
||||||
//return state.first_match,group_stack[state.group_index--]
|
//return state.first_match,re.group_stack[state.group_index--]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//println("no_match_found, natural end")
|
//println("no_match_found, natural end")
|
||||||
|
|
|
@ -38,6 +38,9 @@ pub fn regex_opt(pattern string) ?RE {
|
||||||
re.group_max_nested = 128 // set max 128 group nested
|
re.group_max_nested = 128 // set max 128 group nested
|
||||||
re.group_max = pattern.len >> 1 // we can't have more groups than the half of the pattern legth
|
re.group_max = pattern.len >> 1 // we can't have more groups than the half of the pattern legth
|
||||||
|
|
||||||
|
re.group_stack = []int{len: re.group_max, init: -1}
|
||||||
|
re.group_data = []int{len: re.group_max, init: -1}
|
||||||
|
|
||||||
// compile the pattern
|
// compile the pattern
|
||||||
re.compile_opt(pattern)?
|
re.compile_opt(pattern)?
|
||||||
|
|
||||||
|
|
|
@ -16,13 +16,19 @@ module regex
|
||||||
******************************************************************************/
|
******************************************************************************/
|
||||||
// regex create a regex object from the query string
|
// regex create a regex object from the query string
|
||||||
[deprecated]
|
[deprecated]
|
||||||
pub fn regex(in_query string) (RE,int,int){
|
pub fn regex(pattern string) (RE,int,int){
|
||||||
mut re := RE{}
|
// init regex
|
||||||
re.prog = []Token {len: in_query.len+1}
|
mut re := regex.RE{}
|
||||||
re.cc = []CharClass{len: in_query.len+1}
|
re.prog = []Token {len: pattern.len + 1} // max program length, can not be longer then the pattern
|
||||||
re.group_max_nested = 8
|
re.cc = []CharClass{len: pattern.len} // can not be more char class the the length of the pattern
|
||||||
|
re.group_csave_flag = false // enable continuos group saving
|
||||||
|
re.group_max_nested = 128 // set max 128 group nested
|
||||||
|
re.group_max = pattern.len >> 1 // we can't have more groups than the half of the pattern legth
|
||||||
|
|
||||||
re_err,err_pos := re.compile(in_query)
|
re.group_stack = []int{len: re.group_max, init: -1}
|
||||||
|
re.group_data = []int{len: re.group_max, init: -1}
|
||||||
|
|
||||||
|
re_err,err_pos := re.compile(pattern)
|
||||||
return re, re_err, err_pos
|
return re, re_err, err_pos
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -38,10 +44,16 @@ pub fn new_regex_by_size(mult int) RE {
|
||||||
return impl_new_regex_by_size(mult)
|
return impl_new_regex_by_size(mult)
|
||||||
}
|
}
|
||||||
fn impl_new_regex_by_size(mult int) RE {
|
fn impl_new_regex_by_size(mult int) RE {
|
||||||
mut re := RE{}
|
// init regex
|
||||||
re.prog = []Token {len: max_code_len*mult} // max program length, default 256 istructions
|
mut re := regex.RE{}
|
||||||
re.cc = []CharClass{len: max_code_len*mult} // char class list
|
re.prog = []Token {len: max_code_len*mult} // max program length, can not be longer then the pattern
|
||||||
re.group_max_nested = 3*mult // max nested group
|
re.cc = []CharClass{len: max_code_len*mult} // can not be more char class the the length of the pattern
|
||||||
|
re.group_csave_flag = false // enable continuos group saving
|
||||||
|
re.group_max_nested = 3*mult // set max 128 group nested
|
||||||
|
re.group_max = max_code_len*mult >> 1 // we can't have more groups than the half of the pattern legth
|
||||||
|
|
||||||
|
re.group_stack = []int{len: re.group_max, init: -1}
|
||||||
|
re.group_data = []int{len: re.group_max, init: -1}
|
||||||
|
|
||||||
return re
|
return re
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue