regex: speed optimization (#7471)

pull/7473/head^2
penguindark 2020-12-22 17:42:32 +01:00 committed by GitHub
parent 97855eca93
commit b16052db95
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 23 additions and 11 deletions

View File

@ -88,6 +88,7 @@ fn utf8util_char_len(b byte) int {
// get_char gets a char from position i and returns a u32 with the unicode code // get_char gets a char from position i and returns a u32 with the unicode code
[inline] [inline]
[direct_array_access]
fn (re RE) get_char(in_txt string, i int) (u32,int) { fn (re RE) get_char(in_txt string, i int) (u32,int) {
ini := unsafe {in_txt.str[i]} ini := unsafe {in_txt.str[i]}
// ascii 8 bit // ascii 8 bit
@ -107,6 +108,7 @@ fn (re RE) get_char(in_txt string, i int) (u32,int) {
// get_charb gets a char from position i and returns a u32 with the unicode code // get_charb gets a char from position i and returns a u32 with the unicode code
[inline] [inline]
[direct_array_access]
fn (re RE) get_charb(in_txt byteptr, i int) (u32,int) { fn (re RE) get_charb(in_txt byteptr, i int) (u32,int) {
// ascii 8 bit // ascii 8 bit
if (re.flag & f_bin) !=0 || unsafe {in_txt[i]} & 0x80 == 0 { if (re.flag & f_bin) !=0 || unsafe {in_txt[i]} & 0x80 == 0 {
@ -297,6 +299,7 @@ pub
struct RE { struct RE {
pub mut: pub mut:
prog []Token prog []Token
prog_len int // regex program len
// char classes storage // char classes storage
cc []CharClass // char class list cc []CharClass // char class list
@ -323,12 +326,13 @@ pub mut:
} }
// Reset RE object // Reset RE object
//[inline] [inline]
[direct_array_access]
fn (mut re RE) reset(){ fn (mut re RE) reset(){
re.cc_index = 0 re.cc_index = 0
mut i := 0 mut i := 0
for i < re.prog.len { for i < re.prog_len {
re.prog[i].group_rep = 0 // clear repetition of the group re.prog[i].group_rep = 0 // clear repetition of the group
re.prog[i].rep = 0 // clear repetition of the token re.prog[i].rep = 0 // clear repetition of the token
i++ i++
@ -340,14 +344,18 @@ fn (mut re RE) reset(){
} }
// reset group_csave // reset group_csave
re.group_csave = []int{} if re.group_csave_flag == true {
re.group_csave.clear() // = []int{}
}
} }
// reset for search mode fail // reset for search mode fail
// gcc bug: don't use [inline] here, or it runs 5 times slower // gcc bug: don't use [inline] here, or it runs 5 times slower
//[inline]
[direct_array_access]
fn (mut re RE) reset_src(){ fn (mut re RE) reset_src(){
mut i := 0 mut i := 0
for i < re.prog.len { for i < re.prog_len {
re.prog[i].group_rep = 0 // clear repetition of the group re.prog[i].group_rep = 0 // clear repetition of the group
re.prog[i].rep = 0 // clear repetition of the token re.prog[i].rep = 0 // clear repetition of the token
i++ i++
@ -1155,6 +1163,7 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {
// add end of the program // add end of the program
re.prog[pc].ist = ist_prog_end re.prog[pc].ist = ist_prog_end
re.prog_len = pc
// check for unbalanced groups // check for unbalanced groups
if group_stack_index != -1 { if group_stack_index != -1 {
@ -1467,7 +1476,7 @@ pub fn (re RE) get_query() string {
* Groups saving utilities * Groups saving utilities
* *
******************************************************************************/ ******************************************************************************/
[inline] [direct_array_access]
fn (mut re RE) group_continuous_save(g_index int) { fn (mut re RE) group_continuous_save(g_index int) {
if re.group_csave_flag == true { if re.group_csave_flag == true {
// continuous save, save until we have space // continuous save, save until we have space
@ -1550,6 +1559,7 @@ pub mut:
last_dot_pc int = -1 // last dot char pc last_dot_pc int = -1 // last dot char pc
} }
[direct_array_access]
pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
// result status // result status
mut result := no_match_found // function return mut result := no_match_found // function return
@ -1771,7 +1781,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
} }
// ist_next, next instruction resetting its state // ist_next, next instruction resetting its state
if m_state == .ist_next { else if m_state == .ist_next {
state.pc = state.pc + 1 state.pc = state.pc + 1
re.prog[state.pc].reset() re.prog[state.pc].reset()
// check if we are in the program bounds // check if we are in the program bounds
@ -1784,7 +1794,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
} }
// ist_next_ks, next instruction keeping its state // ist_next_ks, next instruction keeping its state
if m_state == .ist_next_ks { else if m_state == .ist_next_ks {
state.pc = state.pc + 1 state.pc = state.pc + 1
// check if we are in the program bounds // check if we are in the program bounds
if state.pc < 0 || state.pc > re.prog.len { if state.pc < 0 || state.pc > re.prog.len {
@ -1805,7 +1815,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
} }
// check if stop // check if stop
if m_state == .stop { else if m_state == .stop {
// we are in search mode, don't exit until the end // we are in search mode, don't exit until the end
if ((re.flag & f_src) != 0) && (ist != ist_prog_end) { if ((re.flag & f_src) != 0) && (ist != ist_prog_end) {
@ -1849,7 +1859,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
} }
// ist_load // ist_load
if m_state == .ist_load { else if m_state == .ist_load {
// program end // program end
if ist == ist_prog_end { if ist == ist_prog_end {
@ -2116,7 +2126,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
* Quantifier management * Quantifier management
***********************************/ ***********************************/
// ist_quant_ng => quantifier negative test on group // ist_quant_ng => quantifier negative test on group
if m_state == .ist_quant_ng { else if m_state == .ist_quant_ng {
// we are finished here // we are finished here
if state.group_index < 0 { if state.group_index < 0 {
@ -2378,7 +2388,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
// //
// Matchers // Matchers
// //
[direct_array_access]
pub fn (mut re RE) match_string(in_txt string) (int,int) { pub fn (mut re RE) match_string(in_txt string) (int,int) {
start, mut end := re.match_base(in_txt.str, in_txt.len + 1) start, mut end := re.match_base(in_txt.str, in_txt.len + 1)
@ -2406,6 +2416,7 @@ pub fn (mut re RE) match_string(in_txt string) (int,int) {
// //
// find try to find the first match in the input string // find try to find the first match in the input string
[direct_array_access]
pub fn (mut re RE) find(in_txt string) (int,int) { pub fn (mut re RE) find(in_txt string) (int,int) {
old_flag := re.flag old_flag := re.flag
@ -2424,6 +2435,7 @@ pub fn (mut re RE) find(in_txt string) (int,int) {
} }
// find all the non overlapping occurrences of the match pattern // find all the non overlapping occurrences of the match pattern
[direct_array_access]
pub fn (mut re RE) find_all(in_txt string) []int { pub fn (mut re RE) find_all(in_txt string) []int {
mut i := 0 mut i := 0
mut res := []int{} mut res := []int{}