From 4069a4c55d1f3e6940b97c8e0358c5efa2cef645 Mon Sep 17 00:00:00 2001
From: penguindark <57967770+penguindark@users.noreply.github.com>
Date: Tue, 22 Dec 2020 21:34:46 +0100
Subject: [PATCH] regex: speed optimization 2 (#7473)

---
 vlib/regex/regex.v      | 49 +++++++++++++++++++++++------------------
 vlib/regex/regex_opt.v  |  3 +++
 vlib/regex/regex_util.v | 32 ++++++++++++++++++---------
 3 files changed, 52 insertions(+), 32 deletions(-)

diff --git a/vlib/regex/regex.v b/vlib/regex/regex.v
index 60231f191f..caa7e3ab0c 100644
--- a/vlib/regex/regex.v
+++ b/vlib/regex/regex.v
@@ -311,11 +311,16 @@ pub mut:
 	group_max_nested  int  = 3         // max nested group
 	group_max         int  = 8         // max allowed number of different groups
 
+	state_list        []StateObj
+
 	group_csave_flag  bool             // flag to enable continuous saving
-	group_csave       []int = []int{}  // groups continuous save list
+	group_csave       []int //= []int{}  // groups continuous save list
 
 	group_map         map[string]int   // groups names map
 
+	group_stack       []int 
+	group_data        []int
+
 	// flags
 	flag              int              // flag for optional parameters
 
@@ -1574,10 +1579,10 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 	mut ist   := rune(0)              // actual instruction
 	mut l_ist := rune(0)              // last matched instruction
 
-	mut state_list := []StateObj{}
+	//mut state_list := []StateObj{}
 
-	mut group_stack := []int{len: re.group_max, init: -1}
-	mut group_data  := []int{len: re.group_max, init: -1}
+	//mut group_stack := []int{len: re.group_max, init: -1}
+	//mut group_data  := []int{len: re.group_max, init: -1}
 
 	//mut group_index := -1           // group id used to know how many groups are open
 
@@ -1699,13 +1704,13 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 				//println("End text with open groups!")
 				// close the groups
 				for state.group_index >= 0 {
-					tmp_pc := group_data[state.group_index]
+					tmp_pc := re.group_data[state.group_index]
 					re.prog[tmp_pc].group_rep++
 					//println("Closing group $state.group_index {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:${re.prog[tmp_pc].group_rep}")
 
 					if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min && re.prog[tmp_pc].group_id >= 0{
-						start_i   := group_stack[state.group_index]
-	 					group_stack[state.group_index]=-1
+						start_i   := re.group_stack[state.group_index]
+	 					re.group_stack[state.group_index]=-1
 
 	 					// save group results
 						g_index := re.prog[tmp_pc].group_id*2
@@ -1836,9 +1841,9 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 
 			// manage here dot char
 
-			if state_list.len > 0 {
-				//println("Here we are, with stop: state buffer: [${state_list.len}]")
-				state = state_list.pop()
+			if re.state_list.len > 0 {
+				//println("Here we are, with stop: state buffer: [${re.state_list.len}]")
+				state = re.state_list.pop()
 
 				state.match_flag = true
 				l_ist = u32(ist_dot_char)
@@ -1877,8 +1882,8 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 			// check GROUP start, no quantifier is checkd for this token!!
 			else if ist == ist_group_start {
 				state.group_index++
-				group_data[state.group_index]  = re.prog[state.pc].goto_pc   // save where is ist_group_end, we will use it for escape
-				group_stack[state.group_index] = state.i                     // index where we start to manage
+				re.group_data[state.group_index]  = re.prog[state.pc].goto_pc   // save where is ist_group_end, we will use it for escape
+				re.group_stack[state.group_index] = state.i                     // index where we start to manage
 				//println("group_index $state.group_index rep ${re.prog[re.prog[state.pc].goto_pc].group_rep}")
 
 				m_state = .ist_next
@@ -1894,8 +1899,8 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 
 					//println("g.id: ${re.prog[state.pc].group_id} group_index: ${state.group_index}")
 					if state.group_index >= 0 && re.prog[state.pc].group_id >= 0 {
-	 					start_i   := group_stack[state.group_index]
-	 					//group_stack[state.group_index]=-1
+	 					start_i   := re.group_stack[state.group_index]
+	 					//re.group_stack[state.group_index]=-1
 
 	 					// save group results
 						g_index := re.prog[state.pc].group_id*2
@@ -1960,7 +1965,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 				// if we are done with max go on dot char are dedicated case!!
 				if	re.prog[state.pc].rep >= re.prog[state.pc].rep_max 
 				{
-					state_list.pop()
+					re.state_list.pop()
 					m_state = .ist_next
 					continue
 				}
@@ -2005,7 +2010,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 				// check if we must continue or pass to the next IST
 				if next_check_flag == true {
 					//println("save the state!!")
-					state_list << StateObj {
+					re.state_list << StateObj {
 						group_index: state.group_index
 						match_flag:  state.match_flag
 						match_index: state.match_index
@@ -2016,7 +2021,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 						last_dot_pc: state.pc
 					}
 					m_state = .ist_quant_n
-					//println("dot_char stack len: $state_list.len")
+					//println("dot_char stack len: ${re.state_list.len}")
 					continue
 				}
 
@@ -2136,7 +2141,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 				continue
 			}
 
-			tmp_pc := group_data[state.group_index]    // PC to the end of the group token
+			tmp_pc := re.group_data[state.group_index]    // PC to the end of the group token
 			rep    := re.prog[tmp_pc].group_rep  // use a temp variable
 			re.prog[tmp_pc].group_rep = 0        // clear the repetitions
 
@@ -2145,7 +2150,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 			if rep >= re.prog[tmp_pc].rep_min {
 				//println("ist_quant_ng GROUP CLOSED OK group_index: $state.group_index")
 
-				state.i = group_stack[state.group_index]
+				state.i = re.group_stack[state.group_index]
 				state.pc = tmp_pc
 				state.group_index--
 				m_state = .ist_next
@@ -2154,7 +2159,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 			else if re.prog[tmp_pc].next_is_or {
 				//println("ist_quant_ng OR Negative branch")
 
-				state.i = group_stack[state.group_index]
+				state.i = re.group_stack[state.group_index]
 				state.pc = re.prog[tmp_pc+1].rep_min -1
 				state.group_index--
 				m_state = .ist_next
@@ -2207,7 +2212,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 			//println(".ist_quant_pg")
 			mut tmp_pc := state.pc
 			if state.group_index >= 0 {
-				tmp_pc = group_data[state.group_index]
+				tmp_pc = re.group_data[state.group_index]
 			}
 
 			rep := re.prog[tmp_pc].group_rep
@@ -2372,7 +2377,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 
 			//println("Skip last group")
 			return state.first_match,state.i
-			//return state.first_match,group_stack[state.group_index--]
+			//return state.first_match,re.group_stack[state.group_index--]
 		}
 	}
 	//println("no_match_found, natural end")
diff --git a/vlib/regex/regex_opt.v b/vlib/regex/regex_opt.v
index ffe6ed5cf2..8746f993bb 100644
--- a/vlib/regex/regex_opt.v
+++ b/vlib/regex/regex_opt.v
@@ -38,6 +38,9 @@ pub fn regex_opt(pattern string) ?RE {
     re.group_max_nested = 128                   // set max 128 group nested
     re.group_max        = pattern.len >> 1      // we can't have more groups than the half of the pattern legth
 
+    re.group_stack = []int{len: re.group_max, init: -1}
+	re.group_data  = []int{len: re.group_max, init: -1}
+
     // compile the pattern
     re.compile_opt(pattern)?
 
diff --git a/vlib/regex/regex_util.v b/vlib/regex/regex_util.v
index b743009076..3572d02f19 100644
--- a/vlib/regex/regex_util.v
+++ b/vlib/regex/regex_util.v
@@ -16,13 +16,19 @@ module regex
 ******************************************************************************/
 // regex create a regex object from the query string
 [deprecated]
-pub fn regex(in_query string) (RE,int,int){
-	mut re := RE{}
-	re.prog = []Token    {len: in_query.len+1}
-	re.cc   = []CharClass{len: in_query.len+1}
-	re.group_max_nested = 8
+pub fn regex(pattern string) (RE,int,int){
+	// init regex
+    mut re := regex.RE{}
+    re.prog = []Token    {len: pattern.len + 1} // max program length, can not be longer then the pattern
+    re.cc   = []CharClass{len: pattern.len}     // can not be more char class the the length of the pattern
+    re.group_csave_flag = false                 // enable continuos group saving
+    re.group_max_nested = 128                   // set max 128 group nested
+    re.group_max        = pattern.len >> 1      // we can't have more groups than the half of the pattern legth
 
-	re_err,err_pos := re.compile(in_query)
+    re.group_stack = []int{len: re.group_max, init: -1}
+	re.group_data  = []int{len: re.group_max, init: -1}
+
+	re_err,err_pos := re.compile(pattern)
 	return re, re_err, err_pos
 }
 
@@ -38,10 +44,16 @@ pub fn new_regex_by_size(mult int) RE {
 	return impl_new_regex_by_size(mult)
 }
 fn impl_new_regex_by_size(mult int) RE {
-	mut re := RE{}
-	re.prog = []Token    {len: max_code_len*mult}       // max program length, default 256 istructions
-	re.cc   = []CharClass{len: max_code_len*mult}       // char class list
-	re.group_max_nested = 3*mult                        // max nested group
+	// init regex
+    mut re := regex.RE{}
+    re.prog = []Token    {len: max_code_len*mult} // max program length, can not be longer then the pattern
+    re.cc   = []CharClass{len: max_code_len*mult} // can not be more char class the the length of the pattern
+    re.group_csave_flag = false                   // enable continuos group saving
+    re.group_max_nested = 3*mult                  // set max 128 group nested
+    re.group_max        = max_code_len*mult >> 1  // we can't have more groups than the half of the pattern legth
+
+    re.group_stack = []int{len: re.group_max, init: -1}
+	re.group_data  = []int{len: re.group_max, init: -1}
 
 	return re
 }