From 4f986ccac48c96d2785e8d4f7ff7b9fb55bc034e Mon Sep 17 00:00:00 2001 From: penguindark <57967770+penguindark@users.noreply.github.com> Date: Mon, 14 Dec 2020 14:02:13 +0100 Subject: [PATCH] regex: refactoring of continuous capturing groups (#7310) --- vlib/regex/README.md | 10 +++--- vlib/regex/regex.v | 74 +++++++++++++++++++++++++---------------- vlib/regex/regex_test.v | 3 +- 3 files changed, 52 insertions(+), 35 deletions(-) diff --git a/vlib/regex/README.md b/vlib/regex/README.md index 834a75a305..583f653448 100644 --- a/vlib/regex/README.md +++ b/vlib/regex/README.md @@ -236,8 +236,8 @@ this is possible initializing the saving array field in `RE` struct: `group_csav This feature allow to collect data in a continuous way. In the example we pass a text followed by a integer list that we want collect. -To achieve this task we can use the continuous saving of the group -that save each captured group in a array that we set with: `re.group_csave = [-1].repeat(3*20+1)`. +To achieve this task we can use the continuous saving of the group +by enabling the right flag: `re.group_csave_flag = true`. The array will be filled with the following logic: @@ -250,14 +250,14 @@ The array will be filled with the following logic: The regex save until finish or found that the array have no space. If the space ends no error is raised, further records will not be saved. 
-```v oksyntax +```v ignore fn example2() { test_regex() text := 'tst: 01,23,45 ,56, 78' query := r'.*:(\s*\d+[\s,]*)+' mut re := new() or { panic(err) } // re.debug = 2 - re.group_csave = [-1].repeat(3 * 20 + 1) // we expect max 20 records + re.group_csave_flag = true // enable continuous capture re.compile_opt(query) or { println(err) return @@ -330,7 +330,7 @@ fn main() { query := r'(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+' mut re := new() re.debug = 2 - // must provide an array of the right size if want the continuos saving of the groups + // must provide an array of the right size if you want the continuous saving of the groups re.group_csave = [-1].repeat(3 * 20 + 1) re.compile_opt(query) or { println(err) diff --git a/vlib/regex/regex.v b/vlib/regex/regex.v index c54156094f..8788ea0851 100644 --- a/vlib/regex/regex.v +++ b/vlib/regex/regex.v @@ -314,8 +314,8 @@ pub mut: group_max_nested int = 3 // max nested group group_max int = 8 // max allowed number of different groups - group_csave []int = []int{} // groups continuous save array - group_csave_index int = -1 // groups continuous save index + group_csave_flag bool // flag to enable continuous saving + group_csave []int = []int{} // groups continuous save list group_map map[string]int // groups names map @@ -344,10 +344,7 @@ fn (mut re RE) reset(){ re.state_stack_index = -1 // reset group_csave - if re.group_csave.len > 0 { - re.group_csave_index = 1 - re.group_csave[0] = 0 // reset the capture count - } + re.group_csave = []int{} } // reset for search mode fail @@ -1482,6 +1479,45 @@ pub fn (re RE) get_query() string { /* +Groups saving utilities + +*/ +[inline] +fn (mut re RE) group_continuous_save(g_index int) { + if re.group_csave_flag == true { + // continuous save, save until we have space + + // init the first element as counter + if re.group_csave.len == 0 { + re.group_csave << 0 + } + + gi := g_index >> 1 + start := re.groups[g_index] + end := re.groups[g_index+1] + + // check if we are simply increasing 
the size of the found group + if re.group_csave.len >=4 && + gi == re.group_csave[re.group_csave.len - 3] && + start == re.group_csave[re.group_csave.len - 2] + { + re.group_csave[re.group_csave.len - 1] = end + return + } + + // otherwise append a new group to the list + + // increment counter + re.group_csave[0]++ + // save the record + re.group_csave << (g_index >> 1) // group id + re.group_csave << re.groups[g_index] // start + re.group_csave << re.groups[g_index+1] // end + } +} + +/* + Matching */ @@ -1684,18 +1720,8 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { } // continuous save, save until we have space - if re.group_csave_index > 0 { - // check if we have space to save the record - if (re.group_csave_index + 3) < re.group_csave.len { - // incrment counter - re.group_csave[0]++ - // save the record - re.group_csave[re.group_csave_index++] = g_index >> 1 // group id - re.group_csave[re.group_csave_index++] = re.groups[g_index] // start - re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end - } - } - + re.group_continuous_save(g_index) + } group_index-- @@ -1879,17 +1905,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { //println("GROUP ${re.prog[pc].group_id} END [${re.groups[g_index]}, ${re.groups[g_index+1]}]") // continuous save, save until we have space - if re.group_csave_index > 0 { - // check if we have space to save the record - if (re.group_csave_index + 3) < re.group_csave.len { - // incrment counter - re.group_csave[0]++ - // save the record - re.group_csave[re.group_csave_index++] = g_index >> 1 // group id - re.group_csave[re.group_csave_index++] = re.groups[g_index] // start - re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end - } - } + re.group_continuous_save(g_index) } re.prog[pc].group_rep++ // increase repetitions diff --git a/vlib/regex/regex_test.v b/vlib/regex/regex_test.v index 25aea6a47e..b58f7d2196 100644 --- a/vlib/regex/regex_test.v 
+++ b/vlib/regex/regex_test.v @@ -230,7 +230,8 @@ fn test_regex(){ } if to.cgn.len > 0 { - re.group_csave = [-1].repeat(3*20+1) + re.group_csave_flag = true + //re.group_csave = [-1].repeat(3*20+1) if debug { println("continuous save")} } else { if debug { println("NO continuous save")}