regex: refactoring, documentation, examples (#7418)

2020-12-20 04:52:02 +01:00 · 2020-12-20 04:52:02 +01:00 · b29bcb3fbe
parent 8278af4ee8
commit b29bcb3fbe
4 changed files with 279 additions and 158 deletions
--- a/vlib/regex/README.md
+++ b/vlib/regex/README.md
@ -471,10 +471,23 @@ pub fn regex_opt(in_query string) ?RE
 // new creates a RE of small size, usually sufficient for ordinary use
 pub fn new() RE
 // new_by_size creates a RE of large size; mult specifies the scale factor of the memory that will be allocated
 pub fn new_by_size(mult int) RE
 ```
-After a base initializer is used, the regex expression must be compiled with:
+#### **Custom initialization**
 For particular needs it is possible to initialize a fully customized regex:
 ```v ignore
 // init custom regex
 mut re := regex.RE{}
 re.prog = []Token    {len: pattern.len + 1} // max program length, cannot be longer than the pattern
 re.cc   = []CharClass{len: pattern.len}     // there cannot be more char classes than the length of the pattern
 re.group_csave_flag = false          // set to true to enable continuous group saving if needed
 re.group_max_nested = 128            // allow at most 128 nested groups
 re.group_max        = pattern.len>>1 // there cannot be more groups than half of the pattern length
 ```
 ### Compiling
 After an initializer is used, the regex expression must be compiled with:
 ```v ignore
 // compile compiles the REgex returning an error if the compilation fails
 pub fn (re mut RE) compile_opt(in_txt string) ?
@ -500,11 +513,38 @@ pub fn (re mut RE) replace(in_txt string, repl string) string
 ## Find and Replace
 The following find and replace functions are available:
 #### Find functions
 ```v ignore
 // find try to find the first match in the input string, return start and end index if found else start is -1
 pub fn (re mut RE) find(in_txt string) (int,int)
 // find_all find all the "non overlapping" occurrences of the matching pattern
 // return a list of start end indexes like: [3,4,6,8] 
 // the matches are [3,4] and [6,8]
 pub fn (re mut RE) find_all(in_txt string) []int
 ```
 #### Replace functions
 ```v ignore
 // replace return a string where the matches are replaced with the replace string, only non overlapped matches are used
 pub fn (re mut RE) replace(in_txt string, repl string) string
 ```
 #### Custom replace function
 For complex find and replace operations the function `replace_by_fn` is available.
 `replace_by_fn` uses a custom replace function, making customizations possible.
 **The custom function is called for every non overlapped find.**
 The custom function must be of the type:
 ```v ignore
 // re RE struct
 // in_txt all the text passed to the regex expression
 // the match is: in_txt[start..end]
 fn (re RE, in_txt string, start int, end int) string
 ```
@ -671,7 +711,7 @@ re.log_func = custom_print
 ## Example code
-Here there is a simple code to perform some basically match of strings
+Here is an example that performs some basic string matching:
 ```v ignore
 import regex
@ -698,5 +738,63 @@ fn main(){
    }
 }
 ```
 Here is an example of full customization of the regex environment creation:
 ```v ignore
 import regex
-more example code is available in the test code for the `regex` module `vlib\regex\regex_test.v`.
+fn main(){
    txt   := "today John is gone to his house with Jack and Marie."
    query := r"(?:(?P<word>\A\w+)|(?:\a\w+)[\s.]?)+"
    // init regex
    mut re := regex.RE{}
    re.prog = []regex.Token    {len: query.len + 1} // max program length, cannot be longer than the query
    re.cc   = []regex.CharClass{len: query.len}     // there cannot be more char classes than the length of the query
    re.group_csave_flag = true         // enable continuous group saving
    re.group_max_nested = 128          // allow at most 128 nested groups
    re.group_max        = query.len>>1 // there cannot be more groups than half of the query length
    // compile the query
    re.compile_opt(query) or { panic(err) }
    start, end := re.match_string(txt)
    if start >= 0 {
        println("Match ($start, $end) => [${txt[start..end]}]")
    } else {
        println("No Match")
    }
    // show results for continuous group saving
    if re.group_csave_flag == true && start >= 0 && re.group_csave.len > 0{
        println("cg: $re.group_csave")
        // records are read as (group id, start, end) triples starting at index 1;
        // group_csave[0]*3 bounds the walk
        mut cs_i := 1
        for cs_i < re.group_csave[0]*3 {
            g_id := re.group_csave[cs_i]
            st   := re.group_csave[cs_i+1]
            en   := re.group_csave[cs_i+2]
            println("cg[$g_id] $st $en:[${txt[st..en]}]")
            cs_i += 3
        }
    }
    // show results for captured groups
    if start >= 0 {
        println("Match ($start, $end) => [${txt[start..end]}]")
        // groups addressed by numeric id
        for g_index := 0; g_index < re.group_count ; g_index++ {
            println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \
            bounds: ${re.get_group_bounds_by_id(g_index)}")
        }
        // groups addressed by name
        for name in re.group_map.keys() {
            println("group:'$name' \t=> [${re.get_group_by_name(txt, name)}] \
            bounds: ${re.get_group_bounds_by_name(name)}")
        }
    } else {
        println("No Match")
    }
 }
 ```
 More example code is available in the test code for the `regex` module: `vlib\regex\regex_test.v`.
--- a/vlib/regex/regex.v
+++ b/vlib/regex/regex.v
@ -266,11 +266,11 @@ fn (mut tok Token) reset() {
 	tok.rep = 0
 }
-/*
+/******************************************************************************
-
+*
-Regex struct
+* Regex struct
-
+*
-*/
+******************************************************************************/
 pub const (
 	f_nl  = 0x00000001  // end the match when find a new line symbol
 	f_ms  = 0x00000002  // match true only if the match is at the start of the string
@ -354,11 +354,11 @@ fn (mut re RE) reset_src(){
 	}
 }
-/*
+/******************************************************************************
-
+*
-Backslashes chars
+* Backslashes chars
-
+*
-*/
+******************************************************************************/
 struct BslsStruct {
 	ch rune                   // meta char
 	validator FnValidator    // validator function pointer
@ -430,11 +430,11 @@ fn (re RE) parse_bsls(in_txt string, in_i int) (int,int){
 	return err_syntax_error, i
 }
-/*
+/******************************************************************************
-
+*
-Char class
+* Char class
-
+*
-*/
+******************************************************************************/
 const(
 	cc_null = 0    // empty cc token
 	cc_char = 1    // simple char: a
@ -653,11 +653,11 @@ fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, rune) {
 	return err_syntax_error,0,u32(0)
 }
-/*
+/******************************************************************************
-
+*
-Re Compiler
+* Re Compiler
-
+*
-*/
+******************************************************************************/
 //
 // Quantifier
 //
@ -1462,11 +1462,11 @@ pub fn (re RE) get_query() string {
 	return res.str()
 }
-/*
+/******************************************************************************
-
+*
-Groups saving utilities
+* Groups saving utilities
-
+*
-*/
+******************************************************************************/
 [inline]
 fn (mut re RE) group_continuous_save(g_index int) {
 	if re.group_csave_flag == true {
@ -1500,12 +1500,12 @@ fn (mut re RE) group_continuous_save(g_index int) {
 		re.group_csave << re.groups[g_index+1]  // end
 	}
 }
 /*
-Matching
+/******************************************************************************
-
+*
-*/
+* Matching
 *
 ******************************************************************************/					
 enum Match_state{
 	start = 0
 	stop
@ -2001,6 +2001,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 						last_dot_pc: state.pc
 					}
 					m_state = .ist_quant_n
 					//println("dot_char stack len: $state_list.len")
 					continue
 				}
@ -2363,47 +2364,11 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 	return no_match_found, 0
 }
-/*
+/******************************************************************************
-
+*
-Public functions
+* Public functions
-
+*
-*/
+******************************************************************************/	
 //
 // Inits
 //
 // regex builds a RE whose storage is sized on the query string and compiles the query;
 // it returns the RE followed by the two values returned by `compile`
 [deprecated]
 pub fn regex(in_query string) (RE,int,int){
 	storage_len := in_query.len+1
 	mut re := RE{}
 	re.group_max_nested = 8
 	re.prog = []Token    {len: storage_len}
 	re.cc   = []CharClass{len: storage_len}
 	compile_res, compile_pos := re.compile(in_query)
 	return re, compile_res, compile_pos
 }
 // new_regex creates a RE of small size (scale factor 1), usually sufficient for ordinary use
 [deprecated]
 pub fn new_regex() RE {
 	return impl_new_regex_by_size(1)
 }
 // new_regex_by_size creates a RE of large size; mult specifies the scale factor of the memory that will be allocated
 [deprecated]
 pub fn new_regex_by_size(mult int) RE {
 	return impl_new_regex_by_size(mult)
 }
 // impl_new_regex_by_size returns a RE whose program and char class lists hold
 // max_code_len*mult entries and which allows 3*mult nested groups
 fn impl_new_regex_by_size(mult int) RE {
 	storage_len := max_code_len*mult
 	mut re := RE{}
 	re.group_max_nested = 3*mult                 // max nested groups
 	re.cc   = []CharClass{len: storage_len}      // char class list
 	re.prog = []Token    {len: storage_len}      // max program length
 	return re
 }
 //
 // Matchers
@ -2538,82 +2503,3 @@ pub fn (mut re RE) replace_by_fn(in_txt string, repl_fn FnReplace) string {
 	return res
 }
 /*
 Utilities
 */
 // get_group_bounds_by_name returns the start and end saved for the group with the given name,
 // or -1, -1 when the name is not present in the group map
 pub fn (re RE) get_group_bounds_by_name(group_name string) (int, int) {
 	if group_name !in re.group_map {
 		return -1, -1
 	}
 	// the map value minus one selects the (start, end) pair inside re.groups
 	base := (re.group_map[group_name]-1) * 2
 	return re.groups[base], re.groups[base + 1]
 }
 // get_group_by_name returns the text captured by the group with the given name,
 // or "" when the name is unknown or the group has no usable bounds
 pub fn (re RE) get_group_by_name(in_txt string, group_name string) string {
 	if group_name in re.group_map {
 		tmp_index := re.group_map[group_name]-1
 		start     := re.groups[tmp_index * 2]
 		end       := re.groups[tmp_index * 2 + 1]
 		// negative or empty bounds cannot be used to slice in_txt
 		if start >= 0 && end > start {
 			return in_txt[start..end]
 		}
 	}
 	return ""
 }
 // get_group_by_id returns the text captured by the group with the given id,
 // or "" when the id is out of range or the group has no usable bounds
 pub fn (re RE) get_group_by_id(in_txt string, group_id int) string {
 	// a negative id would produce a negative index into re.groups
 	if group_id >= 0 && group_id < (re.groups.len >> 1) {
 		index := group_id << 1
 		start := re.groups[index]
 		end   := re.groups[index + 1]
 		// negative or empty bounds cannot be used to slice in_txt
 		if start >= 0 && end > start {
 			return in_txt[start..end]
 		}
 	}
 	return ""
 }
 // get_group_bounds_by_id returns the start and end saved for the group with the given id,
 // or -1, -1 when the id is out of range
 pub fn (re RE) get_group_bounds_by_id(group_id int) (int,int) {
 	// a negative id would produce a negative index into re.groups
 	if group_id >= 0 && group_id < (re.groups.len >> 1) {
 		index := group_id << 1
 		// start is stored at index, end at index + 1
 		return re.groups[index], re.groups[index + 1]
 	}
 	return -1, -1
 }
 // Re_group holds the start and end of a group; both fields default to -1
 pub
 struct Re_group {
 pub:
 	start int = -1
 	end   int = -1
 }
 // get_group_list returns one Re_group per (start, end) pair stored in re.groups;
 // a pair is copied into the result only when start >= 0 and end > start
 pub fn (re RE) get_group_list() []Re_group {
 	mut res := []Re_group{len: re.groups.len >> 1}
 	mut gi := 0
 	//println("len: ${re.groups.len} groups: ${re.groups}")
 	// walk re.groups two entries at a time: gi is the start slot, gi+1 the end slot
 	for gi < re.groups.len {
 		if re.groups[gi] >= 0 {
 			txt_st := re.groups[gi]
            txt_en := re.groups[gi+1]
            //println("#${gi/2} start: ${re.groups[gi]} end: ${re.groups[gi + 1]} ")
            if txt_st >= 0 && txt_en > txt_st {
 				tmp := Re_group{ start: re.groups[gi], end: re.groups[gi + 1]}
 				//println(tmp)
 				res[gi >> 1] = tmp
 			} else {
 				// empty or inverted bounds: store a default Re_group
 				res[gi >> 1] = Re_group{}
 			}
 		}
 		gi += 2
 	}
 	return res
 }
--- a/vlib/regex/regex_opt.v
+++ b/vlib/regex/regex_opt.v
@ -17,18 +17,29 @@ pub fn (mut re RE) compile_opt(pattern string) ? {
 }
 // new creates a RE of small size (scale factor 1), usually sufficient for ordinary use
 [deprecated]
 pub fn new() RE {
 	return impl_new_regex_by_size(1)
 }
 // new_by_size creates a RE of large size; mult specifies the scale factor of the memory that will be allocated
 [deprecated]
 pub fn new_by_size(mult int) RE {
 	return impl_new_regex_by_size(mult)
 }
 // regex_opt creates a new RE sized on the pattern string and compiles the pattern,
 // propagating the error of compile_opt if the compilation fails
 pub fn regex_opt(pattern string) ?RE {
-	mut re := new()
+	// init regex
-	re.compile_opt(pattern)?
+    mut re := regex.RE{}
-	return re
+    re.prog = []Token    {len: pattern.len + 1} // max program length, cannot be longer than the pattern
    re.cc   = []CharClass{len: pattern.len}     // there cannot be more char classes than the length of the pattern
    re.group_csave_flag = false                 // continuous group saving is disabled
    re.group_max_nested = 128                   // allow at most 128 nested groups
    re.group_max        = pattern.len >> 1      // there cannot be more groups than half of the pattern length
    // compile the pattern
    re.compile_opt(pattern)?
    return re
 }
--- a/vlib/regex/regex_util.v
+++ b/vlib/regex/regex_util.v
@ -0,0 +1,126 @@
 /*
 regex 1.0 alpha
 Copyright (c) 2019-2020 Dario Deledda. All rights reserved.
 Use of this source code is governed by an MIT license
 that can be found in the LICENSE file.
 */
 module regex
 /******************************************************************************
 *
 * Inits
 *
 ******************************************************************************/
 // regex builds a RE whose storage is sized on the query string and compiles the query;
 // it returns the RE followed by the two values returned by `compile`
 [deprecated]
 pub fn regex(in_query string) (RE,int,int){
 	storage_len := in_query.len+1
 	mut re := RE{}
 	re.group_max_nested = 8
 	re.prog = []Token    {len: storage_len}
 	re.cc   = []CharClass{len: storage_len}
 	compile_res, compile_pos := re.compile(in_query)
 	return re, compile_res, compile_pos
 }
 // new_regex creates a RE of small size (scale factor 1), usually sufficient for ordinary use
 [deprecated]
 pub fn new_regex() RE {
 	return impl_new_regex_by_size(1)
 }
 // new_regex_by_size creates a RE of large size; mult specifies the scale factor of the memory that will be allocated
 [deprecated]
 pub fn new_regex_by_size(mult int) RE {
 	return impl_new_regex_by_size(mult)
 }
 // impl_new_regex_by_size returns a RE whose program and char class lists hold
 // max_code_len*mult entries and which allows 3*mult nested groups
 fn impl_new_regex_by_size(mult int) RE {
 	storage_len := max_code_len*mult
 	mut re := RE{}
 	re.group_max_nested = 3*mult                 // max nested groups
 	re.cc   = []CharClass{len: storage_len}      // char class list
 	re.prog = []Token    {len: storage_len}      // max program length
 	return re
 }
 /******************************************************************************
 *
 * Utilities
 *
 ******************************************************************************/
 // get_group_bounds_by_name returns the start and end saved for the group with the given name,
 // or -1, -1 when the name is not present in the group map
 pub fn (re RE) get_group_bounds_by_name(group_name string) (int, int) {
 	if group_name !in re.group_map {
 		return -1, -1
 	}
 	// the map value minus one selects the (start, end) pair inside re.groups
 	base := (re.group_map[group_name]-1) * 2
 	return re.groups[base], re.groups[base + 1]
 }
 // get_group_by_name returns the text captured by the group with the given name,
 // or "" when the name is unknown or the group has no usable bounds
 pub fn (re RE) get_group_by_name(in_txt string, group_name string) string {
 	if group_name in re.group_map {
 		tmp_index := re.group_map[group_name]-1
 		start     := re.groups[tmp_index * 2]
 		end       := re.groups[tmp_index * 2 + 1]
 		// negative or empty bounds cannot be used to slice in_txt
 		if start >= 0 && end > start {
 			return in_txt[start..end]
 		}
 	}
 	return ""
 }
 // get_group_by_id returns the text captured by the group with the given id,
 // or "" when the id is out of range or the group has no usable bounds
 pub fn (re RE) get_group_by_id(in_txt string, group_id int) string {
 	// a negative id would produce a negative index into re.groups
 	if group_id >= 0 && group_id < (re.groups.len >> 1) {
 		index := group_id << 1
 		start := re.groups[index]
 		end   := re.groups[index + 1]
 		// negative or empty bounds cannot be used to slice in_txt
 		if start >= 0 && end > start {
 			return in_txt[start..end]
 		}
 	}
 	return ""
 }
 // get_group_bounds_by_id returns the start and end saved for the group with the given id,
 // or -1, -1 when the id is out of range
 pub fn (re RE) get_group_bounds_by_id(group_id int) (int,int) {
 	// a negative id would produce a negative index into re.groups
 	if group_id >= 0 && group_id < (re.groups.len >> 1) {
 		index := group_id << 1
 		// start is stored at index, end at index + 1
 		return re.groups[index], re.groups[index + 1]
 	}
 	return -1, -1
 }
 // Re_group holds the start and end of a group; both fields default to -1
 pub
 struct Re_group {
 pub:
 	start int = -1
 	end   int = -1
 }
 // get_group_list returns one Re_group per (start, end) pair stored in re.groups;
 // a pair is copied into the result only when start >= 0 and end > start
 pub fn (re RE) get_group_list() []Re_group {
 	mut res := []Re_group{len: re.groups.len >> 1}
 	mut gi := 0
 	//println("len: ${re.groups.len} groups: ${re.groups}")
 	// walk re.groups two entries at a time: gi is the start slot, gi+1 the end slot
 	for gi < re.groups.len {
 		if re.groups[gi] >= 0 {
 			txt_st := re.groups[gi]
            txt_en := re.groups[gi+1]
            //println("#${gi/2} start: ${re.groups[gi]} end: ${re.groups[gi + 1]} ")
            if txt_st >= 0 && txt_en > txt_st {
 				tmp := Re_group{ start: re.groups[gi], end: re.groups[gi + 1]}
 				//println(tmp)
 				res[gi >> 1] = tmp
 			} else {
 				// empty or inverted bounds: store a default Re_group
 				res[gi >> 1] = Re_group{}
 			}
 		}
 		gi += 2
 	}
 	return res
 }