diff --git a/vlib/regex/README.md b/vlib/regex/README.md index de027902b5..eba818ffa3 100644 --- a/vlib/regex/README.md +++ b/vlib/regex/README.md @@ -471,10 +471,23 @@ pub fn regex_opt(in_query string) ?RE // new_regex create a REgex of small size, usually sufficient for ordinary use pub fn new() RE -// new_regex_by_size create a REgex of large size, mult specify the scale factor of the memory that will be allocated -pub fn new_by_size(mult int) RE ``` -After a base initializer is used, the regex expression must be compiled with: +#### **Custom initialization** +For particular needs it is possible to initialize a fully customized regex: +```v ignore +// init custom regex +mut re := regex.RE{} +re.prog = []Token {len: pattern.len + 1} // max program length, cannot be longer than the pattern +re.cc = []CharClass{len: pattern.len} // there cannot be more char classes than the length of the pattern + +re.group_csave_flag = false // set to true to enable continuous group saving if needed +re.group_max_nested = 128 // allow at most 128 nested groups +re.group_max = pattern.len>>1 // we can't have more groups than half of the pattern length +``` +### Compiling + +After an initializer is used, the regex expression must be compiled with: + ```v ignore // compile compiles the REgex returning an error if the compilation fails pub fn (re mut RE) compile_opt(in_txt string) ? 
@@ -500,11 +513,38 @@ pub fn (re mut RE) replace(in_txt string, repl string) string ## Find and Replace +The following find and replace functions are available: + +#### Find functions + +```v ignore +// find tries to find the first match in the input string; it returns the start and end index if found, otherwise start is -1 +pub fn (re mut RE) find(in_txt string) (int,int) + +// find_all finds all the "non overlapping" occurrences of the matching pattern +// it returns a flat list of start and end indexes like: [3,4,6,8] +// the matches are [3,4] and [6,8] +pub fn (re mut RE) find_all(in_txt string) []int +``` + +#### Replace functions + +```v ignore +// replace returns a string where the matches are replaced with the replace string; only non overlapping matches are used +pub fn (re mut RE) replace(in_txt string, repl string) string +``` + +#### Custom replace function + For complex find and replace operations it is available the function `replace_by_fn` . The`replace_by_fn` use a custom replace function making possible customizations. **The custom function is called for every non overlapped find.** The custom function must be of the type: + ```v ignore +// re RE struct +// in_txt all the text passed to the regex expression +// the match is: in_txt[start..end] fn (re RE, in_txt string, start int, end int) string ``` @@ -671,7 +711,7 @@ re.log_func = custom_print ## Example code -Here there is a simple code to perform some basically match of strings +Here is an example that performs some basic string matching ```v ignore import regex @@ -698,5 +738,63 @@ fn main(){ } } ``` +Here is an example of full customization of the regex environment creation: +```v ignore +import regex -more example code is available in the test code for the `regex` module `vlib\regex\regex_test.v`. +fn main(){ + txt := "today John is gone to his house with Jack and Marie." 
+ query := r"(?:(?P\A\w+)|(?:\a\w+)[\s.]?)+" + + // init regex + mut re := regex.RE{} + re.prog = []regex.Token {len: query.len + 1} // max program length, cannot be longer than the query + re.cc = []regex.CharClass{len: query.len} // there cannot be more char classes than the length of the query + re.prog = []regex.Token {len: query.len+1} + re.group_csave_flag = true // enable continuous group saving + re.group_max_nested = 128 // allow at most 128 nested groups + re.group_max = query.len>>1 // we can't have more groups than half of the query length + + // compile the query + re.compile_opt(query) or { panic(err) } + + start, end := re.match_string(txt) + if start >= 0 { + println("Match ($start, $end) => [${txt[start..end]}]") + } else { + println("No Match") + } + + // show results for continuous group saving + if re.group_csave_flag == true && start >= 0 && re.group_csave.len > 0{ + println("cg: $re.group_csave") + mut cs_i := 1 + for cs_i < re.group_csave[0]*3 { + g_id := re.group_csave[cs_i] + st := re.group_csave[cs_i+1] + en := re.group_csave[cs_i+2] + println("cg[$g_id] $st $en:[${txt[st..en]}]") + cs_i += 3 + } + } + + // show results for captured groups + if start >= 0 { + println("Match ($start, $end) => [${txt[start..end]}]") + for g_index := 0; g_index < re.group_count ; g_index++ { + println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \ + bounds: ${re.get_group_bounds_by_id(g_index)}") + } + for name in re.group_map.keys() { + println("group:'$name' \t=> [${re.get_group_by_name(txt, name)}] \ + bounds: ${re.get_group_bounds_by_name(name)}") + } + } else { + println("No Match") + } +} +``` + + + +More example code is available in the test code for the `regex` module: `vlib\regex\regex_test.v`. 
\ No newline at end of file diff --git a/vlib/regex/regex.v b/vlib/regex/regex.v index 1a77762ec8..210319dde2 100644 --- a/vlib/regex/regex.v +++ b/vlib/regex/regex.v @@ -266,11 +266,11 @@ fn (mut tok Token) reset() { tok.rep = 0 } -/* - -Regex struct - -*/ +/****************************************************************************** +* +* Regex struct +* +******************************************************************************/ pub const ( f_nl = 0x00000001 // end the match when find a new line symbol f_ms = 0x00000002 // match true only if the match is at the start of the string @@ -354,11 +354,11 @@ fn (mut re RE) reset_src(){ } } -/* - -Backslashes chars - -*/ +/****************************************************************************** +* +* Backslashes chars +* +******************************************************************************/ struct BslsStruct { ch rune // meta char validator FnValidator // validator function pointer @@ -430,11 +430,11 @@ fn (re RE) parse_bsls(in_txt string, in_i int) (int,int){ return err_syntax_error, i } -/* - -Char class - -*/ +/****************************************************************************** +* +* Char class +* +******************************************************************************/ const( cc_null = 0 // empty cc token cc_char = 1 // simple char: a @@ -653,11 +653,11 @@ fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, rune) { return err_syntax_error,0,u32(0) } -/* - -Re Compiler - -*/ +/****************************************************************************** +* +* Re Compiler +* +******************************************************************************/ // // Quantifier // @@ -1462,11 +1462,11 @@ pub fn (re RE) get_query() string { return res.str() } -/* - -Groups saving utilities - -*/ +/****************************************************************************** +* +* Groups saving utilities +* 
+******************************************************************************/ [inline] fn (mut re RE) group_continuous_save(g_index int) { if re.group_csave_flag == true { @@ -1500,12 +1500,12 @@ fn (mut re RE) group_continuous_save(g_index int) { re.group_csave << re.groups[g_index+1] // end } } - -/* -Matching - -*/ +/****************************************************************************** +* +* Matching +* +******************************************************************************/ enum Match_state{ start = 0 stop @@ -2001,6 +2001,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { last_dot_pc: state.pc } m_state = .ist_quant_n + //println("dot_char stack len: $state_list.len") continue } @@ -2363,47 +2364,11 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { return no_match_found, 0 } -/* - -Public functions - -*/ - -// -// Inits -// - -// regex create a regex object from the query string -[deprecated] -pub fn regex(in_query string) (RE,int,int){ - mut re := RE{} - re.prog = []Token {len: in_query.len+1} - re.cc = []CharClass{len: in_query.len+1} - re.group_max_nested = 8 - - re_err,err_pos := re.compile(in_query) - return re, re_err, err_pos -} - -// new_regex create a RE of small size, usually sufficient for ordinary use -[deprecated] -pub fn new_regex() RE { - return impl_new_regex_by_size(1) -} - -// new_regex_by_size create a RE of large size, mult specify the scale factor of the memory that will be allocated -[deprecated] -pub fn new_regex_by_size(mult int) RE { - return impl_new_regex_by_size(mult) -} -fn impl_new_regex_by_size(mult int) RE { - mut re := RE{} - re.prog = []Token {len: max_code_len*mult} // max program length, default 256 istructions - re.cc = []CharClass{len: max_code_len*mult} // char class list - re.group_max_nested = 3*mult // max nested group - - return re -} +/****************************************************************************** +* +* Public functions +* 
+******************************************************************************/ // // Matchers @@ -2538,82 +2503,3 @@ pub fn (mut re RE) replace_by_fn(in_txt string, repl_fn FnReplace) string { return res } -/* - -Utilities - -*/ - -// get_group_bounds_by_name get a group boundaries by its name -pub fn (re RE) get_group_bounds_by_name(group_name string) (int, int) { - if group_name in re.group_map { - tmp_index := re.group_map[group_name]-1 - start := re.groups[tmp_index * 2] - end := re.groups[tmp_index * 2 + 1] - return start,end - } - return -1, -1 -} - -// get_group_by_name get a group boundaries by its name -pub fn (re RE) get_group_by_name(in_txt string, group_name string) string { - if group_name in re.group_map { - tmp_index := re.group_map[group_name]-1 - start := re.groups[tmp_index * 2] - end := re.groups[tmp_index * 2 + 1] - return in_txt[start..end] - } - return "" -} - -// get_group_by_id get a group string by its id -pub fn (re RE) get_group_by_id(in_txt string, group_id int) string { - if group_id < (re.groups.len >> 1) { - index := group_id << 1 - start := re.groups[index] - end := re.groups[index + 1] - return in_txt[start..end] - } - return "" -} - -// get_group_by_id get a group boundaries by its id -pub fn (re RE) get_group_bounds_by_id(group_id int) (int,int) { - if group_id < (re.groups.len >> 1) { - index := group_id << 1 - return re.groups[index], re.groups[index] - } - return -1, -1 -} - -pub -struct Re_group { -pub: - start int = -1 - end int = -1 -} - -// get_group_list return a list of Re_group for the found groups -pub fn (re RE) get_group_list() []Re_group { - mut res := []Re_group{len: re.groups.len >> 1} - mut gi := 0 - //println("len: ${re.groups.len} groups: ${re.groups}") - for gi < re.groups.len { - if re.groups[gi] >= 0 { - txt_st := re.groups[gi] - txt_en := re.groups[gi+1] - - //println("#${gi/2} start: ${re.groups[gi]} end: ${re.groups[gi + 1]} ") - if txt_st >= 0 && txt_en > txt_st { - tmp := Re_group{ start: 
re.groups[gi], end: re.groups[gi + 1]} - //println(tmp) - res[gi >> 1] = tmp - } else { - res[gi >> 1] = Re_group{} - } - } - gi += 2 - } - return res -} - diff --git a/vlib/regex/regex_opt.v b/vlib/regex/regex_opt.v index acd6c0f542..ffe6ed5cf2 100644 --- a/vlib/regex/regex_opt.v +++ b/vlib/regex/regex_opt.v @@ -17,18 +17,29 @@ pub fn (mut re RE) compile_opt(pattern string) ? { } // new_regex create a RE of small size, usually sufficient for ordinary use +[deprecated] pub fn new() RE { return impl_new_regex_by_size(1) } // new_regex_by_size create a RE of large size, mult specify the scale factor of the memory that will be allocated +[deprecated] pub fn new_by_size(mult int) RE { return impl_new_regex_by_size(mult) } // regex_opt create new RE object from RE pattern string pub fn regex_opt(pattern string) ?RE { - mut re := new() - re.compile_opt(pattern)? - return re + // init regex + mut re := regex.RE{} + re.prog = []Token {len: pattern.len + 1} // max program length, can not be longer then the pattern + re.cc = []CharClass{len: pattern.len} // can not be more char class the the length of the pattern + re.group_csave_flag = false // enable continuos group saving + re.group_max_nested = 128 // set max 128 group nested + re.group_max = pattern.len >> 1 // we can't have more groups than the half of the pattern legth + + // compile the pattern + re.compile_opt(pattern)? + + return re } diff --git a/vlib/regex/regex_util.v b/vlib/regex/regex_util.v new file mode 100644 index 0000000000..b743009076 --- /dev/null +++ b/vlib/regex/regex_util.v @@ -0,0 +1,126 @@ +/* + +regex 1.0 alpha + +Copyright (c) 2019-2020 Dario Deledda. All rights reserved. +Use of this source code is governed by an MIT license +that can be found in the LICENSE file. 
+ +*/ +module regex + +/****************************************************************************** +* +* Inits +* +******************************************************************************/ +// regex create a regex object from the query string +[deprecated] +pub fn regex(in_query string) (RE,int,int){ + mut re := RE{} + re.prog = []Token {len: in_query.len+1} + re.cc = []CharClass{len: in_query.len+1} + re.group_max_nested = 8 + + re_err,err_pos := re.compile(in_query) + return re, re_err, err_pos +} + +// new_regex create a RE of small size, usually sufficient for ordinary use +[deprecated] +pub fn new_regex() RE { + return impl_new_regex_by_size(1) +} + +// new_regex_by_size create a RE of large size, mult specify the scale factor of the memory that will be allocated +[deprecated] +pub fn new_regex_by_size(mult int) RE { + return impl_new_regex_by_size(mult) +} +fn impl_new_regex_by_size(mult int) RE { + mut re := RE{} + re.prog = []Token {len: max_code_len*mult} // max program length, default 256 istructions + re.cc = []CharClass{len: max_code_len*mult} // char class list + re.group_max_nested = 3*mult // max nested group + + return re +} + +/****************************************************************************** +* +* Utilities +* +******************************************************************************/ +// get_group_bounds_by_name get a group boundaries by its name +pub fn (re RE) get_group_bounds_by_name(group_name string) (int, int) { + if group_name in re.group_map { + tmp_index := re.group_map[group_name]-1 + start := re.groups[tmp_index * 2] + end := re.groups[tmp_index * 2 + 1] + return start,end + } + return -1, -1 +} + +// get_group_by_name get a group boundaries by its name +pub fn (re RE) get_group_by_name(in_txt string, group_name string) string { + if group_name in re.group_map { + tmp_index := re.group_map[group_name]-1 + start := re.groups[tmp_index * 2] + end := re.groups[tmp_index * 2 + 1] + return in_txt[start..end] + } 
+ return "" +} + +// get_group_by_id get a group string by its id +pub fn (re RE) get_group_by_id(in_txt string, group_id int) string { + if group_id < (re.groups.len >> 1) { + index := group_id << 1 + start := re.groups[index] + end := re.groups[index + 1] + return in_txt[start..end] + } + return "" +} + +// get_group_by_id get a group boundaries by its id +pub fn (re RE) get_group_bounds_by_id(group_id int) (int,int) { + if group_id < (re.groups.len >> 1) { + index := group_id << 1 + return re.groups[index], re.groups[index + 1] + } + return -1, -1 +} + +pub +struct Re_group { +pub: + start int = -1 + end int = -1 +} + +// get_group_list return a list of Re_group for the found groups +pub fn (re RE) get_group_list() []Re_group { + mut res := []Re_group{len: re.groups.len >> 1} + mut gi := 0 + //println("len: ${re.groups.len} groups: ${re.groups}") + for gi < re.groups.len { + if re.groups[gi] >= 0 { + txt_st := re.groups[gi] + txt_en := re.groups[gi+1] + + //println("#${gi/2} start: ${re.groups[gi]} end: ${re.groups[gi + 1]} ") + if txt_st >= 0 && txt_en > txt_st { + tmp := Re_group{ start: re.groups[gi], end: re.groups[gi + 1]} + //println(tmp) + res[gi >> 1] = tmp + } else { + res[gi >> 1] = Re_group{} + } + } + gi += 2 + } + return res +} +