diff --git a/vlib/regex/README.md b/vlib/regex/README.md index 3645f7589a..c22ba5a7c7 100644 --- a/vlib/regex/README.md +++ b/vlib/regex/README.md @@ -473,7 +473,7 @@ pub fn new() RE ``` #### **Custom initialization** -For some particular need it is possible initialize a fully customized regex: +For some particular needs it is possible to initialize a fully customized regex manually: ```v ignore pattern = r"ab(.*)(ac)" // init custom regex @@ -484,6 +484,8 @@ re.cc = []CharClass{len: pattern.len} // can not be more char class the th re.group_csave_flag = false // true enable continuos group saving if needed re.group_max_nested = 128 // set max 128 group nested possible re.group_max = pattern.len>>1 // we can't have more groups than the half of the pattern legth +re.group_stack = []int{len: re.group_max, init: -1} +re.group_data = []int{len: re.group_max, init: -1} ``` ### Compiling @@ -494,22 +496,14 @@ After an initializer is used, the regex expression must be compiled with: pub fn (re mut RE) compile_opt(in_txt string) ? 
``` -### Operative Functions +### Matching Functions -These are the operative functions +These are the matching functions ```v ignore // match_string try to match the input string, return start and end index if found else start is -1 pub fn (re mut RE) match_string(in_txt string) (int,int) -// find try to find the first match in the input string, return start and end index if found else start is -1 -pub fn (re mut RE) find(in_txt string) (int,int) - -// find_all find all the "non overlapping" occurrences of the matching pattern, return a list of start end indexes -pub fn (re mut RE) find_all(in_txt string) []int - -// replace return a string where the matches are replaced with the replace string, only non overlapped matches are used -pub fn (re mut RE) replace(in_txt string, repl string) string ``` ## Find and Replace @@ -519,13 +513,19 @@ There are the following find and replace functions: #### Find functions ```v ignore -// find try to find the first match in the input string, return start and end index if found else start is -1 +// find try to find the first match in the input string +// return start and end index if found else start is -1 pub fn (re mut RE) find(in_txt string) (int,int) // find_all find all the "non overlapping" occurrences of the matching pattern // return a list of start end indexes like: [3,4,6,8] // the matches are [3,4] and [6,8] pub fn (re mut RE) find_all(in_txt string) []int + +// find_all_str find all the "non overlapping" occurrences of the matching pattern +// return a list of strings +// the result is like ["first match","second match"] +pub fn (mut re RE) find_all_str(in_txt string) []string ``` #### Replace functions @@ -543,10 +543,12 @@ The`replace_by_fn` use a custom replace function making possible customizations. 
The custom function must be of the type: ```v ignore -// re RE struct -// in_txt all the text passed to the regex expression -// the match is: in_txt[start..end] -fn (re RE, in_txt string, start int, end int) string +// type of function used for custom replace +// in_txt source text +// start index of the start of the match in in_txt +// end index of the end of the match in in_txt +// --- the match is in in_txt[start..end] --- +fn (re RE, in_txt string, start int, end int) string ``` The following example will clarify the use: diff --git a/vlib/regex/regex.v b/vlib/regex/regex.v index caa7e3ab0c..16a5c307e9 100644 --- a/vlib/regex/regex.v +++ b/vlib/regex/regex.v @@ -1554,14 +1554,14 @@ fn state_str(s Match_state) string { struct StateObj { pub mut: - group_index int = -1 // group id used to know how many groups are open - match_flag bool - match_index int = -1 - first_match int = -1 //index of the first match - pc int = -1 // program counter - i int = -1 // source string index - char_len int - last_dot_pc int = -1 // last dot chat pc + group_index int = -1 // group id used to know how many groups are open + match_flag bool // indicate if we are in a match condition + match_index int = -1 // index of the last match + first_match int = -1 // index of the first match + pc int = -1 // program counter + i int = -1 // source string index + char_len int // last char length + last_dot_pc int = -1 // last dot char pc } [direct_array_access] @@ -1579,13 +1579,6 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { mut ist := rune(0) // actual instruction mut l_ist := rune(0) // last matched instruction - //mut state_list := []StateObj{} - - //mut group_stack := []int{len: re.group_max, init: -1} - //mut group_data := []int{len: re.group_max, init: -1} - - //mut group_index := -1 // group id used to know how many groups are open - mut step_count := 0 // stats for debug mut dbg_line := 0 // count debug line printed @@ -1900,7 +1893,6 @@ pub fn (mut re 
RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { //println("g.id: ${re.prog[state.pc].group_id} group_index: ${state.group_index}") if state.group_index >= 0 && re.prog[state.pc].group_id >= 0 { start_i := re.group_stack[state.group_index] - //re.group_stack[state.group_index]=-1 // save group results g_index := re.prog[state.pc].group_id*2 @@ -1960,8 +1952,6 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { // check next token to be false mut next_check_flag := false - //if re.prog[state.pc].rep >= re.prog[state.pc].rep_min && - // if we are done with max go on dot char are dedicated case!! if re.prog[state.pc].rep >= re.prog[state.pc].rep_max { @@ -2415,113 +2405,3 @@ pub fn (mut re RE) match_string(in_txt string) (int,int) { } return start, end } - -// -// Finders -// - -// find try to find the first match in the input string -[direct_array_access] -pub fn (mut re RE) find(in_txt string) (int,int) { - old_flag := re.flag - - re.flag |= f_src // enable search mode - start, mut end := re.match_base(in_txt.str, in_txt.len + 1) - //print("Find [$start,$end] '${in_txt[start..end]}'") - if end > in_txt.len { - end = in_txt.len - } - re.flag = old_flag - - if start >= 0 && end > start { - return start, end - } - return no_match_found, 0 -} - -// find all the non overlapping occurrences of the match pattern -[direct_array_access] -pub fn (mut re RE) find_all(in_txt string) []int { - mut i := 0 - mut res := []int{} - mut ls := -1 - for i < in_txt.len { - s,e := re.find(in_txt[i..]) - if s >= 0 && e > s && i+s > ls { - //println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}] ls:$ls") - res << i+s - res << i+e - ls = i+s - i = i+e - continue - } else { - i++ - } - - } - return res -} - -// replace return a string where the matches are replaced with the replace string -pub fn (mut re RE) replace(in_txt string, repl string) string { - pos := re.find_all(in_txt) - if pos.len > 0 { - mut res := "" - mut i := 0 - - mut s1 := 0 - mut 
e1 := in_txt.len - - for i < pos.len { - e1 = pos[i] - res += in_txt[s1..e1] + repl - s1 = pos[i+1] - i += 2 - } - - res += in_txt[s1..] - return res - } - return in_txt -} - -pub type FnReplace = fn (re RE, in_txt string, start int, end int) string - -// replace_by_fn return a string where the matches are replaced with the string from the repl_fn callback function -pub fn (mut re RE) replace_by_fn(in_txt string, repl_fn FnReplace) string { - mut i := 0 - mut res := "" - mut ls := -1 - - mut s1 := 0 - //mut e1 := in_txt.len - - for i < in_txt.len { - s,e := re.find(in_txt[i..]) - if s >= 0 && e > s && i+s > ls { - //println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}] ls:$ls") - start := i + s - end := i + e - // update grups index diplacement - mut gi := 0 - for gi < re.groups.len { - re.groups[gi] += i - gi++ - } - repl := repl_fn(re, in_txt, start, end) - - res += in_txt[s1..start] + repl - s1 = end - - ls = i + s - i = i + e - continue - } else { - i++ - } - - } - res += in_txt[s1..] - return res -} - diff --git a/vlib/regex/regex_opt.v b/vlib/regex/regex_opt.v index 8746f993bb..5ebab92686 100644 --- a/vlib/regex/regex_opt.v +++ b/vlib/regex/regex_opt.v @@ -17,9 +17,19 @@ pub fn (mut re RE) compile_opt(pattern string) ? 
{ } // new_regex create a RE of small size, usually sufficient for ordinary use -[deprecated] pub fn new() RE { - return impl_new_regex_by_size(1) + // init regex + mut re := regex.RE{} + re.prog = []Token {len: max_code_len + 1} // max program length, can not be longer than the pattern + re.cc = []CharClass{len: max_code_len} // can not be more char classes than the length of the pattern + re.group_csave_flag = false // true enables continuous group saving + re.group_max_nested = 128 // set max 128 group nested + re.group_max = max_code_len >> 1 // we can't have more groups than the half of the pattern length + + re.group_stack = []int{len: re.group_max, init: -1} + re.group_data = []int{len: re.group_max, init: -1} + + return re } // new_regex_by_size create a RE of large size, mult specify the scale factor of the memory that will be allocated diff --git a/vlib/regex/regex_test.v b/vlib/regex/regex_test.v index b421866f96..334b3f657b 100644 --- a/vlib/regex/regex_test.v +++ b/vlib/regex/regex_test.v @@ -144,29 +144,6 @@ match_test_suite = [ ] ) -struct TestItemFa { - src string - q string - r []int -} - -const ( -match_test_suite_fa = [ - // find_all tests - TestItemFa{ - "oggi pippo è andato a casa di pluto ed ha trovato pippo", - r"p[iplut]+o", - [5, 10, 31, 36, 51, 56] - }, - TestItemFa{ - "oggi pibao è andato a casa di pbababao ed ha trovato pibabababao", - r"(pi?(ba)+o)", - [5, 10, 31, 39, 54, 65] - }, - -] -) - struct TestItemRe { src string q string @@ -174,7 +151,7 @@ struct TestItemRe { r string } const ( -match_test_suite_re = [ +match_test_suite_replace = [ // replace tests TestItemRe{ "oggi pibao è andato a casa di pbababao ed ha trovato pibabababao", @@ -241,12 +218,53 @@ cgroups_test_suite = [ ] ) + +struct Test_find_all { + src string + q string + res []int // [0,4,5,6...] + res_str []string // ['find0','find1'...] 
+} +const ( +find_all_test_suite = [ + Test_find_all{ + "abcd 1234 efgh 1234 ghkl1234 ab34546df", + r"\d+", + [5, 9, 15, 19, 24, 28, 31, 36], + ['1234', '1234', '1234', '34546'] + }, + Test_find_all{ + "abcd 1234 efgh 1234 ghkl1234 ab34546df", + r"\a+", + [0, 4, 10, 14, 20, 24, 29, 31, 36, 38], + ['abcd', 'efgh', 'ghkl', 'ab', 'df'] + }, + Test_find_all{ + "oggi pippo è andato a casa di pluto ed ha trovato pippo", + r"p[iplut]+o", + [5, 10, 31, 36, 51, 56], + ['pippo', 'pluto', 'pippo'] + }, + Test_find_all{ + "oggi pibao è andato a casa di pbababao ed ha trovato pibabababao", + r"(pi?(ba)+o)", + [5, 10, 31, 39, 54, 65], + ['pibao', 'pbababao', 'pibabababao'] + }, + Test_find_all{ + "Today is a good day and tomorrow will be for sure.", + r"[Tt]o\w+", + [0, 5, 24, 32], + ['Today', 'tomorrow'] + } +] +) + const ( debug = false // true for debug println ) fn test_regex(){ - // check capturing groups for c,to in cgroups_test_suite { // debug print @@ -275,8 +293,8 @@ fn test_regex(){ if start != to.s || end != to.e { //println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end") - println("ERROR!") - C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e) + eprintln("ERROR!") + //C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e) assert false continue } @@ -284,7 +302,7 @@ fn test_regex(){ // check cgroups if to.cgn.len > 0 { if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] { - println("Capturing group len error! found: ${re.group_csave[0]} true ground: ${to.cg[0]}") + eprintln("Capturing group len error! 
found: ${re.group_csave[0]} true ground: ${to.cg[0]}") assert false continue } @@ -293,7 +311,7 @@ fn test_regex(){ mut ln := re.group_csave[0]*3 for ln > 0 { if re.group_csave[ln] != to.cg[ln] { - println("Capturing group failed on $ln item!") + eprintln("Capturing group failed on $ln item!") assert false } ln-- @@ -302,7 +320,7 @@ fn test_regex(){ // check named captured groups for k in to.cgn.keys() { if to.cgn[k] != (re.group_map[k]-1) { // we have -1 because the map not found is 0, in groups we start from 0 and we store using +1 - println("Named capturing group error! [$k]") + eprintln("Named capturing group error! [$k]") assert false continue } @@ -314,9 +332,9 @@ fn test_regex(){ } for ln:=0; ln < re.groups.len; ln++ { if re.groups[ln] != to.cg[ln] { - println("Capture group doesn't match:") - println("true ground: [${to.cg}]") - println("elaborated : [${re.groups}]") + eprintln("Capture group doesn't match:") + eprintln("true ground: [${to.cg}]") + eprintln("elaborated : [${re.groups}]") assert false } } @@ -324,9 +342,9 @@ fn test_regex(){ } // check find_all - for c,to in match_test_suite_fa{ + for c,to in find_all_test_suite { // debug print - if debug { println("#$c [$to.src] q[$to.q] $to.r") } + if debug { println("#$c [$to.src] q[$to.q] ($to.res, $to.res_str)") } mut re := regex.regex_opt(to.q) or { eprintln('err: $err') @@ -334,25 +352,24 @@ fn test_regex(){ continue } + re.reset() res := re.find_all(to.src) - if res.len != to.r.len { - println("ERROR: find_all, array of different size.") + if res != to.res { + eprintln('err: find_all !!') + if debug { println("#$c exp: $to.res calculated: $res") } assert false - continue } - for c1,i in res { - if i != to.r[c1] { - println("ERROR: find_all, different indexes.") - assert false - continue - } + res_str := re.find_all_str(to.src) + if res_str != to.res_str { + eprintln('err: find_all_str !!') + if debug { println("#$c exp: $to.res_str calculated: $res_str") } + assert false } - } // check replace - for 
c,to in match_test_suite_re{ + for c,to in match_test_suite_replace{ // debug print if debug { println("#$c [$to.src] q[$to.q] $to.r") } @@ -364,7 +381,7 @@ fn test_regex(){ res := re.replace(to.src,to.rep) if res != to.r { - println("ERROR: replace.") + eprintln("ERROR: replace.") assert false continue } @@ -383,12 +400,12 @@ fn test_regex(){ continue } // q_str := re.get_query() - // println("Query: $q_str") + // eprintln("Query: $q_str") start,end := re.find(to.src) if start != to.s || end != to.e { err_str := re.get_parse_error_string(start) - println("ERROR : $err_str start: ${start} end: ${end}") + eprintln("ERROR : $err_str start: ${start} end: ${end}") assert false } else { //tmp_str := text[start..end] @@ -416,8 +433,8 @@ fn test_regex(){ } if start != to.s || end != to.e { - println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end") - println("ERROR!") + eprintln("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end") + eprintln("ERROR!") //C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e) assert false continue @@ -427,7 +444,7 @@ fn test_regex(){ tmp_str1 := to.src.clone() start1, end1 := re.match_string(tmp_str1) if start1 != start || end1 != end { - println("two run ERROR!!") + eprintln("two run ERROR!!") assert false continue } diff --git a/vlib/regex/regex_util.v b/vlib/regex/regex_util.v index 3572d02f19..ddaf0f7654 100644 --- a/vlib/regex/regex_util.v +++ b/vlib/regex/regex_util.v @@ -117,6 +117,7 @@ pub fn (re RE) get_group_list() []Re_group { mut res := []Re_group{len: re.groups.len >> 1} mut gi := 0 //println("len: ${re.groups.len} groups: ${re.groups}") + for gi < re.groups.len { if re.groups[gi] >= 0 { txt_st := re.groups[gi] @@ -136,3 +137,143 @@ pub fn (re RE) get_group_list() []Re_group { return res } +/****************************************************************************** +* +* Finders +* +******************************************************************************/ +// find try to find the first match in the 
input string, return start and end index if found else start is no_match_found +[direct_array_access] +pub fn (mut re RE) find(in_txt string) (int,int) { + old_flag := re.flag + re.flag |= f_src // enable search mode + + start, mut end := re.match_base(in_txt.str, in_txt.len + 1) + //print("Find [$start,$end] '${in_txt[start..end]}'") + if end > in_txt.len { + end = in_txt.len + } + re.flag = old_flag + + if start >= 0 && end > start { + return start, end + } + return no_match_found, 0 +} + +// find_all find all the non overlapping occurrences of the match pattern, return a flat list of start,end index pairs +[direct_array_access] +pub fn (mut re RE) find_all(in_txt string) []int { + mut i := 0 + mut res := []int{} + mut ls := -1 + + for i < in_txt.len { + s,e := re.find(in_txt[i..]) + if s >= 0 && e > s && i+s > ls { + //println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}] ls:$ls") + res << i+s + res << i+e + ls = i+s + i = i+e + continue + } else { + i++ + } + + } + return res +} + +// find_all_str find all the non overlapping occurrences of the match pattern, return a string list +[direct_array_access] +pub fn (mut re RE) find_all_str(in_txt string) []string { + mut i := 0 + mut res := []string{} + mut ls := -1 + + for i < in_txt.len { + s,e := re.find(in_txt[i..]) + if s >= 0 && e > s && i+s > ls { + //println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}] ls:$ls") + res << in_txt[i+s..i+e] + ls = i+s + i = i+e + continue + } else { + i++ + } + + } + return res +} +/****************************************************************************** +* +* Replacers +* +******************************************************************************/ +// replace return a string where the non overlapping matches are replaced with the replace string, in_txt is returned unchanged when nothing matches +pub fn (mut re RE) replace(in_txt string, repl string) string { + pos := re.find_all(in_txt) + + if pos.len > 0 { + mut res := "" + mut i := 0 + + mut s1 := 0 + mut e1 := in_txt.len + + for i < pos.len { + e1 = pos[i] + res += in_txt[s1..e1] + repl + s1 = pos[i+1] + i += 2 + } + + res += in_txt[s1..] 
+ return res + } + return in_txt +} + +// type of function used for custom replace, re is the regex passed by replace_by_fn after the group indexes are rebased to in_txt +// in_txt source text +// start index of the start of the match in in_txt +// end index of the end of the match in in_txt +// the match is in in_txt[start..end] +pub type FnReplace = fn (re RE, in_txt string, start int, end int) string + +// replace_by_fn return a string where the matches are replaced with the string from the repl_fn callback function +pub fn (mut re RE) replace_by_fn(in_txt string, repl_fn FnReplace) string { + mut i := 0 + mut res := "" + mut ls := -1 + mut s1 := 0 + + for i < in_txt.len { + s,e := re.find(in_txt[i..]) + if s >= 0 && e > s && i+s > ls { + //println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}] ls:$ls") + start := i + s + end := i + e + // rebase the group indexes from the in_txt[i..] slice to in_txt, negative (unset) entries are left untouched so they do not turn into valid indexes + mut gi := 0 + for gi < re.groups.len { + if re.groups[gi] >= 0 { re.groups[gi] += i } + gi++ + } + repl := repl_fn(re, in_txt, start, end) + + res += in_txt[s1..start] + repl + s1 = end + + ls = i + s + i = i + e + continue + } else { + i++ + } + } + res += in_txt[s1..] + return res +}