From 15ffce13179c8c0a1c17aab13b72d86e54ba8a11 Mon Sep 17 00:00:00 2001 From: penguindark <57967770+penguindark@users.noreply.github.com> Date: Thu, 3 Dec 2020 19:33:53 +0100 Subject: [PATCH] regex: fix compilation issues with gcc under ubuntu (#7112) --- vlib/regex/regex.v | 176 ++++++++++++++++++++++----------------------- 1 file changed, 86 insertions(+), 90 deletions(-) diff --git a/vlib/regex/regex.v b/vlib/regex/regex.v index b3fc542676..1be0eda102 100644 --- a/vlib/regex/regex.v +++ b/vlib/regex/regex.v @@ -27,7 +27,7 @@ pub const( // spaces chars (here only westerns!!) TODO: manage all the spaces from unicode spaces = [` `, `\t`, `\n`, `\r`, `\v`, `\f`] // new line chars for now only '\n' - new_line_list = [`\n`,`\r`] + new_line_list = [`\n`, `\r`] // Results no_match_found = -1 @@ -49,7 +49,7 @@ const( //************************************* // regex program instructions //************************************* - ist_simple_char = u32(0x7FFFFFFF) // single char instruction, 31 bit available to char + ist_simple_char = u32(0x7FFFFFFF) // single char instruction, 31 bit available to char // char class 11 0100 AA xxxxxxxx // AA = 00 regular class @@ -92,9 +92,7 @@ fn utf8util_char_len(b byte) int { fn (re RE) get_char(in_txt string, i int) (u32,int) { ini := unsafe {in_txt.str[i]} // ascii 8 bit - if (re.flag & f_bin) !=0 || - ini & 0x80 == 0 - { + if (re.flag & f_bin) !=0 || ini & 0x80 == 0 { return u32(ini), 1 } // unicode char @@ -102,7 +100,7 @@ fn (re RE) get_char(in_txt string, i int) (u32,int) { mut tmp := 0 mut ch := u32(0) for tmp < char_len { - ch = (ch << 8) | unsafe {in_txt.str[i+tmp]} + ch = (ch << 8) | unsafe {in_txt.str[i + tmp]} tmp++ } return ch,char_len @@ -112,9 +110,7 @@ fn (re RE) get_char(in_txt string, i int) (u32,int) { [inline] fn (re RE) get_charb(in_txt byteptr, i int) (u32,int) { // ascii 8 bit - if (re.flag & f_bin) !=0 || - unsafe {in_txt[i]} & 0x80 == 0 - { + if (re.flag & f_bin) !=0 || unsafe {in_txt[i]} & 0x80 == 0 { return u32(unsafe {in_txt[i]}), 1 } // unicode char @@ -122,7 +118,7 @@ fn (re RE) get_charb(in_txt byteptr, i int) (u32,int) { mut tmp := 0 mut ch := u32(0) for tmp < char_len { - ch = (ch << 8) | unsafe {in_txt[i+tmp]} + ch = (ch << 8) | unsafe {in_txt[i + tmp]} tmp++ } return ch,char_len @@ -131,11 +127,11 @@ fn (re RE) get_charb(in_txt byteptr, i int) (u32,int) { [inline] fn is_alnum(in_char byte) bool { mut tmp := in_char - `A` - if tmp >= 0x00 && tmp <= 25 { return true } + if tmp <= 25 { return true } tmp = in_char - `a` - if tmp >= 0x00 && tmp <= 25 { return true } + if tmp <= 25 { return true } tmp = in_char - `0` - if tmp >= 0x00 && tmp <= 9 { return true } + if tmp <= 9 { return true } if tmp == `_` { return true } return false } @@ -158,7 +154,7 @@ fn is_not_space(in_char byte) bool { [inline] fn is_digit(in_char byte) bool { tmp := in_char - `0` - return tmp <= 0x09 && tmp >= 0 + return tmp <= 0x09 } [inline] @@ -179,13 +175,13 @@ fn is_not_wordchar(in_char byte) bool { [inline] fn is_lower(in_char byte) bool { tmp := in_char - `a` - return tmp >= 0x00 && tmp <= 25 + return tmp <= 25 } [inline] fn is_upper(in_char byte) bool { tmp := in_char - `A` - return tmp >= 0x00 && tmp <= 25 + return tmp <= 25 } pub fn (re RE) get_parse_error_string(err int) string { @@ -211,7 +207,7 @@ fn utf8_str(ch rune) string { mut i := 4 mut res := "" for i > 0 { - v := byte((ch >> ((i-1)*8)) & 0xFF) + v := byte((ch >> ((i - 1) * 8)) & 0xFF) if v != 0{ res += "${v:1c}" } @@ -236,30 +232,30 @@ mut: ist rune // char - ch rune // char of the token if any - ch_len byte // char len + ch rune // char of the token if any + ch_len byte // char len // Quantifiers / branch rep_min int // used also for jump next in the OR branch [no match] pc jump rep_max int // used also for jump next in the OR branch [ match] pc jump - greedy bool // greedy quantifier flag + greedy bool // greedy quantifier flag // Char class - cc_index int = -1 + cc_index int = -1 // counters for quantifier check (repetitions) rep int // validator function pointer - validator FnValidator + validator FnValidator // groups variables - group_rep int // repetition of the group - group_id int = -1 // id of the group - goto_pc int = -1 // jump to this PC if is needed + group_rep int // repetition of the group + group_id int = -1 // id of the group + goto_pc int = -1 // jump to this PC if is needed // OR flag for the token - next_is_or bool // true if the next token is an OR + next_is_or bool // true if the next token is an OR } [inline] @@ -310,34 +306,34 @@ pub mut: // groups - group_count int // number of groups in this regex struct - groups []int // groups index results - group_max_nested int = 3 // max nested group - group_max int = 8 // max allowed number of different groups + group_count int // number of groups in this regex struct + groups []int // groups index results + group_max_nested int = 3 // max nested group + group_max int = 8 // max allowed number of different groups - group_csave []int = []int{} // groups continuous save array - group_csave_index int= -1 // groups continuous save index + group_csave []int = []int{} // groups continuous save array + group_csave_index int = -1 // groups continuous save index - group_map map[string]int // groups names map + group_map map[string]int // groups names map // flags - flag int // flag for optional parameters + flag int // flag for optional parameters // Debug/log - debug int // enable in order to have the unroll of the code 0 = NO_DEBUG, 1 = LIGHT 2 = VERBOSE - log_func FnLog = simple_log // log function, can be customized by the user - query string // query string + debug int // enable in order to have the unroll of the code 0 = NO_DEBUG, 1 = LIGHT 2 = VERBOSE + log_func FnLog = simple_log // log function, can be customized by the user + query string // query string } // Reset RE object //[inline] fn (mut re RE) reset(){ - re.cc_index = 0 + re.cc_index = 0 mut i := 0 for i < re.prog.len { - re.prog[i].group_rep = 0 // clear repetition of the group - re.prog[i].rep = 0 // clear repetition of the token + re.prog[i].group_rep = 0 // clear repetition of the group + re.prog[i].rep = 0 // clear repetition of the token i++ } re.groups = [-1].repeat(re.group_count*2) @@ -347,7 +343,7 @@ fn (mut re RE) reset(){ // reset group_csave if re.group_csave.len > 0 { re.group_csave_index = 1 - re.group_csave[0] = 0 // reset the capture count + re.group_csave[0] = 0 // reset the capture count } } @@ -356,8 +352,8 @@ fn (mut re RE) reset(){ fn (mut re RE) reset_src(){ mut i := 0 for i < re.prog.len { - re.prog[i].group_rep = 0 // clear repetition of the group - re.prog[i].rep = 0 // clear repetition of the token + re.prog[i].group_rep = 0 // clear repetition of the group + re.prog[i].rep = 0 // clear repetition of the token i++ } re.state_stack_index = -1 @@ -367,8 +363,8 @@ fn (mut re RE) reset_src(){ pub fn (re RE) get_group(group_name string) (int, int) { if group_name in re.group_map { tmp_index := re.group_map[group_name]-1 - start := re.groups[tmp_index*2] - end := re.groups[tmp_index*2+1] + start := re.groups[tmp_index * 2] + end := re.groups[tmp_index * 2 + 1] return start,end } return -1, -1 @@ -397,7 +393,7 @@ const( ] // these chars are escape if preceded by a \ - bsls_escape_list = [ `\\`,`|`,`.`,`*`,`+`,`-`,`{`,`}`,`[`,`]` ] + bsls_escape_list = [`\\`, `|`, `.`, `*`, `+`, `-`, `{`, `}`, `[`, `]`] ) enum BSLS_parse_state { @@ -414,7 +410,7 @@ fn (re RE) parse_bsls(in_txt string, in_i int) (int,int){ for i < in_txt.len { // get our char - char_tmp,char_len := re.get_char(in_txt,i) + char_tmp, char_len := re.get_char(in_txt, i) ch := byte(char_tmp) if status == .start && ch == `\\` { @@ -427,7 +423,7 @@ fn (re RE) parse_bsls(in_txt string, in_i int) (int,int){ if status == .bsls_found { for c,x in bsls_validator_array { if x.ch == ch { - return c,i-in_i+1 + return c, i-in_i+1 } } status = .normal_char @@ -437,9 +433,9 @@ fn (re RE) parse_bsls(in_txt string, in_i int) (int,int){ // no BSLS validator, manage as normal escape char char if status == .normal_char { if ch in bsls_escape_list { - return no_match_found,i-in_i+1 + return no_match_found, i-in_i+1 } - return err_syntax_error,i-in_i+1 + return err_syntax_error, i-in_i+1 } // at the present time we manage only one char after the \ @@ -465,10 +461,10 @@ const( struct CharClass { mut: - cc_type int = cc_null // type of cc token - ch0 rune // first char of the interval a-b a in this case - ch1 rune // second char of the interval a-b b in this case - validator FnValidator // validator function pointer + cc_type int = cc_null // type of cc token + ch0 rune // first char of the interval a-b a in this case + ch1 rune // second char of the interval a-b b in this case + validator FnValidator // validator function pointer } enum CharClass_parse_state { @@ -562,7 +558,7 @@ fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, rune) { mut i := in_i mut tmp_index := re.cc_index - res_index := re.cc_index + res_index := re.cc_index mut cc_type := u32(ist_char_class_pos) @@ -570,7 +566,7 @@ fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, rune) { // check if we are out of memory for char classes if tmp_index >= re.cc.len { - return err_cc_alloc_overflow,0,u32(0) + return err_cc_alloc_overflow, 0, u32(0) } // get our char @@ -710,7 +706,7 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) { // exit on no compatible char with {} quantifier if utf8util_char_len(ch) != 1 { - return err_syntax_error,i,0,false + return err_syntax_error, i, 0, false } // min parsing skip if comma present @@ -913,14 +909,13 @@ fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) { // // compile return (return code, index) where index is the index of the error in the query string if return code is an error code [deprecated] -pub fn (mut re RE) compile(in_txt string) (int,int) { +pub fn (mut re RE) compile(in_txt string) (int, int) { return re.impl_compile(in_txt) } fn (mut re RE) impl_compile(in_txt string) (int,int) { mut i := 0 // input string index mut pc := 0 // program counter - mut tmp_code := u32(0) // group management variables mut group_count := -1 @@ -932,7 +927,6 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) { i = 0 for i < in_txt.len { - tmp_code = u32(0) mut char_tmp := u32(0) mut char_len := 0 //println("i: ${i:3d} ch: ${in_txt.str[i]:c}") @@ -958,20 +952,20 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) { //check max groups allowed if group_count > re.group_max { - return err_groups_overflow,i+1 + return err_groups_overflow, i+1 } group_stack_index++ // check max nested groups allowed if group_stack_index > re.group_max_nested { - return err_groups_max_nested,i+1 + return err_groups_max_nested, i+1 } tmp_res, cgroup_flag, cgroup_name, next_i := re.parse_groups(in_txt,i) // manage question mark format error if tmp_res < -1 { - return err_group_qm_notation,next_i + return err_group_qm_notation, next_i } //println("Parse group: [$tmp_res, $cgroup_flag, ($i,$next_i), '${in_txt[i..next_i]}' ]") @@ -988,10 +982,10 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) { if cgroup_name.len > 0 { //println("GROUP NAME: ${cgroup_name}") if cgroup_name in re.group_map{ - group_id = re.group_map[cgroup_name]-1 + group_id = re.group_map[cgroup_name] - 1 group_count-- } else { - re.group_map[cgroup_name] = group_id+1 + re.group_map[cgroup_name] = group_id + 1 } } @@ -1018,7 +1012,7 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) { // ist_group_end if char_len==1 && pc > 0 && byte(char_tmp) == `)` { if group_stack_index < 0 { - return err_group_not_balanced,i+1 + return err_group_not_balanced, i+1 } goto_pc := group_stack[group_stack_index] @@ -1161,7 +1155,7 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) { } // if not an escape or a bsls char then it is an error (at least for now!) else { - return bsls_index,i+tmp + return bsls_index, i+tmp } } } @@ -1192,7 +1186,7 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) { } // store the number of groups in the query - re.group_count = group_count+1 + re.group_count = group_count + 1 //****************************************** // Post processing @@ -1482,33 +1476,35 @@ fn state_str(s Match_state) string { struct StateObj { pub mut: - match_flag bool + match_flag bool match_index int = -1 match_first int = -1 } pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { // result status - mut result := no_match_found // function return + mut result := no_match_found // function return mut first_match := -1 //index of the first match - mut i := 0 // source string index - mut ch := rune(0) // examinated char - mut char_len := 0 // utf8 examinated char len - mut m_state := Match_state.start // start point for the matcher FSM + mut i := 0 // source string index + mut ch := rune(0) // examinated char + mut char_len := 0 // utf8 examinated char len + mut m_state := Match_state.start // start point for the matcher FSM - mut pc := -1 // program counter - mut state := StateObj{} // actual state - mut ist := rune(0) // actual instruction - mut l_ist :=rune(0) // last matched instruction + mut pc := -1 // program counter + mut state := StateObj{} // actual state + mut ist := rune(0) // actual instruction + mut l_ist :=rune(0) // last matched instruction - mut group_stack := [-1].repeat(re.group_max) - mut group_data := [-1].repeat(re.group_max) + //mut group_stack := [-1].repeat(re.group_max) + //mut group_data := [-1].repeat(re.group_max) + mut group_stack := []int{len: re.group_max, init: -1} + mut group_data := []int{len: re.group_max, init: -1} - mut group_index := -1 // group id used to know how many groups are open + mut group_index := -1 // group id used to know how many groups are open - mut step_count := 0 // stats for debug - mut dbg_line := 0 // count debug line printed + mut step_count := 0 // stats for debug + mut dbg_line := 0 // count debug line printed re.reset() @@ -1535,7 +1531,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { // DEBUG LOG //****************************************** if re.debug>0 { - mut buf2 := strings.new_builder(re.cc.len+128) + mut buf2 := strings.new_builder(re.cc.len + 128) // print all the instructions @@ -1658,7 +1654,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { // starting and init if m_state == .start { pc = -1 - i = 0 + i = 0 m_state = .ist_next continue } @@ -1962,7 +1958,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { /*********************************** * Quantifier management ***********************************/ - // ist_quant_ng + // ist_quant_ng => quantifier negative test on group if m_state == .ist_quant_ng { // we are finished here @@ -2039,7 +2035,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { return err_internal_error, i } - // ist_quant_pg + // ist_quant_pg => quantifier positive test on group else if m_state == .ist_quant_pg { //println(".ist_quant_pg") mut tmp_pc := pc @@ -2084,7 +2080,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { return err_internal_error, i } - // ist_quant_n + // ist_quant_n => quantifier negative test on token else if m_state == .ist_quant_n { rep := re.prog[pc].rep //println("Here!! PC $pc is_next_or: ${re.prog[pc].next_is_or}") @@ -2125,7 +2121,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { //return no_match_found, 0 } - // ist_quant_p + // ist_quant_p => quantifier positive test on token else if m_state == .ist_quant_p { // exit on first match if (re.flag & f_efm) != 0 { @@ -2255,7 +2251,7 @@ pub fn (mut re RE) find(in_txt string) (int,int) { start, end := re.match_base(in_txt.str, in_txt.len) re.flag = old_flag if start >= 0 && end > start { - return start,end + return start, end } return no_match_found, 0 }