regex: fix compilation issues with gcc under ubuntu (#7112)

2020-12-03 19:33:53 +01:00 · 2020-12-03 19:33:53 +01:00 · 15ffce1317
parent 793f9ae9e3
commit 15ffce1317
1 changed files with 86 additions and 90 deletions
--- a/vlib/regex/regex.v
+++ b/vlib/regex/regex.v
@ -27,7 +27,7 @@ pub const(
 	// spaces chars (here only westerns!!) TODO: manage all the spaces from unicode
 	spaces = [` `, `\t`, `\n`, `\r`, `\v`, `\f`]
 	// new line chars for now only '\n'
-	new_line_list = [`\n`,`\r`]
+	new_line_list = [`\n`, `\r`]
 	// Results
 	no_match_found          = -1
@ -49,7 +49,7 @@ const(
 	//*************************************
 	// regex program instructions
 	//*************************************
-	ist_simple_char  = u32(0x7FFFFFFF)   // single char instruction, 31 bit available to char
+	ist_simple_char  = u32(0x7FFFFFFF)  // single char instruction, 31 bit available to char
 	// char class 11 0100 AA xxxxxxxx
 	// AA = 00  regular class
@ -92,9 +92,7 @@ fn utf8util_char_len(b byte) int {
 fn (re RE) get_char(in_txt string, i int) (u32,int) {
 	ini := unsafe {in_txt.str[i]}
 	// ascii 8 bit
-	if (re.flag & f_bin) !=0 ||
+	if (re.flag & f_bin) !=0 ||	ini & 0x80 == 0 {
-		ini & 0x80 == 0
-	{
 		return u32(ini), 1
 	}
 	// unicode char
@ -102,7 +100,7 @@ fn (re RE) get_char(in_txt string, i int) (u32,int) {
 	mut tmp := 0
 	mut ch := u32(0)
 	for tmp < char_len {
-		ch = (ch << 8) | unsafe {in_txt.str[i+tmp]}
+		ch = (ch << 8) | unsafe {in_txt.str[i + tmp]}
 		tmp++
 	}
 	return ch,char_len
@ -112,9 +110,7 @@ fn (re RE) get_char(in_txt string, i int) (u32,int) {
 [inline]
 fn (re RE) get_charb(in_txt byteptr, i int) (u32,int) {
 	// ascii 8 bit
-	if (re.flag & f_bin) !=0 ||
+	if (re.flag & f_bin) !=0 ||	unsafe {in_txt[i]} & 0x80 == 0 {
-		unsafe {in_txt[i]} & 0x80 == 0
-	{
 		return u32(unsafe {in_txt[i]}), 1
 	}
 	// unicode char
@ -122,7 +118,7 @@ fn (re RE) get_charb(in_txt byteptr, i int) (u32,int) {
 	mut tmp := 0
 	mut ch := u32(0)
 	for tmp < char_len {
-		ch = (ch << 8) | unsafe {in_txt[i+tmp]}
+		ch = (ch << 8) | unsafe {in_txt[i + tmp]}
 		tmp++
 	}
 	return ch,char_len
@ -131,11 +127,11 @@ fn (re RE) get_charb(in_txt byteptr, i int) (u32,int) {
 [inline]
 fn is_alnum(in_char byte) bool {
 	mut tmp := in_char - `A`
-	if tmp >= 0x00 && tmp <= 25 { return true }
+	if tmp <= 25 { return true }
 	tmp = in_char - `a`
-	if tmp >= 0x00 && tmp <= 25 { return true }
+	if tmp <= 25 { return true }
 	tmp = in_char - `0`
-	if tmp >= 0x00 && tmp <= 9  { return true }
+	if tmp <= 9  { return true }
 	if tmp == `_` { return true }
 	return false
 }
@ -158,7 +154,7 @@ fn is_not_space(in_char byte) bool {
 [inline]
 fn is_digit(in_char byte) bool {
 	tmp := in_char - `0`
-	return tmp <= 0x09 && tmp >= 0
+	return tmp <= 0x09
 }
 [inline]
@ -179,13 +175,13 @@ fn is_not_wordchar(in_char byte) bool {
 [inline]
 fn is_lower(in_char byte) bool {
 	tmp := in_char - `a`
-	return  tmp >= 0x00 && tmp <= 25
+	return tmp <= 25
 }
 [inline]
 fn is_upper(in_char byte) bool {
 	tmp := in_char - `A`
-	return  tmp >= 0x00 && tmp <= 25
+	return tmp <= 25
 }
 pub fn (re RE) get_parse_error_string(err int) string {
@ -211,7 +207,7 @@ fn utf8_str(ch rune) string {
 	mut i := 4
 	mut res := ""
 	for i > 0 {
-		v := byte((ch >> ((i-1)*8)) & 0xFF)
+		v := byte((ch >> ((i - 1) * 8)) & 0xFF)
 		if v != 0{
 			res += "${v:1c}"
 		}
@ -236,30 +232,30 @@ mut:
 	ist rune
 	// char
-	ch rune   // char of the token if any
+	ch rune                     // char of the token if any
-	ch_len byte             // char len
+	ch_len byte                 // char len
 	// Quantifiers / branch
 	rep_min         int         // used also for jump next in the OR branch [no match] pc jump
 	rep_max         int         // used also for jump next in the OR branch [   match] pc jump
-	greedy          bool    // greedy quantifier flag
+	greedy          bool        // greedy quantifier flag
 	// Char class
-	cc_index        int    = -1
+	cc_index        int = -1
 	// counters for quantifier check (repetitions)
 	rep             int
 	// validator function pointer
-	validator FnValidator
+	validator       FnValidator
 	// groups variables
-	group_rep          int      // repetition of the group
+	group_rep       int        // repetition of the group
-	group_id           int = -1    // id of the group
+	group_id        int = -1   // id of the group
-	goto_pc            int = -1    // jump to this PC if is needed
+	goto_pc         int = -1   // jump to this PC if is needed
 	// OR flag for the token
-	next_is_or bool        // true if the next token is an OR
+	next_is_or      bool       // true if the next token is an OR
 }
 [inline]
@ -310,34 +306,34 @@ pub mut:
 	// groups
-	group_count int        // number of groups in this regex struct
+	group_count       int              // number of groups in this regex struct
-	groups []int               // groups index results
+	groups            []int            // groups index results
-	group_max_nested int = 3   // max nested group
+	group_max_nested  int  = 3         // max nested group
-	group_max int        = 8   // max allowed number of different groups
+	group_max         int  = 8         // max allowed number of different groups
-	group_csave []int    = []int{}  // groups continuous save array
+	group_csave       []int = []int{}  // groups continuous save array
-	group_csave_index int= -1       // groups continuous save index
+	group_csave_index int = -1         // groups continuous save index
-	group_map map[string]int   // groups names map
+	group_map         map[string]int   // groups names map
 	// flags
-	flag int                   // flag for optional parameters
+	flag              int              // flag for optional parameters
 	// Debug/log
-	debug int                          // enable in order to have the unroll of the code 0 = NO_DEBUG, 1 = LIGHT 2 = VERBOSE
+	debug             int             // enable in order to have the unroll of the code 0 = NO_DEBUG, 1 = LIGHT 2 = VERBOSE
-	log_func FnLog       = simple_log  // log function, can be customized by the user
+	log_func          FnLog = simple_log  // log function, can be customized by the user
-	query string                   // query string
+	query             string          // query string
 }
 // Reset RE object
 //[inline]
 fn (mut re RE) reset(){
-	re.cc_index         = 0
+	re.cc_index = 0
 	mut i := 0
 	for i < re.prog.len {
-		re.prog[i].group_rep          = 0 // clear repetition of the group
+		re.prog[i].group_rep = 0 // clear repetition of the group
-		re.prog[i].rep                = 0 // clear repetition of the token
+		re.prog[i].rep       = 0 // clear repetition of the token
 		i++
 	}
 	re.groups = [-1].repeat(re.group_count*2)
@ -347,7 +343,7 @@ fn (mut re RE) reset(){
 	// reset group_csave
 	if re.group_csave.len > 0 {
 		re.group_csave_index = 1
-		re.group_csave[0] = 0     // reset the capture count
+		re.group_csave[0]    = 0 // reset the capture count
 	}
 }
@ -356,8 +352,8 @@ fn (mut re RE) reset(){
 fn (mut re RE) reset_src(){
 	mut i := 0
 	for i < re.prog.len {
-		re.prog[i].group_rep          = 0 // clear repetition of the group
+		re.prog[i].group_rep = 0 // clear repetition of the group
-		re.prog[i].rep                = 0 // clear repetition of the token
+		re.prog[i].rep       = 0 // clear repetition of the token
 		i++
 	}
 	re.state_stack_index = -1
@ -367,8 +363,8 @@ fn (mut re RE) reset_src(){
 pub fn (re RE) get_group(group_name string) (int, int) {
 	if group_name in re.group_map {
 		tmp_index := re.group_map[group_name]-1
-		start := re.groups[tmp_index*2]
+		start     := re.groups[tmp_index * 2]
-		end := re.groups[tmp_index*2+1]
+		end       := re.groups[tmp_index * 2 + 1]
 		return start,end
 	}
 	return -1, -1
@ -397,7 +393,7 @@ const(
 	]
 	// these chars are escape if preceded by a \
-	bsls_escape_list = [ `\\`,`|`,`.`,`*`,`+`,`-`,`{`,`}`,`[`,`]` ]
+	bsls_escape_list = [`\\`, `|`, `.`, `*`, `+`, `-`, `{`, `}`, `[`, `]`]
 )
 enum BSLS_parse_state {
@ -414,7 +410,7 @@ fn (re RE) parse_bsls(in_txt string, in_i int) (int,int){
 	for i < in_txt.len {
 		// get our char
-		char_tmp,char_len := re.get_char(in_txt,i)
+		char_tmp, char_len := re.get_char(in_txt, i)
 		ch := byte(char_tmp)
 		if status == .start && ch == `\\` {
@ -427,7 +423,7 @@ fn (re RE) parse_bsls(in_txt string, in_i int) (int,int){
 		if status == .bsls_found {
 			for c,x in bsls_validator_array {
 				if x.ch == ch {
-					return c,i-in_i+1
+					return c, i-in_i+1
 				}
 			}
 			status = .normal_char
@ -437,9 +433,9 @@ fn (re RE) parse_bsls(in_txt string, in_i int) (int,int){
 		// no BSLS validator, manage as normal escape char char
 		if status == .normal_char {
 			if ch in bsls_escape_list {
-				return no_match_found,i-in_i+1
+				return no_match_found, i-in_i+1
 			}
-			return err_syntax_error,i-in_i+1
+			return err_syntax_error, i-in_i+1
 		}
 		// at the present time we manage only one char after the \
@ -465,10 +461,10 @@ const(
 struct CharClass {
 mut:
-	cc_type int = cc_null      // type of cc token
+	cc_type   int = cc_null // type of cc token
-	ch0 rune       // first char of the interval a-b  a in this case
+	ch0       rune          // first char of the interval a-b  a in this case
-	ch1 rune	   // second char of the interval a-b b in this case
+	ch1       rune	        // second char of the interval a-b b in this case
-	validator FnValidator      // validator function pointer
+	validator FnValidator   // validator function pointer
 }
 enum CharClass_parse_state {
@ -562,7 +558,7 @@ fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, rune) {
 	mut i := in_i
 	mut tmp_index := re.cc_index
-	res_index := re.cc_index
+	res_index     := re.cc_index
 	mut cc_type := u32(ist_char_class_pos)
@ -570,7 +566,7 @@ fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, rune) {
 		// check if we are out of memory for char classes
 		if tmp_index >= re.cc.len {
-			return err_cc_alloc_overflow,0,u32(0)
+			return err_cc_alloc_overflow, 0, u32(0)
 		}
 		// get our char
@ -710,7 +706,7 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
 		// exit on no compatible char with {} quantifier
 		if utf8util_char_len(ch) != 1 {
-			return err_syntax_error,i,0,false
+			return err_syntax_error, i, 0, false
 		}
 		// min parsing skip if comma present
@ -913,14 +909,13 @@ fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
 //
 // compile return (return code, index) where index is the index of the error in the query string if return code is an error code
 [deprecated]
-pub fn (mut re RE) compile(in_txt string) (int,int) {
+pub fn (mut re RE) compile(in_txt string) (int, int) {
 	return re.impl_compile(in_txt)
 }
 fn (mut re RE) impl_compile(in_txt string) (int,int) {
 	mut i        := 0      // input string index
 	mut pc       := 0      // program counter
 	mut tmp_code := u32(0)
 	// group management variables
 	mut group_count           := -1
@ -932,7 +927,6 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {
 	i = 0
 	for i < in_txt.len {
 		tmp_code = u32(0)
 		mut char_tmp := u32(0)
 		mut char_len := 0
 		//println("i: ${i:3d} ch: ${in_txt.str[i]:c}")
@ -958,20 +952,20 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {
 			//check max groups allowed
 			if group_count > re.group_max {
-				return err_groups_overflow,i+1
+				return err_groups_overflow, i+1
 			}
 			group_stack_index++
 			// check max nested groups allowed
 			if group_stack_index > re.group_max_nested {
-				return err_groups_max_nested,i+1
+				return err_groups_max_nested, i+1
 			}
 			tmp_res, cgroup_flag, cgroup_name, next_i := re.parse_groups(in_txt,i)
 			// manage question mark format error
 			if tmp_res < -1 {
-				return err_group_qm_notation,next_i
+				return err_group_qm_notation, next_i
 			}
 			//println("Parse group: [$tmp_res, $cgroup_flag, ($i,$next_i), '${in_txt[i..next_i]}' ]")
@ -988,10 +982,10 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {
 			if cgroup_name.len > 0 {
 				//println("GROUP NAME: ${cgroup_name}")
 				if cgroup_name in re.group_map{
-					group_id = re.group_map[cgroup_name]-1
+					group_id = re.group_map[cgroup_name] - 1
 					group_count--
 				} else {
-					re.group_map[cgroup_name] = group_id+1
+					re.group_map[cgroup_name] = group_id + 1
 				}
 			}
@ -1018,7 +1012,7 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {
 		// ist_group_end
 		if char_len==1 && pc > 0 && byte(char_tmp) == `)` {
 			if group_stack_index < 0 {
-				return err_group_not_balanced,i+1
+				return err_group_not_balanced, i+1
 			}
 			goto_pc := group_stack[group_stack_index]
@ -1161,7 +1155,7 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {
 				}
 				// if not an escape or a bsls char then it is an error (at least for now!)
 				else {
-					return bsls_index,i+tmp
+					return bsls_index, i+tmp
 				}
 			}
 		}
@ -1192,7 +1186,7 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {
 	}
 	// store the number of groups in the query
-	re.group_count = group_count+1
+	re.group_count = group_count + 1
 	//******************************************
 	// Post processing
@ -1482,33 +1476,35 @@ fn state_str(s Match_state) string {
 struct StateObj {
 pub mut:
-	match_flag bool
+	match_flag  bool
 	match_index int = -1
 	match_first int = -1
 }
 pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 	// result status
-	mut result := no_match_found     // function return
+	mut result      := no_match_found // function return
 	mut first_match := -1             //index of the first match
-	mut i := 0                       // source string index
+	mut i        := 0                 // source string index
-	mut ch := rune(0)                 // examinated char
+	mut ch       := rune(0)           // examinated char
-	mut char_len := 0                // utf8 examinated char len
+	mut char_len := 0                 // utf8 examinated char len
-	mut m_state := Match_state.start // start point for the matcher FSM
+	mut m_state  := Match_state.start // start point for the matcher FSM
-	mut pc := -1                     // program counter
+	mut pc    := -1                   // program counter
-	mut state := StateObj{}          // actual state
+	mut state := StateObj{}           // actual state
-	mut ist := rune(0)                // actual instruction
+	mut ist   := rune(0)              // actual instruction
-	mut l_ist :=rune(0)              // last matched instruction
+	mut l_ist :=rune(0)               // last matched instruction
-	mut group_stack      := [-1].repeat(re.group_max)
+	//mut group_stack      := [-1].repeat(re.group_max)
-	mut group_data       := [-1].repeat(re.group_max)
+	//mut group_data       := [-1].repeat(re.group_max)
+	mut group_stack := []int{len: re.group_max, init: -1}
+	mut group_data  := []int{len: re.group_max, init: -1}
-	mut group_index := -1            // group id used to know how many groups are open
+	mut group_index := -1             // group id used to know how many groups are open
-	mut step_count := 0              // stats for debug
+	mut step_count  := 0              // stats for debug
-	mut dbg_line   := 0              // count debug line printed
+	mut dbg_line    := 0              // count debug line printed
 	re.reset()
@ -1535,7 +1531,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 		// DEBUG LOG
 		//******************************************
 		if re.debug>0 {
-			mut buf2 := strings.new_builder(re.cc.len+128)
+			mut buf2 := strings.new_builder(re.cc.len + 128)
 			// print all the instructions
@ -1658,7 +1654,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 		// starting and init
 		if m_state == .start {
 			pc = -1
-			i = 0
+			i  = 0
 			m_state = .ist_next
 			continue
 		}
@ -1962,7 +1958,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 		/***********************************
 		* Quantifier management
 		***********************************/
-		// ist_quant_ng
+		// ist_quant_ng => quantifier negative test on group
 		if m_state == .ist_quant_ng {
 			// we are finished here
@ -2039,7 +2035,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 			return err_internal_error, i
 		}
-		// ist_quant_pg
+		// ist_quant_pg => quantifier positive test on group
 		else if m_state == .ist_quant_pg {
 			//println(".ist_quant_pg")
 			mut tmp_pc := pc
@ -2084,7 +2080,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 			return err_internal_error, i
 		}
-		// ist_quant_n
+		// ist_quant_n => quantifier negative test on token
 		else if m_state == .ist_quant_n {
 			rep := re.prog[pc].rep
 			//println("Here!! PC $pc is_next_or: ${re.prog[pc].next_is_or}")
@ -2125,7 +2121,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 			//return no_match_found, 0
 		}
-		// ist_quant_p
+		// ist_quant_p => quantifier positive test on token
 		else if m_state == .ist_quant_p {
 			// exit on first match
 			if (re.flag & f_efm) != 0 {
@ -2255,7 +2251,7 @@ pub fn (mut re RE) find(in_txt string) (int,int) {
 	start, end := re.match_base(in_txt.str, in_txt.len)
 	re.flag = old_flag
 	if start >= 0 && end > start {
-		return start,end
+		return start, end
 	}
 	return no_match_found, 0
 }