regex: fix compilation issues with gcc under ubuntu (#7112)

pull/7118/head
penguindark 2020-12-03 19:33:53 +01:00 committed by GitHub
parent 793f9ae9e3
commit 15ffce1317
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 86 additions and 90 deletions

View File

@ -27,7 +27,7 @@ pub const(
// spaces chars (here only westerns!!) TODO: manage all the spaces from unicode // spaces chars (here only westerns!!) TODO: manage all the spaces from unicode
spaces = [` `, `\t`, `\n`, `\r`, `\v`, `\f`] spaces = [` `, `\t`, `\n`, `\r`, `\v`, `\f`]
// new line chars: '\n' and '\r' // new line chars: '\n' and '\r'
new_line_list = [`\n`,`\r`] new_line_list = [`\n`, `\r`]
// Results // Results
no_match_found = -1 no_match_found = -1
@ -92,9 +92,7 @@ fn utf8util_char_len(b byte) int {
fn (re RE) get_char(in_txt string, i int) (u32,int) { fn (re RE) get_char(in_txt string, i int) (u32,int) {
ini := unsafe {in_txt.str[i]} ini := unsafe {in_txt.str[i]}
// ascii 8 bit // ascii 8 bit
if (re.flag & f_bin) !=0 || if (re.flag & f_bin) !=0 || ini & 0x80 == 0 {
ini & 0x80 == 0
{
return u32(ini), 1 return u32(ini), 1
} }
// unicode char // unicode char
@ -102,7 +100,7 @@ fn (re RE) get_char(in_txt string, i int) (u32,int) {
mut tmp := 0 mut tmp := 0
mut ch := u32(0) mut ch := u32(0)
for tmp < char_len { for tmp < char_len {
ch = (ch << 8) | unsafe {in_txt.str[i+tmp]} ch = (ch << 8) | unsafe {in_txt.str[i + tmp]}
tmp++ tmp++
} }
return ch,char_len return ch,char_len
@ -112,9 +110,7 @@ fn (re RE) get_char(in_txt string, i int) (u32,int) {
[inline] [inline]
fn (re RE) get_charb(in_txt byteptr, i int) (u32,int) { fn (re RE) get_charb(in_txt byteptr, i int) (u32,int) {
// ascii 8 bit // ascii 8 bit
if (re.flag & f_bin) !=0 || if (re.flag & f_bin) !=0 || unsafe {in_txt[i]} & 0x80 == 0 {
unsafe {in_txt[i]} & 0x80 == 0
{
return u32(unsafe {in_txt[i]}), 1 return u32(unsafe {in_txt[i]}), 1
} }
// unicode char // unicode char
@ -122,7 +118,7 @@ fn (re RE) get_charb(in_txt byteptr, i int) (u32,int) {
mut tmp := 0 mut tmp := 0
mut ch := u32(0) mut ch := u32(0)
for tmp < char_len { for tmp < char_len {
ch = (ch << 8) | unsafe {in_txt[i+tmp]} ch = (ch << 8) | unsafe {in_txt[i + tmp]}
tmp++ tmp++
} }
return ch,char_len return ch,char_len
@ -131,11 +127,11 @@ fn (re RE) get_charb(in_txt byteptr, i int) (u32,int) {
// is_alnum reports whether in_char is an ASCII letter (`A`..`Z`, `a`..`z`),
// an ASCII digit (`0`..`9`) or the underscore `_`.
[inline]
fn is_alnum(in_char byte) bool {
	if in_char >= `A` && in_char <= `Z` {
		return true
	}
	if in_char >= `a` && in_char <= `z` {
		return true
	}
	if in_char >= `0` && in_char <= `9` {
		return true
	}
	// the underscore must be tested on the input char itself, not on an
	// offset value left over from one of the range checks
	return in_char == `_`
}
@ -158,7 +154,7 @@ fn is_not_space(in_char byte) bool {
// is_digit reports whether in_char is one of the ASCII digits `0`..`9`.
[inline]
fn is_digit(in_char byte) bool {
	return in_char >= `0` && in_char <= `9`
}
[inline] [inline]
@ -179,13 +175,13 @@ fn is_not_wordchar(in_char byte) bool {
// is_lower reports whether in_char is an ASCII lowercase letter `a`..`z`.
[inline]
fn is_lower(in_char byte) bool {
	return in_char >= `a` && in_char <= `z`
}
// is_upper reports whether in_char is an ASCII uppercase letter `A`..`Z`.
[inline]
fn is_upper(in_char byte) bool {
	return in_char >= `A` && in_char <= `Z`
}
pub fn (re RE) get_parse_error_string(err int) string { pub fn (re RE) get_parse_error_string(err int) string {
@ -211,7 +207,7 @@ fn utf8_str(ch rune) string {
mut i := 4 mut i := 4
mut res := "" mut res := ""
for i > 0 { for i > 0 {
v := byte((ch >> ((i-1)*8)) & 0xFF) v := byte((ch >> ((i - 1) * 8)) & 0xFF)
if v != 0{ if v != 0{
res += "${v:1c}" res += "${v:1c}"
} }
@ -316,7 +312,7 @@ pub mut:
group_max int = 8 // max allowed number of different groups group_max int = 8 // max allowed number of different groups
group_csave []int = []int{} // groups continuous save array group_csave []int = []int{} // groups continuous save array
group_csave_index int= -1 // groups continuous save index group_csave_index int = -1 // groups continuous save index
group_map map[string]int // groups names map group_map map[string]int // groups names map
@ -367,8 +363,8 @@ fn (mut re RE) reset_src(){
pub fn (re RE) get_group(group_name string) (int, int) { pub fn (re RE) get_group(group_name string) (int, int) {
if group_name in re.group_map { if group_name in re.group_map {
tmp_index := re.group_map[group_name]-1 tmp_index := re.group_map[group_name]-1
start := re.groups[tmp_index*2] start := re.groups[tmp_index * 2]
end := re.groups[tmp_index*2+1] end := re.groups[tmp_index * 2 + 1]
return start,end return start,end
} }
return -1, -1 return -1, -1
@ -397,7 +393,7 @@ const(
] ]
// these chars are escape if preceded by a \ // these chars are escape if preceded by a \
bsls_escape_list = [ `\\`,`|`,`.`,`*`,`+`,`-`,`{`,`}`,`[`,`]` ] bsls_escape_list = [`\\`, `|`, `.`, `*`, `+`, `-`, `{`, `}`, `[`, `]`]
) )
enum BSLS_parse_state { enum BSLS_parse_state {
@ -414,7 +410,7 @@ fn (re RE) parse_bsls(in_txt string, in_i int) (int,int){
for i < in_txt.len { for i < in_txt.len {
// get our char // get our char
char_tmp,char_len := re.get_char(in_txt,i) char_tmp, char_len := re.get_char(in_txt, i)
ch := byte(char_tmp) ch := byte(char_tmp)
if status == .start && ch == `\\` { if status == .start && ch == `\\` {
@ -427,7 +423,7 @@ fn (re RE) parse_bsls(in_txt string, in_i int) (int,int){
if status == .bsls_found { if status == .bsls_found {
for c,x in bsls_validator_array { for c,x in bsls_validator_array {
if x.ch == ch { if x.ch == ch {
return c,i-in_i+1 return c, i-in_i+1
} }
} }
status = .normal_char status = .normal_char
@ -437,9 +433,9 @@ fn (re RE) parse_bsls(in_txt string, in_i int) (int,int){
// no BSLS validator, manage as normal escape char // no BSLS validator, manage as normal escape char
if status == .normal_char { if status == .normal_char {
if ch in bsls_escape_list { if ch in bsls_escape_list {
return no_match_found,i-in_i+1 return no_match_found, i-in_i+1
} }
return err_syntax_error,i-in_i+1 return err_syntax_error, i-in_i+1
} }
// at the present time we manage only one char after the \ // at the present time we manage only one char after the \
@ -570,7 +566,7 @@ fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, rune) {
// check if we are out of memory for char classes // check if we are out of memory for char classes
if tmp_index >= re.cc.len { if tmp_index >= re.cc.len {
return err_cc_alloc_overflow,0,u32(0) return err_cc_alloc_overflow, 0, u32(0)
} }
// get our char // get our char
@ -710,7 +706,7 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
// exit on no compatible char with {} quantifier // exit on no compatible char with {} quantifier
if utf8util_char_len(ch) != 1 { if utf8util_char_len(ch) != 1 {
return err_syntax_error,i,0,false return err_syntax_error, i, 0, false
} }
// min parsing skip if comma present // min parsing skip if comma present
@ -913,14 +909,13 @@ fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
// //
// compile is the deprecated entry point that forwards in_txt to impl_compile.
// It returns (return code, index); when the return code is an error code,
// index is the position of the error in the query string.
[deprecated]
pub fn (mut re RE) compile(in_txt string) (int, int) {
	ret_code, err_index := re.impl_compile(in_txt)
	return ret_code, err_index
}
fn (mut re RE) impl_compile(in_txt string) (int,int) { fn (mut re RE) impl_compile(in_txt string) (int,int) {
mut i := 0 // input string index mut i := 0 // input string index
mut pc := 0 // program counter mut pc := 0 // program counter
mut tmp_code := u32(0)
// group management variables // group management variables
mut group_count := -1 mut group_count := -1
@ -932,7 +927,6 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {
i = 0 i = 0
for i < in_txt.len { for i < in_txt.len {
tmp_code = u32(0)
mut char_tmp := u32(0) mut char_tmp := u32(0)
mut char_len := 0 mut char_len := 0
//println("i: ${i:3d} ch: ${in_txt.str[i]:c}") //println("i: ${i:3d} ch: ${in_txt.str[i]:c}")
@ -958,20 +952,20 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {
//check max groups allowed //check max groups allowed
if group_count > re.group_max { if group_count > re.group_max {
return err_groups_overflow,i+1 return err_groups_overflow, i+1
} }
group_stack_index++ group_stack_index++
// check max nested groups allowed // check max nested groups allowed
if group_stack_index > re.group_max_nested { if group_stack_index > re.group_max_nested {
return err_groups_max_nested,i+1 return err_groups_max_nested, i+1
} }
tmp_res, cgroup_flag, cgroup_name, next_i := re.parse_groups(in_txt,i) tmp_res, cgroup_flag, cgroup_name, next_i := re.parse_groups(in_txt,i)
// manage question mark format error // manage question mark format error
if tmp_res < -1 { if tmp_res < -1 {
return err_group_qm_notation,next_i return err_group_qm_notation, next_i
} }
//println("Parse group: [$tmp_res, $cgroup_flag, ($i,$next_i), '${in_txt[i..next_i]}' ]") //println("Parse group: [$tmp_res, $cgroup_flag, ($i,$next_i), '${in_txt[i..next_i]}' ]")
@ -988,10 +982,10 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {
if cgroup_name.len > 0 { if cgroup_name.len > 0 {
//println("GROUP NAME: ${cgroup_name}") //println("GROUP NAME: ${cgroup_name}")
if cgroup_name in re.group_map{ if cgroup_name in re.group_map{
group_id = re.group_map[cgroup_name]-1 group_id = re.group_map[cgroup_name] - 1
group_count-- group_count--
} else { } else {
re.group_map[cgroup_name] = group_id+1 re.group_map[cgroup_name] = group_id + 1
} }
} }
@ -1018,7 +1012,7 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {
// ist_group_end // ist_group_end
if char_len==1 && pc > 0 && byte(char_tmp) == `)` { if char_len==1 && pc > 0 && byte(char_tmp) == `)` {
if group_stack_index < 0 { if group_stack_index < 0 {
return err_group_not_balanced,i+1 return err_group_not_balanced, i+1
} }
goto_pc := group_stack[group_stack_index] goto_pc := group_stack[group_stack_index]
@ -1161,7 +1155,7 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {
} }
// if not an escape or a bsls char then it is an error (at least for now!) // if not an escape or a bsls char then it is an error (at least for now!)
else { else {
return bsls_index,i+tmp return bsls_index, i+tmp
} }
} }
} }
@ -1192,7 +1186,7 @@ fn (mut re RE) impl_compile(in_txt string) (int,int) {
} }
// store the number of groups in the query // store the number of groups in the query
re.group_count = group_count+1 re.group_count = group_count + 1
//****************************************** //******************************************
// Post processing // Post processing
@ -1502,8 +1496,10 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
mut ist := rune(0) // actual instruction mut ist := rune(0) // actual instruction
mut l_ist :=rune(0) // last matched instruction mut l_ist :=rune(0) // last matched instruction
mut group_stack := [-1].repeat(re.group_max) //mut group_stack := [-1].repeat(re.group_max)
mut group_data := [-1].repeat(re.group_max) //mut group_data := [-1].repeat(re.group_max)
mut group_stack := []int{len: re.group_max, init: -1}
mut group_data := []int{len: re.group_max, init: -1}
mut group_index := -1 // group id used to know how many groups are open mut group_index := -1 // group id used to know how many groups are open
@ -1535,7 +1531,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
// DEBUG LOG // DEBUG LOG
//****************************************** //******************************************
if re.debug>0 { if re.debug>0 {
mut buf2 := strings.new_builder(re.cc.len+128) mut buf2 := strings.new_builder(re.cc.len + 128)
// print all the instructions // print all the instructions
@ -1962,7 +1958,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
/*********************************** /***********************************
* Quantifier management * Quantifier management
***********************************/ ***********************************/
// ist_quant_ng // ist_quant_ng => quantifier negative test on group
if m_state == .ist_quant_ng { if m_state == .ist_quant_ng {
// we are finished here // we are finished here
@ -2039,7 +2035,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
return err_internal_error, i return err_internal_error, i
} }
// ist_quant_pg // ist_quant_pg => quantifier positive test on group
else if m_state == .ist_quant_pg { else if m_state == .ist_quant_pg {
//println(".ist_quant_pg") //println(".ist_quant_pg")
mut tmp_pc := pc mut tmp_pc := pc
@ -2084,7 +2080,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
return err_internal_error, i return err_internal_error, i
} }
// ist_quant_n // ist_quant_n => quantifier negative test on token
else if m_state == .ist_quant_n { else if m_state == .ist_quant_n {
rep := re.prog[pc].rep rep := re.prog[pc].rep
//println("Here!! PC $pc is_next_or: ${re.prog[pc].next_is_or}") //println("Here!! PC $pc is_next_or: ${re.prog[pc].next_is_or}")
@ -2125,7 +2121,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
//return no_match_found, 0 //return no_match_found, 0
} }
// ist_quant_p // ist_quant_p => quantifier positive test on token
else if m_state == .ist_quant_p { else if m_state == .ist_quant_p {
// exit on first match // exit on first match
if (re.flag & f_efm) != 0 { if (re.flag & f_efm) != 0 {
@ -2255,7 +2251,7 @@ pub fn (mut re RE) find(in_txt string) (int,int) {
start, end := re.match_base(in_txt.str, in_txt.len) start, end := re.match_base(in_txt.str, in_txt.len)
re.flag = old_flag re.flag = old_flag
if start >= 0 && end > start { if start >= 0 && end > start {
return start,end return start, end
} }
return no_match_found, 0 return no_match_found, 0
} }