/*
regex 1.0 alpha

Copyright (c) 2019-2020 Dario Deledda. All rights reserved.
Use of this source code is governed by an MIT license
that can be found in the LICENSE file.

This file contains regex module

Known limitations:
- find is implemented in a trivial way
- not fully compliant with PCRE
- not compliant with POSIX ERE
*/
module regex

import strings

pub const(
	v_regex_version = "1.0 alpha" // regex module version

	max_code_len = 256 // default small base code len for the regex programs
	max_quantifier = 1073741824 // default max repetitions allowed for the quantifiers = 2^30

	// spaces chars (here only westerns!!) TODO: manage all the spaces from unicode
	spaces = [` `, `\t`, `\n`, `\r`, `\v`, `\f`]
	// new line chars, for now only '\n' and '\r'
	new_line_list = [`\n`, `\r`]

	// Results
	no_match_found = -1

	// Errors
	compile_ok = 0 // the regex string compiled, all ok
	err_char_unknown = -2 // the char used is unknown to the system
	err_undefined = -3 // the compiler symbol is undefined
	err_internal_error = -4 // Bug in the regex system!!
	err_cc_alloc_overflow = -5 // memory for char class full!!
	err_syntax_error = -6 // syntax error in regex compiling
	err_groups_overflow = -7 // max number of groups reached
	err_groups_max_nested = -8 // max number of nested group reached
	err_group_not_balanced = -9 // group not balanced
	err_group_qm_notation = -10 // group invalid notation
)

const(
	//*************************************
	// regex program instructions
	//*************************************
	ist_simple_char = u32(0x7FFFFFFF) // single char instruction, 31 bit available to char

	// char class 11 0100 AA xxxxxxxx
	// AA = 00 regular class
	// AA = 01 Negated class ^ char
	ist_char_class = 0xD1000000 // MASK
	ist_char_class_pos = 0xD0000000 // char class normal [abc]
	ist_char_class_neg = 0xD1000000 // char class negate [^abc]

	// dot char 10 0110 xx xxxxxxxx
	ist_dot_char = 0x98000000 // match any char except \n

	// backslash chars 10 0100 xx xxxxxxxx
	ist_bsls_char = 0x90000000 // backslash char

	// OR | 10 010Y xx xxxxxxxx
	ist_or_branch = 0x91000000 // OR case

	// groups 10 010Y xx xxxxxxxx
	ist_group_start = 0x92000000 // group start (
	ist_group_end = 0x94000000 // group end )

	// control instructions
	ist_prog_end = u32(0x88000000) //10 0010 xx xxxxxxxx
	//*************************************
)

/*
General Utilities
*/

// utf8util_char_len calculate the length in bytes of a utf8 char
// starting from its first byte `b` (result is in the range 1..4)
[inline]
fn utf8util_char_len(b byte) int {
	return (( 0xe5000000 >> (( b >> 3 ) & 0x1e )) & 3 ) + 1
}

// get_char get a char from position i of a string.
// It returns (char, len): for ascii chars, or when the f_bin flag is set,
// the single byte and 1; otherwise the raw utf8 bytes of the char packed
// in an u32 (first byte in the most significant position) and their count.
// No bounds check is done on `i`.
[inline] [direct_array_access]
fn (re RE) get_char(in_txt string, i int) (u32,int) {
	ini := unsafe {in_txt.str[i]}
	// ascii 8 bit
	if (re.flag & f_bin) !=0 || ini & 0x80 == 0 {
		return u32(ini), 1
	}
	// unicode char
	char_len := utf8util_char_len(ini)
	mut tmp := 0
	mut ch := u32(0)
	for tmp < char_len {
		ch = (ch << 8) | unsafe {in_txt.str[i + tmp]}
		tmp++
	}
	return ch,char_len
}

// get_charb is the byteptr version of get_char: same packing rules,
// the caller must guarantee that the buffer is long enough
[inline] [direct_array_access]
fn (re RE) get_charb(in_txt byteptr, i int) (u32,int) {
	// ascii 8 bit
	if (re.flag & f_bin) !=0 || unsafe {in_txt[i]} & 0x80 == 0 {
		return u32(unsafe {in_txt[i]}), 1
	}
	// unicode char
	char_len := utf8util_char_len(unsafe {in_txt[i]})
	mut tmp := 0
	mut ch := u32(0)
	for tmp < char_len {
		ch = (ch << 8) | unsafe {in_txt[i + tmp]}
		tmp++
	}
	return ch,char_len
}

// is_alnum return true for [A-Za-z0-9_], it is the validator used for \w.
// The range checks rely on the wrap around of the byte subtraction.
[inline]
fn is_alnum(in_char byte) bool {
	mut tmp := in_char - `A`
	if tmp <= 25 { return true }
	tmp = in_char - `a`
	if tmp <= 25 { return true }
	tmp = in_char - `0`
	if tmp <= 9 { return true }
	// the underscore must be checked on the char itself, not on the shifted tmp value
	if in_char == `_` { return true }
	return false
}

// is_not_alnum is the negation of is_alnum, validator used for \W
[inline]
fn is_not_alnum(in_char byte) bool {
	return !is_alnum(in_char)
}

// is_space return true if the char is in the `spaces` list, validator used for \s
[inline]
fn is_space(in_char byte) bool {
	return in_char in spaces
}

// is_not_space is the negation of is_space, validator used for \S
[inline]
fn is_not_space(in_char byte) bool {
	return !is_space(in_char)
}

// is_digit return true for [0-9], validator used for \d
[inline]
fn is_digit(in_char byte) bool {
	tmp := in_char - `0`
	return tmp <= 0x09
}

// is_not_digit is the negation of is_digit, validator used for \D
[inline]
fn is_not_digit(in_char byte) bool {
	return !is_digit(in_char)
}

// is_wordchar return true for [A-Za-z0-9_]
[inline]
fn is_wordchar(in_char byte) bool {
	return is_alnum(in_char) || in_char == `_`
}

// is_not_wordchar is the exact negation of is_wordchar
[inline]
fn is_not_wordchar(in_char byte) bool {
	return !is_wordchar(in_char)
}

// is_lower return true for [a-z], validator used for \a
[inline]
fn is_lower(in_char byte) bool {
	tmp := in_char - `a`
	return tmp <= 25
}

// is_upper return true for [A-Z], validator used for \A
[inline]
fn is_upper(in_char byte) bool {
	tmp := in_char - `A`
	return tmp <= 25
}

// get_parse_error_string return the symbolic name of a compiler/matcher result code,
// "err_unknown" if the code is not one of the module constants
pub fn (re RE) get_parse_error_string(err int) string {
	match err {
		compile_ok { return "compile_ok" }
		no_match_found { return "no_match_found" }
		err_char_unknown { return "err_char_unknown" }
		err_undefined { return "err_undefined" }
		err_internal_error { return "err_internal_error" }
		err_cc_alloc_overflow { return "err_cc_alloc_overflow" }
		err_syntax_error { return "err_syntax_error" }
		err_groups_overflow { return "err_groups_overflow" }
		err_groups_max_nested { return "err_groups_max_nested" }
		err_group_not_balanced { return "err_group_not_balanced" }
		err_group_qm_notation { return "err_group_qm_notation" }
		else { return "err_unknown" }
	}
}

// utf8_str convert an utf8 sequence packed in a rune (see get_char) to a printable string,
// zero bytes are skipped
[inline]
fn utf8_str(ch rune) string {
	mut i := 4
	mut res := ""
	for i > 0 {
		v := byte((ch >> ((i - 1) * 8)) & 0xFF)
		if v != 0{
			res += "${v:1c}"
		}
		i--
	}
	return res
}

// simple_log default log function
fn simple_log(txt string) {
	print(txt)
}

/******************************************************************************
*
* Token Structs
*
******************************************************************************/
pub type FnValidator = fn (byte) bool

// Token is a single instruction of the compiled regex program
struct Token{
mut:
	ist rune

	// char
	ch rune // char of the token if any
	ch_len byte // char len

	// Quantifiers / branch
	rep_min int // used also for jump next in the OR branch [no match] pc jump
	rep_max int // used also for jump next in the OR branch [ match] pc jump
	greedy bool // greedy quantifier flag

	// Char class
	cc_index int = -1

	// counters for quantifier check (repetitions)
	rep int

	// validator function pointer
	validator FnValidator

	// groups variables
	group_rep int // repetition of the group
	group_id int = -1 // id of the group
	goto_pc int = -1 // jump to this PC if is needed

	// OR flag for the token
	next_is_or bool // true if the next token is an OR

	// dot_char token variables
	dot_check_pc int = -1 // pc of the next token to check
	last_dot_flag bool // if true indicate that is the last dot_char in the regex
}

// reset clear the repetition counter of the token
[inline]
fn (mut tok Token) reset() {
	tok.rep = 0
}

/******************************************************************************
*
* Regex struct
*
******************************************************************************/
pub const (
	f_nl = 0x00000001 // end the match when find a new line symbol
	f_ms = 0x00000002 // match true only if the match is at the start of the string
	f_me = 0x00000004 // match true only if the match is at the end of the string

	f_efm = 0x00000100 // exit on first token matched, used by search
	f_bin = 0x00000200 // work only on bytes, ignore utf-8

	// behaviour modifier flags
	f_src = 0x00020000 // search mode enabled
)

// StateDotObj is the matcher state saved for a dot_char token
struct StateDotObj{
mut:
	i int = -1 // char index in the input buffer
	pc int = -1 // program counter saved
	mi int = -1 // match_index saved
	group_stack_index int = -1 // continuous save on capturing groups
}

pub type FnLog = fn (string)

// RE is the regex object: compiled program, char classes, groups data and options
pub struct RE {
pub mut:
	prog []Token
	prog_len int // regex program len

	// char classes storage
	cc []CharClass // char class list
	cc_index int // index

	// groups
	group_count int // number of groups in this regex struct
	groups []int // groups index results
	group_max_nested int = 3 // max nested group
	group_max int = 8 // max allowed number of different groups

	group_csave_flag bool // flag to enable continuous saving
	group_csave []int = []int{} // groups continuous save list

	group_map map[string]int // groups names map

	// flags
	flag int // flag for optional parameters

	// Debug/log
	debug int // enable in order to have the unroll of the code 0 = NO_DEBUG, 1 = LIGHT 2 = VERBOSE
	log_func FnLog = simple_log // log function, can be customized by the user
	query string // query string
}

// Reset RE object
[inline] [direct_array_access]
fn (mut re RE) reset(){
	re.cc_index = 0

	mut i := 0
	for i < re.prog_len {
		re.prog[i].group_rep = 0 // clear repetition of the group
		re.prog[i].rep = 0 // clear repetition of the token
		i++
	}

	// init groups array
	if re.group_count > 0 {
		re.groups = []int{len: re.group_count*2, init: -1}
	}

	// reset group_csave
	if re.group_csave_flag == true {
		re.group_csave.clear() // = []int{}
	}
}

// reset for search mode fail
// gcc bug, dont use [inline] or go 5 time slower
//[inline]
[direct_array_access]
fn (mut re RE) reset_src(){
	mut i := 0
	for i < re.prog_len {
		re.prog[i].group_rep = 0 // clear repetition of the group
		re.prog[i].rep = 0 // clear repetition of the token
		i++
	}
}

/******************************************************************************
*
* Backslashes chars
*
******************************************************************************/
// BslsStruct bind a backslash meta char to its validator function
struct BslsStruct {
	ch rune // meta char
	validator FnValidator // validator function pointer
}

const(
	bsls_validator_array = [
		BslsStruct{`w`, is_alnum},
		BslsStruct{`W`, is_not_alnum},
		BslsStruct{`s`, is_space},
		BslsStruct{`S`, is_not_space},
		BslsStruct{`d`, is_digit},
		BslsStruct{`D`, is_not_digit},
		BslsStruct{`a`, is_lower},
		BslsStruct{`A`, is_upper},
	]

	// these chars are escape if preceded by a \
	bsls_escape_list = [`\\`, `|`, `.`, `:`, `*`, `+`, `-`, `{`, `}`, `[`, `]`, `(`, `)`, `?`]
)

enum BSLS_parse_state {
	start
	bsls_found
	bsls_char
	normal_char
}

// parse_bsls return (index, str_len) bsls_validator_array index, len of the backslash sequence if present.
// index is no_match_found when the sequence is a plain escaped char of bsls_escape_list,
// err_syntax_error when it is neither a validator nor an escapable char.
fn (re RE) parse_bsls(in_txt string, in_i int) (int,int){
	mut status := BSLS_parse_state.start
	mut i := in_i

	for i < in_txt.len {
		// get our char
		char_tmp, char_len := re.get_char(in_txt, i)
		ch := byte(char_tmp)

		if status == .start && ch == `\\` {
			status = .bsls_found
			i += char_len
			continue
		}

		// check if is our bsls char, for now only one length sequence
		if status == .bsls_found {
			for c,x in bsls_validator_array {
				if x.ch == ch {
					return c, i-in_i+1
				}
			}
			// not a validator: re-examine the same char as an escaped char
			status = .normal_char
			continue
		}

		// no BSLS validator, manage as normal escape char
		if status == .normal_char {
			if ch in bsls_escape_list {
				return no_match_found, i-in_i+1
			}
			return err_syntax_error, i-in_i+1
		}

		// at the present time we manage only one char after the \
		break
	}
	// not our bsls return KO
	return err_syntax_error, i
}

/******************************************************************************
*
* Char class
*
******************************************************************************/
const(
	cc_null = 0 // empty cc token
	cc_char = 1 // simple char: a
	cc_int = 2 // char interval: a-z
	cc_bsls = 3 // backslash char
	cc_end = 4 // cc sequence terminator
)

// CharClass is a single element of a char class: a char, an interval or a backslash validator
struct CharClass {
mut:
	cc_type int = cc_null // type of cc token
	ch0 rune // first char of the interval a-b a in this case
	ch1 rune // second char of the interval a-b b in this case
	validator FnValidator // validator function pointer
}

enum CharClass_parse_state {
	start
	in_char
	in_bsls
	separator
	finish
}

// cc_write_packed_char append to the builder the non zero bytes of a packed
// utf8 char (see get_char), most significant byte first
fn cc_write_packed_char(mut sb strings.Builder, ch rune) {
	mut tmp := 3
	for tmp >= 0 {
		x := byte((ch >> (tmp*8)) & 0xFF)
		if x != 0 {
			sb.write_b(x)
		}
		tmp--
	}
}

// get_char_class return the textual form (without the square brackets and the
// negation symbol) of the char class used by the token at `pc`.
// The string is built with a strings.Builder: the previous raw pointer version
// wrote on the array header instead of its data and could overflow the buffer.
fn (re RE) get_char_class(pc int) string {
	mut res := strings.new_builder(re.cc.len * 2 + 1)
	mut cc_i := re.prog[pc].cc_index
	for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != cc_end {
		cc := re.cc[cc_i]
		if cc.cc_type == cc_bsls {
			// backslash validator: \w, \d, ...
			res.write_b(`\\`)
			res.write_b(byte(cc.ch0))
		} else {
			// single char, or interval a-b when the two bounds differ
			cc_write_packed_char(mut res, cc.ch0)
			if cc.ch0 != cc.ch1 {
				res.write_b(`-`)
				cc_write_packed_char(mut res, cc.ch1)
			}
		}
		cc_i++
	}
	return res.str()
}

// check_char_class return true if `ch` is matched by one of the elements of
// the char class used by the token at `pc`; the negation is handled by the caller
fn (re RE) check_char_class(pc int, ch rune) bool {
	mut cc_i := re.prog[pc].cc_index
	for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != cc_end {
		if re.cc[cc_i].cc_type == cc_bsls {
			if re.cc[cc_i].validator(byte(ch)) {
				return true
			}
		}
		else if ch >= re.cc[cc_i].ch0 && ch <= re.cc[cc_i].ch1 {
			return true
		}
		cc_i++
	}
	return false
}

// parse_char_class return (index, str_len, cc_type) of a char class [abcm-p], char class start after the [ char
// index is negative (an error code) if the class is malformed or the cc storage is full
fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, rune) {
	mut status := CharClass_parse_state.start
	mut i := in_i

	mut tmp_index := re.cc_index
	res_index := re.cc_index

	mut cc_type := u32(ist_char_class_pos)

	for i < in_txt.len {

		// check if we are out of memory for char classes
		if tmp_index >= re.cc.len {
			return err_cc_alloc_overflow, 0, u32(0)
		}

		// get our char
		char_tmp,char_len := re.get_char(in_txt,i)
		ch := byte(char_tmp)

		// negation
		if status == .start && ch == `^` {
			cc_type = u32(ist_char_class_neg)
			i += char_len
			continue
		}

		// minus symbol
		if status == .start && ch == `-` {
			re.cc[tmp_index].cc_type = cc_char
			re.cc[tmp_index].ch0 = char_tmp
			re.cc[tmp_index].ch1 = char_tmp
			i += char_len
			tmp_index++
			continue
		}

		// bsls
		if (status == .start || status == .in_char) && ch == `\\` {
			status = .in_bsls
			i += char_len
			continue
		}

		if status == .in_bsls {
			for c,x in bsls_validator_array {
				if x.ch == ch {
					re.cc[tmp_index].cc_type = cc_bsls
					re.cc[tmp_index].ch0 = bsls_validator_array[c].ch
					re.cc[tmp_index].ch1 = bsls_validator_array[c].ch
					re.cc[tmp_index].validator = bsls_validator_array[c].validator
					i += char_len
					tmp_index++
					status = .in_char
					break
				}
			}
			if status == .in_bsls {
				// not a validator: fall through and store it as a simple char
				status = .in_char
			}else {
				continue
			}
		}

		// simple char
		if (status == .start || status == .in_char) &&
			ch != `-` && ch != `]`
		{
			status = .in_char

			re.cc[tmp_index].cc_type = cc_char
			re.cc[tmp_index].ch0 = char_tmp
			re.cc[tmp_index].ch1 = char_tmp

			i += char_len
			tmp_index++
			continue
		}

		// check range separator
		if status == .in_char && ch == `-` {
			status = .separator
			i += char_len
			continue
		}

		// check range end
		if status == .separator && ch != `]` && ch != `-` {
			status = .in_char
			re.cc[tmp_index-1].cc_type = cc_int
			re.cc[tmp_index-1].ch1 = char_tmp
			i += char_len
			continue
		}

		// char class end
		if status == .in_char && ch == `]` {
			re.cc[tmp_index].cc_type = cc_end
			re.cc[tmp_index].ch0 = 0
			re.cc[tmp_index].ch1 = 0
			re.cc_index = tmp_index+1

			return res_index, i-in_i+2, cc_type
		}

		i++
	}
	return err_syntax_error,0,u32(0)
}

/******************************************************************************
*
* Re Compiler
*
******************************************************************************/
//
// Quantifier
//
enum Quant_parse_state {
	start
	min_parse
	comma_checked
	max_parse
	greedy
	gredy_parse
	finish
}

// parse_quantifier return (min, max, str_len, greedy_flag) of a {min,max}?
// quantifier starting after the { char
// On error `min` is a negative error code and the second value is the index of the error.
// str_len counts the chars from the { to the end of the quantifier, the optional ? included.
fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
	mut status := Quant_parse_state.start
	mut i := in_i

	mut q_min := 0 // accumulator of the min value, 0 if omitted as in {,max}
	mut q_max := 0 // accumulator of the max value, set to max_quantifier if omitted as in {min,}

	mut ch := byte(0)

	for i < in_txt.len {
		unsafe {
			ch = in_txt.str[i]
		}

		// check the greedy flag: this is the char that follows the }.
		// It must be examined before the single byte check, the quantifier is
		// already complete and a following utf8 char is not an error.
		if status == .gredy_parse {
			if ch == `?` {
				return q_min, q_max, i-in_i+2, true
			}
			i--
			return q_min, q_max, i-in_i+2, false
		}

		// exit on no compatible char with {} quantifier
		if utf8util_char_len(ch) != 1 {
			return err_syntax_error, i, 0, false
		}

		// min parsing skip if comma present
		if status == .start && ch == `,` {
			q_min = 0 // default min in a {} quantifier is 0
			status = .comma_checked
			i++
			continue
		}

		if status == .start && is_digit( ch ) {
			status = .min_parse
			q_min *= 10
			q_min += int(ch - `0`)
			i++
			continue
		}

		if status == .min_parse && is_digit( ch ) {
			q_min *= 10
			q_min += int(ch - `0`)
			i++
			continue
		}

		// we have parsed the min, now check the max
		if status == .min_parse && ch == `,` {
			status = .comma_checked
			i++
			continue
		}

		// single value {4}
		if status == .min_parse && ch == `}` {
			q_max = q_min
			status = .greedy
			continue
		}

		// end without max
		if status == .comma_checked && ch == `}` {
			q_max = max_quantifier
			status = .greedy
			continue
		}

		// start max parsing
		if status == .comma_checked && is_digit( ch ) {
			status = .max_parse
			q_max *= 10
			q_max += int(ch - `0`)
			i++
			continue
		}

		// parse the max
		if status == .max_parse && is_digit( ch ) {
			q_max *= 10
			q_max += int(ch - `0`)
			i++
			continue
		}

		// finished the quantifier
		if status == .max_parse && ch == `}` {
			status = .greedy
			continue
		}

		// check if greedy flag char ? is present
		if status == .greedy {
			if i+1 < in_txt.len {
				i++
				status = .gredy_parse
				continue
			}
			return q_min, q_max, i-in_i+2, false
		}

		// not a {} quantifier, exit
		return err_syntax_error, i, 0, false
	}

	// not a conform {} quantifier
	return err_syntax_error, i, 0, false
}

//
// Groups
//
enum Group_parse_state {
	start
	q_mark // (?
	q_mark1 // (?:|P checking
	p_status // (?P
	p_start // (?P<
	p_end // (?P<...>
	p_in_name // (?P<...
	finish
}

// parse_groups parse a group for ? (question mark) syntax, if found, return (error, capture_flag, name_of_the_group, next_index)
// error is 0 if the group is valid, -2 if the (? notation is not valid
fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
	mut status := Group_parse_state.start
	mut i := in_i
	mut name := ''

	for i < in_txt.len && status != .finish {

		// get our char
		char_tmp,char_len := re.get_char(in_txt,i)
		ch := byte(char_tmp)

		// start
		if status == .start && ch == `(` {
			status = .q_mark
			i += char_len
			continue
		}

		// check for question marks
		if status == .q_mark && ch == `?` {
			status = .q_mark1
			i += char_len
			continue
		}

		// non capturing group
		if status == .q_mark1 && ch == `:` {
			i += char_len
			return 0, false, name, i
		}

		// enter in P section
		if status == .q_mark1 && ch == `P` {
			status = .p_status
			i += char_len
			continue
		}

		// not a valid q mark found
		if status == .q_mark1 {
			return -2 , true, name, i
		}

		if status == .p_status && ch == `<` {
			status = .p_start
			i += char_len
			continue
		}

		if status == .p_start && ch != `>` {
			status = .p_in_name
			name += "${ch:1c}" // TODO: manage utf8 chars
			i += char_len
			continue
		}

		// collect name
		if status == .p_in_name && ch != `>` && is_alnum(ch) {
			name += "${ch:1c}" // TODO: manage utf8 chars
			i += char_len
			continue
		}

		// end name
		if status == .p_in_name && ch == `>` {
			i += char_len
			return 0, true, name, i
		}

		// error on name group
		if status == .p_in_name {
			return -2 , true, name, i
		}

		// normal group, nothing to do, exit
		return 0 , true, name, i
	}
	/* UNREACHABLE */
	return -2 , true, name, i
}

//
// main compiler
//
// compile return (return code, index) where index is the index of the error in the query string if return code is an error code
[deprecated]
pub fn (mut re RE) compile(in_txt string) (int, int) {
	return re.impl_compile(in_txt)
}

// impl_compile compile the query string `in_txt` in the regex program stored in re.prog.
// It returns (compile_ok, 0) or (error code, index of the error in the query string).
// re.prog and re.cc must be already allocated by the caller, no check is done here on the size of re.prog.
fn (mut re RE) impl_compile(in_txt string) (int,int) {
	mut i := 0 // input string index
	mut pc := 0 // program counter

	// group management variables
	mut group_count := -1
	mut group_stack := []int{len: re.group_max_nested, init: 0}
	mut group_stack_txt_index := []int{len: re.group_max_nested, init: -1}
	mut group_stack_index := -1

	re.query = in_txt // save the query string

	i = 0
	for i < in_txt.len {
		mut char_tmp := u32(0)
		mut char_len := 0
		char_tmp,char_len = re.get_char(in_txt,i)

		//
		// check special cases: $ ^
		//
		// the anchors are OR-ed in the flags: a plain assignment lost the
		// user flags and made ^...$ forget the start anchor
		if char_len == 1 && i == 0 && byte(char_tmp) == `^` {
			re.flag |= f_ms
			i = i + char_len
			continue
		}
		if char_len == 1 && i == (in_txt.len-1) && byte(char_tmp) == `$` {
			re.flag |= f_me
			i = i + char_len
			continue
		}

		// ist_group_start
		if char_len == 1 && pc >= 0 && byte(char_tmp) == `(` {

			//check max groups allowed
			if group_count > re.group_max {
				return err_groups_overflow, i+1
			}
			group_stack_index++

			// check max nested groups allowed, the index must stay inside the group stack
			if group_stack_index >= group_stack.len {
				return err_groups_max_nested, i+1
			}

			tmp_res, cgroup_flag, cgroup_name, next_i := re.parse_groups(in_txt,i)

			// manage question mark format error
			if tmp_res < -1 {
				return err_group_qm_notation, next_i
			}

			i = next_i

			if cgroup_flag == true {
				group_count++
			}

			// calculate the group id
			// if it is a named group, recycle the group id
			// NOTE: **** the group index is +1 because map return 0 when not found!! ****
			mut group_id := group_count
			if cgroup_name.len > 0 {
				if cgroup_name in re.group_map{
					group_id = re.group_map[cgroup_name] - 1
					group_count--
				} else {
					re.group_map[cgroup_name] = group_id + 1
				}
			}

			group_stack_txt_index[group_stack_index] = i
			group_stack[group_stack_index] = pc

			re.prog[pc].ist = u32(0) | ist_group_start
			re.prog[pc].rep_min = 1
			re.prog[pc].rep_max = 1

			// set the group id, -1 for a non capturing group
			if cgroup_flag == false {
				re.prog[pc].group_id = -1
			} else {
				re.prog[pc].group_id = group_id
			}

			pc = pc + 1
			continue
		}

		// ist_group_end
		if char_len==1 && pc > 0 && byte(char_tmp) == `)` {
			if group_stack_index < 0 {
				return err_group_not_balanced, i+1
			}

			goto_pc := group_stack[group_stack_index]
			group_stack_index--

			re.prog[pc].ist = u32(0) | ist_group_end
			re.prog[pc].rep_min = 1
			re.prog[pc].rep_max = 1

			re.prog[pc].goto_pc = goto_pc // PC where to jump if a group need
			re.prog[pc].group_id = re.prog[goto_pc].group_id // id of this group, used for storing data

			re.prog[goto_pc].goto_pc = pc // start goto point to the end group pc

			pc = pc + 1
			i = i + char_len
			continue
		}

		// ist_dot_char match any char except the following token
		if char_len==1 && pc >= 0 && byte(char_tmp) == `.` {
			re.prog[pc].ist = u32(0) | ist_dot_char
			re.prog[pc].rep_min = 1
			re.prog[pc].rep_max = 1
			pc = pc + 1
			i = i + char_len
			continue
		}

		// OR branch
		if char_len==1 && pc > 0 && byte(char_tmp) == `|` {
			// two consecutive OR are an error
			if pc > 0 && re.prog[pc-1].ist == ist_or_branch {
				return err_syntax_error,i
			}
			re.prog[pc].ist = u32(0) | ist_or_branch
			pc = pc + 1
			i = i + char_len
			continue
		}

		// Quantifiers, they modify the previous token
		if char_len==1 && pc > 0{
			mut quant_flag := true
			match byte(char_tmp) {
				`?` {
					re.prog[pc-1].rep_min = 0
					re.prog[pc-1].rep_max = 1
				}

				`+` {
					re.prog[pc-1].rep_min = 1
					re.prog[pc-1].rep_max = max_quantifier
				}

				`*` {
					re.prog[pc-1].rep_min = 0
					re.prog[pc-1].rep_max = max_quantifier
				}

				`{` {
					min, max, tmp, greedy := re.parse_quantifier(in_txt, i+1)
					// it is a quantifier
					if min >= 0 {
						i = i + tmp
						re.prog[pc-1].rep_min = min
						re.prog[pc-1].rep_max = max
						re.prog[pc-1].greedy = greedy
						continue
					}
					else {
						return min,i
					}
					// TODO: decide if the open bracket can be conform without the close bracket
				}
				else{
					quant_flag = false
				}
			}

			if quant_flag {
				i = i + char_len
				continue
			}
		}

		// IST_CHAR_CLASS_*
		if char_len==1 && pc >= 0{
			if byte(char_tmp) == `[` {
				cc_index,tmp,cc_type := re.parse_char_class(in_txt, i+1)
				if cc_index >= 0 {
					i = i + tmp
					re.prog[pc].ist = u32(0) | cc_type
					re.prog[pc].cc_index = cc_index
					re.prog[pc].rep_min = 1
					re.prog[pc].rep_max = 1
					pc = pc + 1
					continue
				} else if cc_index < 0 {
					// cc_class vector memory full or malformed char class
					return cc_index, i
				}
			}
		}

		// ist_bsls_char
		if char_len==1 && pc >= 0{
			if byte(char_tmp) == `\\` {
				bsls_index,tmp := re.parse_bsls(in_txt,i)
				if bsls_index >= 0 {
					i = i + tmp
					re.prog[pc].ist = u32(0) | ist_bsls_char
					re.prog[pc].rep_min = 1
					re.prog[pc].rep_max = 1
					re.prog[pc].validator = bsls_validator_array[bsls_index].validator
					re.prog[pc].ch = bsls_validator_array[bsls_index].ch
					pc = pc + 1
					continue
				} else if bsls_index == no_match_found {
					// this is an escape char, skip the bsls and continue as a normal char
					i += char_len
					char_tmp,char_len = re.get_char(in_txt,i)
					// continue as simple char
				} else {
					// if not an escape or a bsls char then it is an error (at least for now!)
					return bsls_index, i+tmp
				}
			}
		}

		// ist_simple_char
		re.prog[pc].ist = ist_simple_char
		re.prog[pc].ch = char_tmp
		re.prog[pc].ch_len = byte(char_len)
		re.prog[pc].rep_min = 1
		re.prog[pc].rep_max = 1
		pc = pc +1

		i+=char_len
	}

	// add end of the program
	re.prog[pc].ist = ist_prog_end
	re.prog_len = pc

	// check for unbalanced groups
	if group_stack_index != -1 {
		return err_group_not_balanced, group_stack_txt_index[group_stack_index]+1
	}

	// check for OR at the end of the program
	if pc > 0 && re.prog[pc-1].ist == ist_or_branch {
		return err_syntax_error,in_txt.len
	}

	// store the number of groups in the query
	re.group_count = group_count + 1

	//******************************************
	// Post processing
	//******************************************

	//
	// manage ist_dot_char
	//

	// find the checks for dot chars, if any...
	mut pc1 := 0
	mut last_dot_char_pc := -1
	for pc1 < pc {
		if re.prog[pc1].ist == ist_dot_char {
			last_dot_char_pc = pc1
			mut pc2 := pc1 + 1
			for pc2 < pc {
				// a dot char can not be the check token of another dot char
				if re.prog[pc2].ist == ist_dot_char {
					return err_syntax_error,0
				}
				if re.prog[pc2].ist !in [rune(ist_prog_end), ist_group_end, ist_group_start] {
					re.prog[pc1].dot_check_pc = pc2
					break
				}
				pc2++
			}
		}
		pc1++
	}

	// mark the last dot char if only group ends follow it
	if last_dot_char_pc >= 0 {
		pc1 = last_dot_char_pc + 1
		mut is_last_dot := true
		for pc1 < pc {
			if re.prog[pc1].ist !in [rune(ist_prog_end), ist_group_end] {
				is_last_dot = false
				break
			}
			pc1++
		}
		if is_last_dot {
			re.prog[last_dot_char_pc].last_dot_flag = true
		}
	}

	//******************************************

	// OR branch
	// a|b|cd
	// d exit point
	// a,b,c branches
	// set the jump in the right places
	pc1 = 0
	for pc1 < pc-2 {
		// two consecutive OR are a syntax error
		if re.prog[pc1+1].ist == ist_or_branch && re.prog[pc1+2].ist == ist_or_branch {
			return err_syntax_error, i
		}

		// manage a|b chains like a|(b)|c|d...
		// standard solution
		if re.prog[pc1].ist != ist_or_branch &&
			re.prog[pc1+1].ist == ist_or_branch &&
			re.prog[pc1+2].ist != ist_or_branch
		{
			re.prog[pc1].next_is_or = true // set that the next token is an OR
			re.prog[pc1+1].rep_min = pc1+2 // failed match jump

			// match jump, if an OR chain the next token will be an OR token
			mut pc2 := pc1+2
			for pc2 < pc-1 {
				ist := re.prog[pc2].ist
				if ist == ist_group_start {
					re.prog[pc1+1].rep_max = re.prog[pc2].goto_pc + 1
					break
				}
				if ist != ist_or_branch {
					re.prog[pc1+1].rep_max = pc2 + 1
					break
				}
				pc2++
			}
			// special case query of few chars, the true can't go on the first instruction
			if re.prog[pc1+1].rep_max == pc1 {
				re.prog[pc1+1].rep_max = 3
			}
			pc1 = pc2
			continue
		}

		pc1++
	}

	//******************************************
	// DEBUG PRINT REGEX GENERATED CODE
	//******************************************
	if re.debug > 0 {
		gc := re.get_code()
		re.log_func( gc )
	}
	//******************************************

	return compile_ok, 0
}

// get_code return the compiled code as regex string, note: may be different from the source!
pub fn (re RE) get_code() string {
	mut pc1 := 0
	mut res := strings.new_builder(re.cc.len*2*re.prog.len)
	res.write("========================================\nv RegEx compiler v $v_regex_version output:\n")

	mut stop_flag := false

	// strictly less than: the program has re.prog.len tokens, the old <= could read one past the end
	for pc1 < re.prog.len {
		tk := re.prog[pc1]
		res.write("PC:${pc1:3d}")

		res.write(" ist: ")
		res.write("${tk.ist:8x}".replace(" ","0") )
		res.write(" ")
		ist :=tk.ist
		if ist == ist_bsls_char {
			res.write("[\\${tk.ch:1c}] BSLS")
		} else if ist == ist_prog_end {
			res.write("PROG_END")
			stop_flag = true
		} else if ist == ist_or_branch {
			res.write("OR ")
		} else if ist == ist_char_class_pos {
			res.write("[${re.get_char_class(pc1)}] CHAR_CLASS_POS")
		} else if ist == ist_char_class_neg {
			res.write("[^${re.get_char_class(pc1)}] CHAR_CLASS_NEG")
		} else if ist == ist_dot_char {
			res.write(". DOT_CHAR nx chk: ${tk.dot_check_pc}")
			if tk.last_dot_flag == true {
				res.write(" last!")
			}
		} else if ist == ist_group_start {
			res.write("( GROUP_START #:${tk.group_id}")
			if tk.group_id == -1 {
				res.write(" ?:")
			} else {
				// the group map stores id+1, see impl_compile
				for x in re.group_map.keys() {
					if re.group_map[x] == (tk.group_id+1) {
						res.write(" ?P<${x}>")
						break
					}
				}
			}
		} else if ist == ist_group_end {
			res.write(") GROUP_END #:${tk.group_id}")
		} else if ist == ist_simple_char {
			res.write("[${tk.ch:1c}] query_ch")
		}

		if tk.rep_max == max_quantifier {
			res.write(" {${tk.rep_min:3d},MAX}")
		}else{
			if ist == ist_or_branch {
				res.write(" if false go: ${tk.rep_min:3d} if true go: ${tk.rep_max:3d}")
			} else {
				res.write(" {${tk.rep_min:3d},${tk.rep_max:3d}}")
			}
			if tk.greedy == true {
				res.write("?")
			}
		}

		res.write("\n")
		if stop_flag {
			break
		}
		pc1++
	}

	res.write("========================================\n")
	return res.str()
}

// get_query return a string with a reconstruction of the query starting from the regex program code
pub fn (re RE) get_query() string {
	mut res := strings.new_builder(re.query.len*2)

	if (re.flag & f_ms) != 0 {
		res.write("^")
	}

	mut i := 0
	for i < re.prog.len && re.prog[i].ist != ist_prog_end && re.prog[i].ist != 0{
		tk := unsafe { &re.prog[i] }
		ch := tk.ist

		// GROUP start
		if ch == ist_group_start {
			if re.debug == 0 {
				res.write("(")
			} else {
				if tk.group_id == -1 {
					res.write("(?:") // non capturing group
				} else {
					res.write("#${tk.group_id}(")
				}
			}

			for x in re.group_map.keys() {
				if re.group_map[x] == (tk.group_id+1) {
					res.write("?P<${x}>")
					break
				}
			}

			i++
			continue
		}

		// GROUP end
		if ch == ist_group_end {
			res.write(")")
		}

		// OR branch
		if ch == ist_or_branch {
			res.write("|")
			if re.debug > 0 {
				res.write("{${tk.rep_min},${tk.rep_max}}")
			}
			i++
			continue
		}

		// char class
		if ch == ist_char_class_neg || ch == ist_char_class_pos {
			res.write("[")
			if ch == ist_char_class_neg {
				res.write("^")
			}
			res.write("${re.get_char_class(i)}")
			res.write("]")
		}

		// bsls char
		if ch == ist_bsls_char {
			res.write("\\${tk.ch:1c}")
		}

		// ist_dot_char
		if ch == ist_dot_char {
			res.write(".")
		}

		// char alone
		if ch == ist_simple_char {
			// the escape check must be done on the char of the token,
			// not on the instruction code that is never in the list
			if byte(tk.ch) in bsls_escape_list {
				res.write("\\")
			}
			res.write("${tk.ch:c}")
		}

		// quantifier
		if !(tk.rep_min == 1 && tk.rep_max == 1) {
			if tk.rep_min == 0 && tk.rep_max == 1 {
				res.write("?")
			} else if tk.rep_min == 1 && tk.rep_max == max_quantifier {
				res.write("+")
			} else if tk.rep_min == 0 && tk.rep_max == max_quantifier {
				res.write("*")
			} else {
				if tk.rep_max == max_quantifier {
					res.write("{${tk.rep_min},MAX}")
				} else {
					res.write("{${tk.rep_min},${tk.rep_max}}")
				}
				if tk.greedy == true {
					res.write("?")
				}
			}
		}
		i++
	}
	if (re.flag & f_me) != 0 {
		res.write("$")
	}

	return res.str()
}

/******************************************************************************
*
* Groups saving utilities
*
******************************************************************************/
// group_continuous_save append to re.group_csave the record (group id, start, end)
// of the group stored at re.groups[g_index], only if re.group_csave_flag is enabled.
// re.group_csave[0] is the counter of the saved records.
[direct_array_access]
fn (mut re RE) group_continuous_save(g_index int) {
	if re.group_csave_flag == true {
		// continuous save, save until we have space

		// init the first element as counter
		if re.group_csave.len == 0 {
			re.group_csave << 0
		}

		gi := g_index >> 1
		start := re.groups[g_index]
		end := re.groups[g_index+1]

		// check if we are simply increasing the size of the found group
		if re.group_csave.len >=4 &&
			gi == re.group_csave[re.group_csave.len - 3] &&
			start == re.group_csave[re.group_csave.len - 2]
		{
			re.group_csave[re.group_csave.len - 1] = end
			return
		}

		// otherwise append a new group to the list

		// increment counter
		re.group_csave[0]++
		// save the record
		re.group_csave << (g_index >> 1) // group id
		re.group_csave << re.groups[g_index] // start
		re.group_csave << re.groups[g_index+1] // end
	}
}

/******************************************************************************
*
* Matching
*
******************************************************************************/
enum Match_state{
	start = 0
	stop
	end
	new_line

	ist_load // load and execute instruction
	ist_next // go to next instruction
	ist_next_ks // go to next instruction without cleaning the state
	ist_quant_p // match positive ,quantifier check
	ist_quant_n // match negative, quantifier check
	ist_quant_pg // match positive ,group quantifier check
	ist_quant_ng // match negative ,group quantifier check
}

// state_str return the printable name of a matcher state, used by the debug log
fn state_str(s Match_state) string {
	match s{
		.start { return "start" }
		.stop { return "stop" }
		.end { return "end" }
		.new_line { return "new line" }

		.ist_load { return "ist_load" }
		.ist_next { return "ist_next" }
		.ist_next_ks { return "ist_next_ks" }
		.ist_quant_p { return "ist_quant_p" }
		.ist_quant_n { return "ist_quant_n" }
		.ist_quant_pg { return "ist_quant_pg" }
		.ist_quant_ng { return "ist_quant_ng" }
	}
}

// StateObj is the state of the matcher FSM
struct StateObj {
pub mut:
	group_index int = -1 // group id used to know how many groups are open
	match_flag bool
	match_index int = -1
	first_match int = -1 //index of the first match
	pc int = -1 // program counter
	i int = -1 // source string index
	char_len int
	last_dot_pc int = -1 // last dot chat pc
}

// NOTE(review): only the first part of match_base is visible in this chunk,
// it is reproduced here without any change to the logic
[direct_array_access]
pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
	// result status
	mut result := no_match_found // function return

	mut ch := rune(0) // examinated char
	mut char_len := 0 // utf8 examinated char len
	mut m_state := Match_state.start // start point for the matcher FSM
	mut src_end := false
	mut last_fnd_pc := -1

	mut state := StateObj{} // actual state
	mut ist := rune(0) // actual instruction
	mut l_ist := rune(0) // last matched instruction

	mut state_list := []StateObj{}

	mut group_stack := []int{len: re.group_max, init: -1}
	mut group_data := []int{len: re.group_max, init: -1}

	//mut group_index := -1 // group id used to know how many groups are open

	mut step_count := 0 // stats for debug
	mut dbg_line := 0 // count debug line printed

	re.reset()

	if re.debug>0 {
		// print header
		mut h_buf := strings.new_builder(32)
		h_buf.write("flags: ")
		h_buf.write("${re.flag:8x}".replace(" ","0"))
		h_buf.write("\n")
		sss := h_buf.str()
		re.log_func(sss)
	}

	for m_state != .end {

		if state.pc >= 0 && state.pc < re.prog.len {
			ist = re.prog[state.pc].ist
		}else if state.pc >= re.prog.len {
			//println("ERROR!! PC overflow!!")
			return err_internal_error, state.i
		}

		//******************************************
		// DEBUG LOG
		//******************************************
		if re.debug>0 {
			mut buf2 := strings.new_builder(re.cc.len + 128)

			// print all the instructions

			// end of the input text
			if state.i >= in_txt_len {
				buf2.write("# ${step_count:3d} END OF INPUT TEXT\n")
				sss := buf2.str()
				re.log_func(sss)
			}else{

				// print only the exe instruction
				if (re.debug == 1 && m_state == .ist_load) ||
					re.debug == 2
				{
					if ist == ist_prog_end {
						buf2.write("# ${step_count:3d} PROG_END\n")
					}
					else if ist == 0 || m_state in [.start,.ist_next,.stop] {
						buf2.write("# ${step_count:3d} s: ${state_str(m_state):12s} PC: NA\n")
					}else{
						ch, char_len = re.get_charb(in_txt, state.i)

						buf2.write("# ${step_count:3d} s: ${state_str(m_state):12s} PC: ${state.pc:3d}=>")
						buf2.write("${ist:8x}".replace(" ","0"))
						buf2.write(" i,ch,len:[${state.i:3d},'${utf8_str(ch)}',${char_len}] f.m:[${state.first_match:3d},${state.match_index:3d}] ")

						if ist == ist_simple_char {
							buf2.write("query_ch: [${re.prog[state.pc].ch:1c}]")
						} else {
							if ist == ist_bsls_char {
								buf2.write("BSLS [\\${re.prog[state.pc].ch:1c}]")
							} else if ist == ist_prog_end {
								buf2.write("PROG_END")
							} else if ist == ist_or_branch {
								buf2.write("OR")
							} else if ist == ist_char_class_pos {
								buf2.write("CHAR_CLASS_POS[${re.get_char_class(state.pc)}]")
							} else if ist == ist_char_class_neg {
								buf2.write("CHAR_CLASS_NEG[${re.get_char_class(state.pc)}]")
							} else if ist == ist_dot_char {
								buf2.write("DOT_CHAR")
							} else if ist == ist_group_start {
								tmp_gi :=re.prog[state.pc].group_id
								tmp_gr := re.prog[re.prog[state.pc].goto_pc].group_rep
								buf2.write("GROUP_START #:${tmp_gi} rep:${tmp_gr} ")
							} else if ist == ist_group_end {
								buf2.write("GROUP_END #:${re.prog[state.pc].group_id} deep:${state.group_index}")
							}
						}
						if re.prog[state.pc].rep_max == max_quantifier {
buf2.write("{${re.prog[state.pc].rep_min},MAX}:${re.prog[state.pc].rep}") } else { buf2.write("{${re.prog[state.pc].rep_min},${re.prog[state.pc].rep_max}}:${re.prog[state.pc].rep}") } if re.prog[state.pc].greedy == true { buf2.write("?") } buf2.write(" (#${state.group_index})") if ist == ist_dot_char { buf2.write(" last!") } buf2.write("\n") } sss2 := buf2.str() re.log_func( sss2 ) } } step_count++ dbg_line++ } //****************************************** if ist == ist_prog_end { //println("HERE we end!") break } // we're out of text, manage it if state.i > in_txt_len || m_state == .new_line { //println("Finished text!!") src_end = true // manage groups if state.group_index >= 0 && state.match_index >= 0 { //println("End text with open groups!") // close the groups for state.group_index >= 0 { tmp_pc := group_data[state.group_index] re.prog[tmp_pc].group_rep++ //println("Closing group $state.group_index {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:${re.prog[tmp_pc].group_rep}") if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min && re.prog[tmp_pc].group_id >= 0{ start_i := group_stack[state.group_index] group_stack[state.group_index]=-1 // save group results g_index := re.prog[tmp_pc].group_id*2 if start_i >= 0 { re.groups[g_index] = start_i } else { re.groups[g_index] = 0 } // we have fished the text, we must manage out pf bound indexes if state.i >= in_txt_len { state.i = in_txt_len-1 } re.groups[g_index+1] = state.i if re.groups[g_index+1] >= in_txt_len { //println("clamp group on stop!") re.groups[g_index+1] = in_txt_len-1 } // continuous save, save until we have space re.group_continuous_save(g_index) } state.group_index-- } } // the text is finished and the groups closed and we are the last group, ok exit if ist == ist_group_end && re.prog[state.pc+1].ist == ist_prog_end { //println("Last group end") return state.first_match, state.i } if state.pc == -1 { state.pc = last_fnd_pc } //println("Finished text!!") //println("Instruction: ${ist:08x} 
pc: $state.pc") //println("min_rep: ${re.prog[state.pc].rep_min} max_rep: ${re.prog[state.pc].rep_max} rep: ${re.prog[state.pc].rep}") // program end if ist == ist_prog_end { //println("Program end on end of text!") return state.first_match, state.i } // we are in a last dot_ char case if l_ist == ist_dot_char { //println("***** We have a last dot_char") //println("PC: ${state.pc} last_dot_flag:${re.prog[state.pc].last_dot_flag}") //println("rep: ${re.prog[state.pc].group_rep} min: ${re.prog[state.pc].rep_min} max: ${re.prog[state.pc].rep_max}") //println("first match: ${state.first_match}") if re.prog[state.pc].last_dot_flag == true && re.prog[state.pc].rep >= re.prog[state.pc].rep_min && re.prog[state.pc].rep <= re.prog[state.pc].rep_max { return state.first_match, state.i } //println("Not fitted!!") } //m_state = .end //break return no_match_found,0 } // starting and init if m_state == .start { state.pc = -1 state.i = 0 m_state = .ist_next continue } // ist_next, next instruction reseting its state else if m_state == .ist_next { state.pc = state.pc + 1 re.prog[state.pc].reset() // check if we are in the program bounds if state.pc < 0 || state.pc > re.prog.len { //println("ERROR!! PC overflow!!") return err_internal_error, state.i } m_state = .ist_load continue } // ist_next_ks, next instruction keeping its state else if m_state == .ist_next_ks { state.pc = state.pc + 1 // check if we are in the program bounds if state.pc < 0 || state.pc > re.prog.len { //println("ERROR!! 
PC overflow!!") return err_internal_error, state.i } m_state = .ist_load continue } // load the char ch, char_len = re.get_charb(in_txt, state.i) // check new line if flag f_nl enabled if (re.flag & f_nl) != 0 && char_len == 1 && byte(ch) in new_line_list { m_state = .new_line continue } // check if stop else if m_state == .stop { // we are in search mode, don't exit until the end if ((re.flag & f_src) != 0) && (ist != ist_prog_end) { last_fnd_pc = state.pc state.pc = -1 state.i += char_len m_state = .ist_next re.reset_src() state.match_index = -1 state.first_match = -1 continue } if ist == ist_prog_end { return state.first_match, state.i } // manage here dot char if state_list.len > 0 { //println("Here we are, with stop: state buffer: [${state_list.len}]") state = state_list.pop() state.match_flag = true l_ist = u32(ist_dot_char) if state.first_match < 0 { state.first_match = state.i } state.match_index = state.i re.prog[state.pc].rep++ // increase repetitions state.i += char_len m_state = .ist_quant_p continue } // exit on no match return result,0 } // ist_load else if m_state == .ist_load { // program end if ist == ist_prog_end { // if we are in match exit well if state.group_index >= 0 && state.match_index >= 0 { state.group_index = -1 } m_state = .stop continue } // check GROUP start, no quantifier is checkd for this token!! else if ist == ist_group_start { state.group_index++ group_data[state.group_index] = re.prog[state.pc].goto_pc // save where is ist_group_end, we will use it for escape group_stack[state.group_index] = state.i // index where we start to manage //println("group_index $state.group_index rep ${re.prog[re.prog[state.pc].goto_pc].group_rep}") m_state = .ist_next continue } // check GROUP end else if ist == ist_group_end { // we are in matching streak //println("Group END!! 
last ist: ${l_ist:08x}") if state.match_index >= 0 { // restore txt index stack and save the group data //println("g.id: ${re.prog[state.pc].group_id} group_index: ${state.group_index}") if state.group_index >= 0 && re.prog[state.pc].group_id >= 0 { start_i := group_stack[state.group_index] //group_stack[state.group_index]=-1 // save group results g_index := re.prog[state.pc].group_id*2 if start_i >= 0 { re.groups[g_index] = start_i } else { re.groups[g_index] = 0 } re.groups[g_index+1] = state.i if g_index > 0 && re.groups[g_index] <= re.groups[g_index-1] { re.groups[g_index] = re.groups[g_index-1] } if re.groups[g_index+1] >= in_txt_len { //println("clamp group!") re.groups[g_index+1] = in_txt_len-1 } //println("GROUP ${re.prog[state.pc].group_id} END [${re.groups[g_index]}, ${re.groups[g_index+1]}] i: $state.i in_txt_len: $in_txt_len") // continuous save, save until we have space re.group_continuous_save(g_index) } re.prog[state.pc].group_rep++ // increase repetitions //println("GROUP $group_index END ${re.prog[state.pc].group_rep}") m_state = .ist_quant_pg continue } m_state = .ist_quant_ng continue } // check OR else if ist == ist_or_branch { if state.match_index >= 0 { state.pc = re.prog[state.pc].rep_max //println("ist_or_branch True pc: $state.pc") }else{ state.pc = re.prog[state.pc].rep_min //println("ist_or_branch False pc: $state.pc") } re.prog[state.pc].reset() m_state = .ist_load continue } // check ist_dot_char else if ist == ist_dot_char { //println("ist_dot_char rep: ${re.prog[state.pc].rep}") // check next token to be false mut next_check_flag := false //if re.prog[state.pc].rep >= re.prog[state.pc].rep_min && // if we are done with max go on dot char are dedicated case!! 
if re.prog[state.pc].rep >= re.prog[state.pc].rep_max { state_list.pop() m_state = .ist_next continue } if re.prog[state.pc].dot_check_pc >= 0 { // load the char //ch_t, _ := re.get_charb(in_txt, state.i+char_len) ch_t := ch chk_pc := re.prog[state.pc].dot_check_pc // simple char if re.prog[chk_pc].ist == ist_simple_char { if re.prog[chk_pc].ch == ch_t { next_check_flag = true } //println("Check [ist_simple_char] [${re.prog[chk_pc].ch}]==[${ch_t:c}] => $next_check_flag") } // char char_class else if re.prog[chk_pc].ist == ist_char_class_pos || re.prog[chk_pc].ist == ist_char_class_neg { mut cc_neg := false if re.prog[chk_pc].ist == ist_char_class_neg { cc_neg = true } mut cc_res := re.check_char_class(chk_pc,ch_t) if cc_neg { cc_res = !cc_res } next_check_flag = cc_res //println("Check [ist_char_class] => $next_check_flag") } // check bsls else if re.prog[chk_pc].ist == ist_bsls_char { next_check_flag = re.prog[chk_pc].validator(byte(ch_t)) //println("Check [ist_bsls_char] => $next_check_flag") } } // check if we must continue or pass to the next IST if next_check_flag == true { //println("save the state!!") state_list << StateObj { group_index: state.group_index match_flag: state.match_flag match_index: state.match_index first_match: state.first_match pc: state.pc i: state.i + char_len char_len: char_len last_dot_pc: state.pc } m_state = .ist_quant_n //println("dot_char stack len: $state_list.len") continue } state.match_flag = true l_ist = u32(ist_dot_char) if state.first_match < 0 { state.first_match = state.i } state.match_index = state.i re.prog[state.pc].rep++ // increase repetitions state.i += char_len m_state = .ist_quant_p continue } // char class IST else if ist == ist_char_class_pos || ist == ist_char_class_neg { state.match_flag = false mut cc_neg := false if ist == ist_char_class_neg { cc_neg = true } mut cc_res := re.check_char_class(state.pc,ch) if cc_neg { cc_res = !cc_res } if cc_res { state.match_flag = true l_ist = u32(ist_char_class_pos) if 
state.first_match < 0 { state.first_match = state.i } state.match_index = state.i re.prog[state.pc].rep++ // increase repetitions state.i += char_len // next char m_state = .ist_quant_p continue } m_state = .ist_quant_n continue } // check bsls else if ist == ist_bsls_char { state.match_flag = false tmp_res := re.prog[state.pc].validator(byte(ch)) //println("BSLS in_ch: ${ch:c} res: $tmp_res") if tmp_res { state.match_flag = true l_ist = u32(ist_bsls_char) if state.first_match < 0 { state.first_match = state.i } state.match_index = state.i re.prog[state.pc].rep++ // increase repetitions state.i += char_len // next char m_state = .ist_quant_p continue } m_state = .ist_quant_n continue } // simple char IST else if ist == ist_simple_char { //println("ist_simple_char") state.match_flag = false if re.prog[state.pc].ch == ch { state.match_flag = true l_ist = ist_simple_char if state.first_match < 0 { state.first_match = state.i } //println("state.match_index: ${state.match_index}") state.match_index = state.i re.prog[state.pc].rep++ // increase repetitions state.i += char_len // next char m_state = .ist_quant_p continue } m_state = .ist_quant_n continue } /* UNREACHABLE */ //println("PANIC2!! 
state: $m_state") return err_internal_error, state.i } /*********************************** * Quantifier management ***********************************/ // ist_quant_ng => quantifier negative test on group else if m_state == .ist_quant_ng { // we are finished here if state.group_index < 0 { //println("Early stop!") result = no_match_found m_state = .stop continue } tmp_pc := group_data[state.group_index] // PC to the end of the group token rep := re.prog[tmp_pc].group_rep // use a temp variable re.prog[tmp_pc].group_rep = 0 // clear the repetitions //println(".ist_quant_ng group_pc_end: $tmp_pc rep: $rep") if rep >= re.prog[tmp_pc].rep_min { //println("ist_quant_ng GROUP CLOSED OK group_index: $state.group_index") state.i = group_stack[state.group_index] state.pc = tmp_pc state.group_index-- m_state = .ist_next continue } else if re.prog[tmp_pc].next_is_or { //println("ist_quant_ng OR Negative branch") state.i = group_stack[state.group_index] state.pc = re.prog[tmp_pc+1].rep_min -1 state.group_index-- m_state = .ist_next continue } else if rep>0 && rep < re.prog[tmp_pc].rep_min { //println("ist_quant_ng UNDER THE MINIMUM g.i: $state.group_index") // check if we are inside a group, if yes exit from the nested groups if state.group_index > 0{ state.group_index-- state.pc = tmp_pc m_state = .ist_quant_ng //.ist_next continue } if state.group_index == 0 { state.group_index-- state.pc = tmp_pc // TEST m_state = .ist_next continue } result = no_match_found m_state = .stop continue } else if rep==0 && rep < re.prog[tmp_pc].rep_min { //println("ist_quant_ng c_zero UNDER THE MINIMUM g.i: $state.group_index") if state.group_index > 0{ state.group_index-- state.pc = tmp_pc m_state = .ist_quant_ng //.ist_next continue } result = no_match_found m_state = .stop continue } //println("DO NOT STAY HERE!! 
{${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:$rep") /* UNREACHABLE */ return err_internal_error, state.i } // ist_quant_pg => quantifier positive test on group else if m_state == .ist_quant_pg { //println(".ist_quant_pg") mut tmp_pc := state.pc if state.group_index >= 0 { tmp_pc = group_data[state.group_index] } rep := re.prog[tmp_pc].group_rep if rep < re.prog[tmp_pc].rep_min { //println("ist_quant_pg UNDER RANGE") state.pc = re.prog[tmp_pc].goto_pc m_state = .ist_next continue } else if rep == re.prog[tmp_pc].rep_max { //println("ist_quant_pg MAX RANGE") re.prog[tmp_pc].group_rep = 0 // clear the repetitions state.group_index-- m_state = .ist_next continue } else if rep >= re.prog[tmp_pc].rep_min { //println("ist_quant_pg IN RANGE group_index:$state.group_index") // check greedy flag, if true exit on minimum if re.prog[tmp_pc].greedy == true { re.prog[tmp_pc].group_rep = 0 // clear the repetitions state.group_index-- m_state = .ist_next continue } state.pc = re.prog[tmp_pc].goto_pc - 1 state.group_index-- m_state = .ist_next continue } /* UNREACHABLE */ //println("PANIC3!! state: $m_state") return err_internal_error, state.i } // ist_quant_n => quantifier negative test on token else if m_state == .ist_quant_n { rep := re.prog[state.pc].rep //println("Here!! PC $state.pc is_next_or: ${re.prog[state.pc].next_is_or}") // zero quantifier * or ? 
if rep == 0 && re.prog[state.pc].rep_min == 0 { //println("ist_quant_n c_zero RANGE MIN") m_state = .ist_next // go to next ist continue } // match + or * else if rep >= re.prog[state.pc].rep_min { //println("ist_quant_n MATCH RANGE") m_state = .ist_next continue } // check the OR if present if re.prog[state.pc].next_is_or { //println("OR present on failing") state.match_index = -1 m_state = .ist_next continue } // we are in a group manage no match from here if state.group_index >= 0 { //println("ist_quant_n FAILED insied a GROUP group_index:$state.group_index") m_state = .ist_quant_ng continue } // no other options //println("ist_quant_n no_match_found") result = no_match_found m_state = .stop continue //return no_match_found, 0 } // ist_quant_p => quantifier positive test on token else if m_state == .ist_quant_p { // exit on first match if (re.flag & f_efm) != 0 { return state.i, state.i+1 } rep := re.prog[state.pc].rep // under range if rep > 0 && rep < re.prog[state.pc].rep_min { //println("ist_quant_p UNDER RANGE") m_state = .ist_load // continue the loop continue } // range ok, continue loop else if rep >= re.prog[state.pc].rep_min && rep < re.prog[state.pc].rep_max { //println("ist_quant_p IN RANGE") // check greedy flag, if true exit on minimum if re.prog[state.pc].greedy == true { m_state = .ist_next continue } m_state = .ist_load continue } // max reached else if rep == re.prog[state.pc].rep_max { //println("ist_quant_p MAX RANGE") m_state = .ist_next continue } } /* UNREACHABLE */ //println("PANIC4!! 
state: $m_state") return err_internal_error, state.i } //println("Check end of text!") // Check the results if state.match_index >= 0 { if state.group_index < 0 { if re.prog[state.pc].ist == ist_prog_end { //println("program ended!!") if (re.flag & f_src) != 0 { //println("find return") return state.first_match, state.i } else { return 0, state.i } } //println("No Group here, natural end [$state.first_match,$state.i] state: ${state_str(m_state)} ist: $ist pgr_end: $re.prog.len") if re.prog[state.pc+1].ist == ist_prog_end || re.prog[state.pc].ist == ist_prog_end{ rep := re.prog[state.pc].rep //println("rep: $rep re.prog[state.pc].rep_min: ${re.prog[state.pc].rep_min} re.prog[state.pc].rep_max: ${re.prog[state.pc].rep_max}") if rep >= re.prog[state.pc].rep_min && rep <= re.prog[state.pc].rep_max { return state.first_match, state.i } //println("Program not finished! ") return no_match_found, 0 } if src_end { //println("program end") return state.first_match, state.i } //print("No match found!!") return no_match_found, 0 } else { //println("Group match! 
OK") //println("first_match: $state.first_match, i: $state.i") //println("Skip last group") return state.first_match,state.i //return state.first_match,group_stack[state.group_index--] } } //println("no_match_found, natural end") return no_match_found, 0 } /****************************************************************************** * * Public functions * ******************************************************************************/ // // Matchers // [direct_array_access] pub fn (mut re RE) match_string(in_txt string) (int,int) { start, mut end := re.match_base(in_txt.str, in_txt.len + 1) if end > in_txt.len { end = in_txt.len } if start >= 0 && end > start { if (re.flag & f_ms) != 0 && start > 0 { return no_match_found, 0 } if (re.flag & f_me) != 0 && end < in_txt.len { if in_txt[end] in new_line_list { return start, end } return no_match_found, 0 } return start, end } return start, end } // // Finders // // find try to find the first match in the input string [direct_array_access] pub fn (mut re RE) find(in_txt string) (int,int) { old_flag := re.flag re.flag |= f_src // enable search mode start, mut end := re.match_base(in_txt.str, in_txt.len + 1) //print("Find [$start,$end] '${in_txt[start..end]}'") if end > in_txt.len { end = in_txt.len } re.flag = old_flag if start >= 0 && end > start { return start, end } return no_match_found, 0 } // find all the non overlapping occurrences of the match pattern [direct_array_access] pub fn (mut re RE) find_all(in_txt string) []int { mut i := 0 mut res := []int{} mut ls := -1 for i < in_txt.len { s,e := re.find(in_txt[i..]) if s >= 0 && e > s && i+s > ls { //println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}] ls:$ls") res << i+s res << i+e ls = i+s i = i+e continue } else { i++ } } return res } // replace return a string where the matches are replaced with the replace string pub fn (mut re RE) replace(in_txt string, repl string) string { pos := re.find_all(in_txt) if pos.len > 0 { mut res := "" mut i := 0 mut s1 := 
0 mut e1 := in_txt.len for i < pos.len { e1 = pos[i] res += in_txt[s1..e1] + repl s1 = pos[i+1] i += 2 } res += in_txt[s1..] return res } return in_txt } pub type FnReplace = fn (re RE, in_txt string, start int, end int) string // replace_by_fn return a string where the matches are replaced with the string from the repl_fn callback function pub fn (mut re RE) replace_by_fn(in_txt string, repl_fn FnReplace) string { mut i := 0 mut res := "" mut ls := -1 mut s1 := 0 //mut e1 := in_txt.len for i < in_txt.len { s,e := re.find(in_txt[i..]) if s >= 0 && e > s && i+s > ls { //println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}] ls:$ls") start := i + s end := i + e // update grups index diplacement mut gi := 0 for gi < re.groups.len { re.groups[gi] += i gi++ } repl := repl_fn(re, in_txt, start, end) res += in_txt[s1..start] + repl s1 = end ls = i + s i = i + e continue } else { i++ } } res += in_txt[s1..] return res }