v/vlib/regex/regex.v

2366 lines
61 KiB
V

/*
regex 1.0 alpha
Copyright (c) 2019-2021 Dario Deledda. All rights reserved.
Use of this source code is governed by an MIT license
that can be found in the LICENSE file.
This file contains regex module
Know limitation:
- find is implemented in a trivial way
- not full compliant PCRE
- not compliant POSIX ERE
*/
module regex
import strings
pub const (
v_regex_version = '1.0 alpha' // regex module version
max_code_len = 256 // default small base code len for the regex programs
max_quantifier = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
// spaces chars (here only westerns!!) TODO: manage all the spaces from unicode
spaces = [` `, `\t`, `\n`, `\r`, `\v`, `\f`]
// new line chars for now only '\n'
new_line_list = [`\n`, `\r`]
// Results
no_match_found = -1
// Errors
compile_ok = 0 // the regex string compiled, all ok
err_char_unknown = -2 // the char used is unknow to the system
err_undefined = -3 // the compiler symbol is undefined
err_internal_error = -4 // Bug in the regex system!!
err_cc_alloc_overflow = -5 // memory for char class full!!
err_syntax_error = -6 // syntax error in regex compiling
err_groups_overflow = -7 // max number of groups reached
err_groups_max_nested = -8 // max number of nested group reached
err_group_not_balanced = -9 // group not balanced
err_group_qm_notation = -10 // group invalid notation
)
const (
//*************************************
// regex program instructions
//*************************************
ist_simple_char = u32(0x7FFFFFFF) // single char instruction, 31 bit available to char
// char class 11 0100 AA xxxxxxxx
// AA = 00 regular class
// AA = 01 Negated class ^ char
ist_char_class = 0xD1000000 // MASK
ist_char_class_pos = 0xD0000000 // char class normal [abc]
ist_char_class_neg = 0xD1000000 // char class negate [^abc]
// dot char 10 0110 xx xxxxxxxx
ist_dot_char = 0x98000000 // match any char except \n
// backslash chars 10 0100 xx xxxxxxxx
ist_bsls_char = 0x90000000 // backslash char
// OR | 10 010Y xx xxxxxxxx
ist_or_branch = 0x91000000 // OR case
// groups 10 010Y xx xxxxxxxx
ist_group_start = 0x92000000 // group start (
ist_group_end = 0x94000000 // group end )
// control instructions
ist_prog_end = u32(0x88000000) // 10 0010 xx xxxxxxxx
//*************************************
)
/*
General Utilities
*/
// utf8util_char_len calculate the length in bytes of a utf8 char
[inline]
fn utf8util_char_len(b byte) int {
return ((0xe5000000 >> ((b >> 3) & 0x1e)) & 3) + 1
}
// get_char get a char from position i and return an u32 with the unicode code
[direct_array_access; inline]
fn (re RE) get_char(in_txt string, i int) (u32, int) {
ini := unsafe { in_txt.str[i] }
// ascii 8 bit
if (re.flag & regex.f_bin) != 0 || ini & 0x80 == 0 {
return u32(ini), 1
}
// unicode char
char_len := utf8util_char_len(ini)
mut tmp := 0
mut ch := u32(0)
for tmp < char_len {
ch = (ch << 8) | unsafe { in_txt.str[i + tmp] }
tmp++
}
return ch, char_len
}
// get_charb get a char from position i and return an u32 with the unicode code
[direct_array_access; inline]
fn (re RE) get_charb(in_txt &byte, i int) (u32, int) {
// ascii 8 bit
if (re.flag & regex.f_bin) != 0 || unsafe { in_txt[i] } & 0x80 == 0 {
return u32(unsafe { in_txt[i] }), 1
}
// unicode char
char_len := utf8util_char_len(unsafe { in_txt[i] })
mut tmp := 0
mut ch := u32(0)
for tmp < char_len {
ch = (ch << 8) | unsafe { in_txt[i + tmp] }
tmp++
}
return ch, char_len
}
[inline]
fn is_alnum(in_char byte) bool {
mut tmp := in_char - `A`
if tmp <= 25 {
return true
}
tmp = in_char - `a`
if tmp <= 25 {
return true
}
tmp = in_char - `0`
if tmp <= 9 {
return true
}
if in_char == `_` {
return true
}
return false
}
[inline]
fn is_not_alnum(in_char byte) bool {
return !is_alnum(in_char)
}
[inline]
fn is_space(in_char byte) bool {
return in_char in regex.spaces
}
[inline]
fn is_not_space(in_char byte) bool {
return !is_space(in_char)
}
[inline]
fn is_digit(in_char byte) bool {
tmp := in_char - `0`
return tmp <= 0x09
}
[inline]
fn is_not_digit(in_char byte) bool {
return !is_digit(in_char)
}
/*
[inline]
fn is_wordchar(in_char byte) bool {
return is_alnum(in_char) || in_char == `_`
}
[inline]
fn is_not_wordchar(in_char byte) bool {
return !is_alnum(in_char)
}
*/
[inline]
fn is_lower(in_char byte) bool {
tmp := in_char - `a`
return tmp <= 25
}
[inline]
fn is_upper(in_char byte) bool {
tmp := in_char - `A`
return tmp <= 25
}
pub fn (re RE) get_parse_error_string(err int) string {
match err {
regex.compile_ok { return 'compile_ok' }
regex.no_match_found { return 'no_match_found' }
regex.err_char_unknown { return 'err_char_unknown' }
regex.err_undefined { return 'err_undefined' }
regex.err_internal_error { return 'err_internal_error' }
regex.err_cc_alloc_overflow { return 'err_cc_alloc_overflow' }
regex.err_syntax_error { return 'err_syntax_error' }
regex.err_groups_overflow { return 'err_groups_overflow' }
regex.err_groups_max_nested { return 'err_groups_max_nested' }
regex.err_group_not_balanced { return 'err_group_not_balanced' }
regex.err_group_qm_notation { return 'err_group_qm_notation' }
else { return 'err_unknown' }
}
}
// utf8_str convert and utf8 sequence to a printable string
[inline]
fn utf8_str(ch rune) string {
mut i := 4
mut res := ''
for i > 0 {
v := byte((ch >> ((i - 1) * 8)) & 0xFF)
if v != 0 {
res += '${v:1c}'
}
i--
}
return res
}
// simple_log default log function
fn simple_log(txt string) {
print(txt)
}
/******************************************************************************
*
* Token Structs
*
******************************************************************************/
pub type FnValidator = fn (byte) bool
struct Token {
mut:
ist rune
// char
ch rune // char of the token if any
ch_len byte // char len
// Quantifiers / branch
rep_min int // used also for jump next in the OR branch [no match] pc jump
rep_max int // used also for jump next in the OR branch [ match] pc jump
greedy bool // greedy quantifier flag
// Char class
cc_index int = -1
// counters for quantifier check (repetitions)
rep int
// validator function pointer
validator FnValidator
// groups variables
group_rep int // repetition of the group
group_id int = -1 // id of the group
goto_pc int = -1 // jump to this PC if is needed
// OR flag for the token
next_is_or bool // true if the next token is an OR
// dot_char token variables
dot_check_pc int = -1 // pc of the next token to check
last_dot_flag bool // if true indicate that is the last dot_char in the regex
}
[inline]
fn (mut tok Token) reset() {
tok.rep = 0
}
/******************************************************************************
*
* Regex struct
*
******************************************************************************/
pub const (
f_nl = 0x00000001 // end the match when find a new line symbol
f_ms = 0x00000002 // match true only if the match is at the start of the string
f_me = 0x00000004 // match true only if the match is at the end of the string
f_efm = 0x00000100 // exit on first token matched, used by search
f_bin = 0x00000200 // work only on bytes, ignore utf-8
// behaviour modifier flags
f_src = 0x00020000 // search mode enabled
)
struct StateDotObj {
mut:
i int = -1 // char index in the input buffer
pc int = -1 // program counter saved
mi int = -1 // match_index saved
group_stack_index int = -1 // continuous save on capturing groups
}
pub type FnLog = fn (string)
pub struct RE {
pub mut:
prog []Token
prog_len int // regex program len
// char classes storage
cc []CharClass // char class list
cc_index int // index
// groups
group_count int // number of groups in this regex struct
groups []int // groups index results
group_max_nested int = 3 // max nested group
group_max int = 8 // max allowed number of different groups
state_list []StateObj
group_csave_flag bool // flag to enable continuous saving
group_csave []int //= []int{} // groups continuous save list
group_map map[string]int // groups names map
group_stack []int
group_data []int
// flags
flag int // flag for optional parameters
// Debug/log
debug int // enable in order to have the unroll of the code 0 = NO_DEBUG, 1 = LIGHT 2 = VERBOSE
log_func FnLog = simple_log // log function, can be customized by the user
query string // query string
}
// Reset RE object
[direct_array_access; inline]
fn (mut re RE) reset() {
re.cc_index = 0
mut i := 0
for i < re.prog_len {
re.prog[i].group_rep = 0 // clear repetition of the group
re.prog[i].rep = 0 // clear repetition of the token
i++
}
// init groups array
if re.group_count > 0 {
re.groups = []int{len: re.group_count * 2, init: -1}
}
// reset group_csave
if re.group_csave_flag == true {
re.group_csave.clear() // = []int{}
}
// reset state list
re.state_list.clear()
re.group_stack.clear()
}
// reset for search mode fail
// gcc bug, dont use [inline] or go 5 time slower
//[inline]
[direct_array_access]
fn (mut re RE) reset_src() {
mut i := 0
for i < re.prog_len {
re.prog[i].group_rep = 0 // clear repetition of the group
re.prog[i].rep = 0 // clear repetition of the token
i++
}
}
/******************************************************************************
*
* Backslashes chars
*
******************************************************************************/
struct BslsStruct {
ch rune // meta char
validator FnValidator // validator function pointer
}
const (
bsls_validator_array = [
BslsStruct{`w`, is_alnum},
BslsStruct{`W`, is_not_alnum},
BslsStruct{`s`, is_space},
BslsStruct{`S`, is_not_space},
BslsStruct{`d`, is_digit},
BslsStruct{`D`, is_not_digit},
BslsStruct{`a`, is_lower},
BslsStruct{`A`, is_upper},
]
// these chars are escape if preceded by a \
bsls_escape_list = [`\\`, `|`, `.`, `:`, `*`, `+`, `-`, `{`, `}`, `[`, `]`, `(`, `)`, `?`,
`^`, `!`]
)
enum BSLS_parse_state {
start
bsls_found
bsls_char
normal_char
}
// parse_bsls return (index, str_len) bsls_validator_array index, len of the backslash sequence if present
fn (re RE) parse_bsls(in_txt string, in_i int) (int, int) {
mut status := BSLS_parse_state.start
mut i := in_i
for i < in_txt.len {
// get our char
char_tmp, char_len := re.get_char(in_txt, i)
ch := byte(char_tmp)
if status == .start && ch == `\\` {
status = .bsls_found
i += char_len
continue
}
// check if is our bsls char, for now only one length sequence
if status == .bsls_found {
for c, x in regex.bsls_validator_array {
if x.ch == ch {
return c, i - in_i + 1
}
}
status = .normal_char
continue
}
// no BSLS validator, manage as normal escape char char
if status == .normal_char {
if ch in regex.bsls_escape_list {
return regex.no_match_found, i - in_i + 1
}
return regex.err_syntax_error, i - in_i + 1
}
// at the present time we manage only one char after the \
break
}
// not our bsls return KO
return regex.err_syntax_error, i
}
/******************************************************************************
*
* Char class
*
******************************************************************************/
const (
cc_null = 0 // empty cc token
cc_char = 1 // simple char: a
cc_int = 2 // char interval: a-z
cc_bsls = 3 // backslash char
cc_end = 4 // cc sequence terminator
)
struct CharClass {
mut:
cc_type int = regex.cc_null // type of cc token
ch0 rune // first char of the interval a-b a in this case
ch1 rune // second char of the interval a-b b in this case
validator FnValidator // validator function pointer
}
enum CharClass_parse_state {
start
in_char
in_bsls
separator
finish
}
fn (re RE) get_char_class(pc int) string {
buf := []byte{len: (re.cc.len)}
mut buf_ptr := unsafe { &byte(&buf) }
mut cc_i := re.prog[pc].cc_index
mut i := 0
mut tmp := 0
for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != regex.cc_end {
if re.cc[cc_i].cc_type == regex.cc_bsls {
unsafe {
buf_ptr[i] = `\\`
i++
buf_ptr[i] = byte(re.cc[cc_i].ch0)
i++
}
} else if re.cc[cc_i].ch0 == re.cc[cc_i].ch1 {
tmp = 3
for tmp >= 0 {
x := byte((re.cc[cc_i].ch0 >> (tmp * 8)) & 0xFF)
if x != 0 {
unsafe {
buf_ptr[i] = x
i++
}
}
tmp--
}
} else {
tmp = 3
for tmp >= 0 {
x := byte((re.cc[cc_i].ch0 >> (tmp * 8)) & 0xFF)
if x != 0 {
unsafe {
buf_ptr[i] = x
i++
}
}
tmp--
}
unsafe {
buf_ptr[i] = `-`
i++
}
tmp = 3
for tmp >= 0 {
x := byte((re.cc[cc_i].ch1 >> (tmp * 8)) & 0xFF)
if x != 0 {
unsafe {
buf_ptr[i] = x
i++
}
}
tmp--
}
}
cc_i++
}
unsafe {
buf_ptr[i] = byte(0)
}
return unsafe { tos_clone(buf_ptr) }
}
fn (re RE) check_char_class(pc int, ch rune) bool {
mut cc_i := re.prog[pc].cc_index
for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != regex.cc_end {
if re.cc[cc_i].cc_type == regex.cc_bsls {
if re.cc[cc_i].validator(byte(ch)) {
return true
}
} else if ch >= re.cc[cc_i].ch0 && ch <= re.cc[cc_i].ch1 {
return true
}
cc_i++
}
return false
}
// parse_char_class return (index, str_len, cc_type) of a char class [abcm-p], char class start after the [ char
fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, rune) {
mut status := CharClass_parse_state.start
mut i := in_i
mut tmp_index := re.cc_index
res_index := re.cc_index
mut cc_type := u32(regex.ist_char_class_pos)
for i < in_txt.len {
// check if we are out of memory for char classes
if tmp_index >= re.cc.len {
return regex.err_cc_alloc_overflow, 0, u32(0)
}
// get our char
char_tmp, char_len := re.get_char(in_txt, i)
ch := byte(char_tmp)
// println("CC #${i:3d} ch: ${ch:c}")
// negation
if status == .start && ch == `^` {
cc_type = u32(regex.ist_char_class_neg)
i += char_len
continue
}
// minus symbol
if status == .start && ch == `-` {
re.cc[tmp_index].cc_type = regex.cc_char
re.cc[tmp_index].ch0 = char_tmp
re.cc[tmp_index].ch1 = char_tmp
i += char_len
tmp_index++
continue
}
// bsls
if (status == .start || status == .in_char) && ch == `\\` {
// println("CC bsls.")
status = .in_bsls
i += char_len
continue
}
if status == .in_bsls {
// println("CC bsls validation.")
for c, x in regex.bsls_validator_array {
if x.ch == ch {
// println("CC bsls found [${ch:c}]")
re.cc[tmp_index].cc_type = regex.cc_bsls
re.cc[tmp_index].ch0 = regex.bsls_validator_array[c].ch
re.cc[tmp_index].ch1 = regex.bsls_validator_array[c].ch
re.cc[tmp_index].validator = regex.bsls_validator_array[c].validator
i += char_len
tmp_index++
status = .in_char
break
}
}
if status == .in_bsls {
// manage as a simple char
// println("CC bsls not found [${ch:c}]")
re.cc[tmp_index].cc_type = regex.cc_char
re.cc[tmp_index].ch0 = char_tmp
re.cc[tmp_index].ch1 = char_tmp
i += char_len
tmp_index++
status = .in_char
continue
} else {
continue
}
}
// simple char
if (status == .start || status == .in_char) && ch != `-` && ch != `]` {
status = .in_char
re.cc[tmp_index].cc_type = regex.cc_char
re.cc[tmp_index].ch0 = char_tmp
re.cc[tmp_index].ch1 = char_tmp
i += char_len
tmp_index++
continue
}
// check range separator
if status == .in_char && ch == `-` {
status = .separator
i += char_len
continue
}
// check range end
if status == .separator && ch != `]` && ch != `-` {
status = .in_char
re.cc[tmp_index - 1].cc_type = regex.cc_int
re.cc[tmp_index - 1].ch1 = char_tmp
i += char_len
continue
}
// char class end
if status == .in_char && ch == `]` {
re.cc[tmp_index].cc_type = regex.cc_end
re.cc[tmp_index].ch0 = 0
re.cc[tmp_index].ch1 = 0
re.cc_index = tmp_index + 1
return res_index, i - in_i + 2, cc_type
}
i++
}
return regex.err_syntax_error, 0, u32(0)
}
/******************************************************************************
*
* Re Compiler
*
******************************************************************************/
//
// Quantifier
//
enum Quant_parse_state {
start
min_parse
comma_checked
max_parse
greedy
gredy_parse
finish
}
// parse_quantifier return (min, max, str_len, greedy_flag) of a {min,max}? quantifier starting after the { char
fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
mut status := Quant_parse_state.start
mut i := in_i
mut q_min := 0 // default min in a {} quantifier is 1
mut q_max := 0 // deafult max in a {} quantifier is max_quantifier
mut ch := byte(0)
for i < in_txt.len {
unsafe {
ch = in_txt.str[i]
}
// println("${ch:c} status: $status")
// exit on no compatible char with {} quantifier
if utf8util_char_len(ch) != 1 {
return regex.err_syntax_error, i, 0, false
}
// min parsing skip if comma present
if status == .start && ch == `,` {
q_min = 0 // default min in a {} quantifier is 0
status = .comma_checked
i++
continue
}
if status == .start && is_digit(ch) {
status = .min_parse
q_min *= 10
q_min += int(ch - `0`)
i++
continue
}
if status == .min_parse && is_digit(ch) {
q_min *= 10
q_min += int(ch - `0`)
i++
continue
}
// we have parsed the min, now check the max
if status == .min_parse && ch == `,` {
status = .comma_checked
i++
continue
}
// single value {4}
if status == .min_parse && ch == `}` {
q_max = q_min
status = .greedy
continue
}
// end without max
if status == .comma_checked && ch == `}` {
q_max = regex.max_quantifier
status = .greedy
continue
}
// start max parsing
if status == .comma_checked && is_digit(ch) {
status = .max_parse
q_max *= 10
q_max += int(ch - `0`)
i++
continue
}
// parse the max
if status == .max_parse && is_digit(ch) {
q_max *= 10
q_max += int(ch - `0`)
i++
continue
}
// finished the quantifier
if status == .max_parse && ch == `}` {
status = .greedy
continue
}
// check if greedy flag char ? is present
if status == .greedy {
if i + 1 < in_txt.len {
i++
status = .gredy_parse
continue
}
return q_min, q_max, i - in_i + 2, false
}
// check the greedy flag
if status == .gredy_parse {
if ch == `?` {
return q_min, q_max, i - in_i + 2, true
} else {
i--
return q_min, q_max, i - in_i + 2, false
}
}
// not a {} quantifier, exit
return regex.err_syntax_error, i, 0, false
}
// not a conform {} quantifier
return regex.err_syntax_error, i, 0, false
}
//
// Groups
//
enum Group_parse_state {
start
q_mark // (?
q_mark1 // (?:|P checking
p_status // (?P
p_start // (?P<
p_end // (?P<...>
p_in_name // (?P<...
finish
}
// parse_groups parse a group for ? (question mark) syntax, if found, return (error, capture_flag, name_of_the_group, next_index)
fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
mut status := Group_parse_state.start
mut i := in_i
mut name := ''
for i < in_txt.len && status != .finish {
// get our char
char_tmp, char_len := re.get_char(in_txt, i)
ch := byte(char_tmp)
// start
if status == .start && ch == `(` {
status = .q_mark
i += char_len
continue
}
// check for question marks
if status == .q_mark && ch == `?` {
status = .q_mark1
i += char_len
continue
}
// non capturing group
if status == .q_mark1 && ch == `:` {
i += char_len
return 0, false, name, i
}
// enter in P section
if status == .q_mark1 && ch == `P` {
status = .p_status
i += char_len
continue
}
// not a valid q mark found
if status == .q_mark1 {
// println("NO VALID Q MARK")
return -2, true, name, i
}
if status == .p_status && ch == `<` {
status = .p_start
i += char_len
continue
}
if status == .p_start && ch != `>` {
status = .p_in_name
name += '${ch:1c}' // TODO: manage utf8 chars
i += char_len
continue
}
// colect name
if status == .p_in_name && ch != `>` && is_alnum(ch) {
name += '${ch:1c}' // TODO: manage utf8 chars
i += char_len
continue
}
// end name
if status == .p_in_name && ch == `>` {
i += char_len
return 0, true, name, i
}
// error on name group
if status == .p_in_name {
return -2, true, name, i
}
// normal group, nothig to do, exit
return 0, true, name, i
}
// UNREACHABLE
// println("ERROR!! NOT MEANT TO BE HERE!!1")
return -2, true, name, i
}
const (
quntifier_chars = [rune(`+`), `*`, `?`, `{`]
)
//
// main compiler
//
// compile return (return code, index) where index is the index of the error in the query string if return code is an error code
fn (mut re RE) impl_compile(in_txt string) (int, int) {
mut i := 0 // input string index
mut pc := 0 // program counter
// group management variables
mut group_count := -1
mut group_stack := []int{len: re.group_max_nested, init: 0}
mut group_stack_txt_index := []int{len: re.group_max_nested, init: -1}
mut group_stack_index := -1
re.query = in_txt // save the query string
i = 0
for i < in_txt.len {
mut char_tmp := u32(0)
mut char_len := 0
// println("i: ${i:3d} ch: ${in_txt.str[i]:c}")
char_tmp, char_len = re.get_char(in_txt, i)
//
// check special cases: $ ^
//
if char_len == 1 && i == 0 && byte(char_tmp) == `^` {
re.flag = regex.f_ms
i = i + char_len
continue
}
if char_len == 1 && i == (in_txt.len - 1) && byte(char_tmp) == `$` {
re.flag = regex.f_me
i = i + char_len
continue
}
// ist_group_start
if char_len == 1 && pc >= 0 && byte(char_tmp) == `(` {
// check max groups allowed
if group_count > re.group_max {
return regex.err_groups_overflow, i + 1
}
group_stack_index++
// check max nested groups allowed
if group_stack_index > re.group_max_nested {
return regex.err_groups_max_nested, i + 1
}
tmp_res, cgroup_flag, cgroup_name, next_i := re.parse_groups(in_txt, i)
// manage question mark format error
if tmp_res < -1 {
return regex.err_group_qm_notation, next_i
}
// println("Parse group: [$tmp_res, $cgroup_flag, ($i,$next_i), '${in_txt[i..next_i]}' ]")
i = next_i
if cgroup_flag == true {
group_count++
}
// calculate the group id
// if it is a named group, recycle the group id
// NOTE: **** the group index is +1 because map return 0 when not found!! ****
mut group_id := group_count
if cgroup_name.len > 0 {
// println("GROUP NAME: ${cgroup_name}")
if cgroup_name in re.group_map {
group_id = re.group_map[cgroup_name] - 1
group_count--
} else {
re.group_map[cgroup_name] = group_id + 1
}
}
group_stack_txt_index[group_stack_index] = i
group_stack[group_stack_index] = pc
re.prog[pc].ist = u32(0) | regex.ist_group_start
re.prog[pc].rep_min = 1
re.prog[pc].rep_max = 1
// set the group id
if cgroup_flag == false {
// println("NO CAPTURE GROUP")
re.prog[pc].group_id = -1
} else {
re.prog[pc].group_id = group_id
}
pc = pc + 1
continue
}
// ist_group_end
if char_len == 1 && pc > 0 && byte(char_tmp) == `)` {
if group_stack_index < 0 {
return regex.err_group_not_balanced, i + 1
}
goto_pc := group_stack[group_stack_index]
group_stack_index--
re.prog[pc].ist = u32(0) | regex.ist_group_end
re.prog[pc].rep_min = 1
re.prog[pc].rep_max = 1
re.prog[pc].goto_pc = goto_pc // PC where to jump if a group need
re.prog[pc].group_id = re.prog[goto_pc].group_id // id of this group, used for storing data
re.prog[goto_pc].goto_pc = pc // start goto point to the end group pc
// re.prog[goto_pc].group_id = group_count // id of this group, used for storing data
pc = pc + 1
i = i + char_len
continue
}
// ist_dot_char match any char except the following token
if char_len == 1 && pc >= 0 && byte(char_tmp) == `.` {
re.prog[pc].ist = u32(0) | regex.ist_dot_char
re.prog[pc].rep_min = 1
re.prog[pc].rep_max = 1
pc = pc + 1
i = i + char_len
continue
}
// OR branch
if char_len == 1 && pc > 0 && byte(char_tmp) == `|` {
// two consecutive ist_dot_char are an error
if pc > 0 && re.prog[pc - 1].ist == regex.ist_or_branch {
return regex.err_syntax_error, i
}
re.prog[pc].ist = u32(0) | regex.ist_or_branch
pc = pc + 1
i = i + char_len
continue
}
// Quantifiers
if char_len == 1 && pc > 0 {
mut char_next := rune(0)
mut char_next_len := 0
if (char_len + i) < in_txt.len {
char_next, char_next_len = re.get_char(in_txt, i + char_len)
}
mut quant_flag := true
match byte(char_tmp) {
`?` {
// println("q: ${char_tmp:c}")
// check illegal quantifier sequences
if char_next_len == 1 && char_next in regex.quntifier_chars {
return regex.err_syntax_error, i
}
re.prog[pc - 1].rep_min = 0
re.prog[pc - 1].rep_max = 1
}
`+` {
// println("q: ${char_tmp:c}")
// check illegal quantifier sequences
if char_next_len == 1 && char_next in regex.quntifier_chars {
return regex.err_syntax_error, i
}
re.prog[pc - 1].rep_min = 1
re.prog[pc - 1].rep_max = regex.max_quantifier
}
`*` {
// println("q: ${char_tmp:c}")
// check illegal quantifier sequences
if char_next_len == 1 && char_next in regex.quntifier_chars {
return regex.err_syntax_error, i
}
re.prog[pc - 1].rep_min = 0
re.prog[pc - 1].rep_max = regex.max_quantifier
}
`{` {
min, max, tmp, greedy := re.parse_quantifier(in_txt, i + 1)
// it is a quantifier
if min >= 0 {
// println("{$min,$max}\n str:[${in_txt[i..i+tmp]}] greedy:$greedy")
i = i + tmp
re.prog[pc - 1].rep_min = min
re.prog[pc - 1].rep_max = max
re.prog[pc - 1].greedy = greedy
// check illegal quantifier sequences
if i <= in_txt.len {
char_next, char_next_len = re.get_char(in_txt, i)
if char_next_len == 1 && char_next in regex.quntifier_chars {
return regex.err_syntax_error, i
}
}
continue
} else {
return min, i
}
// TODO: decide if the open bracket can be conform without the close bracket
/*
// no conform, parse as normal char
else {
quant_flag = false
}
*/
}
else {
quant_flag = false
}
}
if quant_flag {
i = i + char_len
continue
}
}
// IST_CHAR_CLASS_*
if char_len == 1 && pc >= 0 {
if byte(char_tmp) == `[` {
cc_index, tmp, cc_type := re.parse_char_class(in_txt, i + 1)
if cc_index >= 0 {
// println("index: $cc_index str:${in_txt[i..i+tmp]}")
i = i + tmp
re.prog[pc].ist = u32(0) | cc_type
re.prog[pc].cc_index = cc_index
re.prog[pc].rep_min = 1
re.prog[pc].rep_max = 1
pc = pc + 1
continue
}
// cc_class vector memory full
else if cc_index < 0 {
return cc_index, i
}
}
}
// ist_bsls_char
if char_len == 1 && pc >= 0 {
if byte(char_tmp) == `\\` {
bsls_index, tmp := re.parse_bsls(in_txt, i)
// println("index: $bsls_index str:${in_txt[i..i+tmp]}")
if bsls_index >= 0 {
i = i + tmp
re.prog[pc].ist = u32(0) | regex.ist_bsls_char
re.prog[pc].rep_min = 1
re.prog[pc].rep_max = 1
re.prog[pc].validator = regex.bsls_validator_array[bsls_index].validator
re.prog[pc].ch = regex.bsls_validator_array[bsls_index].ch
pc = pc + 1
continue
}
// this is an escape char, skip the bsls and continue as a normal char
else if bsls_index == regex.no_match_found {
i += char_len
char_tmp, char_len = re.get_char(in_txt, i)
// continue as simple char
}
// if not an escape or a bsls char then it is an error (at least for now!)
else {
return bsls_index, i + tmp
}
}
}
// ist_simple_char
re.prog[pc].ist = regex.ist_simple_char
re.prog[pc].ch = char_tmp
re.prog[pc].ch_len = byte(char_len)
re.prog[pc].rep_min = 1
re.prog[pc].rep_max = 1
// println("char: ${char_tmp:c}")
pc = pc + 1
i += char_len
}
// add end of the program
re.prog[pc].ist = regex.ist_prog_end
re.prog_len = pc
// check for unbalanced groups
if group_stack_index != -1 {
return regex.err_group_not_balanced, group_stack_txt_index[group_stack_index] + 1
}
// check for OR at the end of the program
if pc > 0 && re.prog[pc - 1].ist == regex.ist_or_branch {
return regex.err_syntax_error, in_txt.len
}
// store the number of groups in the query
re.group_count = group_count + 1
//******************************************
// Post processing
//******************************************
//
// manage ist_dot_char
//
// find the checks for dot chars, if any...
mut pc1 := 0
mut dot_char_count := 0
mut last_dot_char_pc := -1
for pc1 < pc {
if re.prog[pc1].ist == regex.ist_dot_char {
// println("Dot_char pc: $pc1")
last_dot_char_pc = pc1
dot_char_count++
mut pc2 := pc1 + 1
for pc2 < pc {
if re.prog[pc2].ist == regex.ist_dot_char {
return regex.err_syntax_error, 0
}
if re.prog[pc2].ist !in [rune(regex.ist_prog_end), regex.ist_group_end,
regex.ist_group_start] {
// println("Next dot char check is PC: ${pc2}")
re.prog[pc1].dot_check_pc = pc2
break
}
pc2++
}
}
pc1++
}
// println("last_dot_char_pc: $last_dot_char_pc")
if last_dot_char_pc >= 0 {
pc1 = last_dot_char_pc + 1
mut is_last_dot := true
for pc1 < pc {
if re.prog[pc1].ist !in [rune(regex.ist_prog_end), regex.ist_group_end] {
is_last_dot = false
break
}
pc1++
}
if is_last_dot {
re.prog[last_dot_char_pc].last_dot_flag = true
}
}
//******************************************
// OR branch
// a|b|cd
// d exit point
// a,b,c branches
// set the jump in the right places
pc1 = 0
for pc1 < pc - 2 {
// println("Here $pc1 ${pc-2}")
// two consecutive OR are a syntax error
if re.prog[pc1 + 1].ist == regex.ist_or_branch
&& re.prog[pc1 + 2].ist == regex.ist_or_branch {
return regex.err_syntax_error, i
}
// manange a|b chains like a|(b)|c|d...
// standard solution
if re.prog[pc1].ist != regex.ist_or_branch && re.prog[pc1 + 1].ist == regex.ist_or_branch
&& re.prog[pc1 + 2].ist != regex.ist_or_branch {
re.prog[pc1].next_is_or = true // set that the next token is an OR
re.prog[pc1 + 1].rep_min = pc1 + 2 // failed match jump
// match jump, if an OR chain the next token will be an OR token
mut pc2 := pc1 + 2
for pc2 < pc - 1 {
ist := re.prog[pc2].ist
if ist == regex.ist_group_start {
re.prog[pc1 + 1].rep_max = re.prog[pc2].goto_pc + 1
break
}
if ist != regex.ist_or_branch {
re.prog[pc1 + 1].rep_max = pc2 + 1
break
}
pc2++
}
// special case query of few chars, teh true can't go on the first instruction
if re.prog[pc1 + 1].rep_max == pc1 {
re.prog[pc1 + 1].rep_max = 3
}
// println("Compile OR postproc. [$pc1,OR ${pc1+1},$pc2]")
pc1 = pc2
continue
}
pc1++
}
//******************************************
// DEBUG PRINT REGEX GENERATED CODE
//******************************************
if re.debug > 0 {
gc := re.get_code()
re.log_func(gc)
}
//******************************************
return regex.compile_ok, 0
}
// get_code return the compiled code as regex string, note: may be different from the source!
pub fn (re RE) get_code() string {
mut pc1 := 0
mut res := strings.new_builder(re.cc.len * 2 * re.prog.len)
res.write_string('========================================\nv RegEx compiler v $regex.v_regex_version output:\n')
mut stop_flag := false
for pc1 <= re.prog.len {
tk := re.prog[pc1]
res.write_string('PC:${pc1:3d}')
res.write_string(' ist: ')
res.write_string('${tk.ist:8x}'.replace(' ', '0'))
res.write_string(' ')
ist := tk.ist
if ist == regex.ist_bsls_char {
res.write_string('[\\${tk.ch:1c}] BSLS')
} else if ist == regex.ist_prog_end {
res.write_string('PROG_END')
stop_flag = true
} else if ist == regex.ist_or_branch {
res.write_string('OR ')
} else if ist == regex.ist_char_class_pos {
res.write_string('[${re.get_char_class(pc1)}] CHAR_CLASS_POS')
} else if ist == regex.ist_char_class_neg {
res.write_string('[^${re.get_char_class(pc1)}] CHAR_CLASS_NEG')
} else if ist == regex.ist_dot_char {
res.write_string('. DOT_CHAR nx chk: $tk.dot_check_pc')
if tk.last_dot_flag == true {
res.write_string(' last!')
}
} else if ist == regex.ist_group_start {
res.write_string('( GROUP_START #:$tk.group_id')
if tk.group_id == -1 {
res.write_string(' ?:')
} else {
for x in re.group_map.keys() {
if re.group_map[x] == (tk.group_id + 1) {
res.write_string(' ?P<$x>')
break
}
}
}
} else if ist == regex.ist_group_end {
res.write_string(') GROUP_END #:$tk.group_id')
} else if ist == regex.ist_simple_char {
res.write_string('[${tk.ch:1c}] query_ch')
}
if tk.rep_max == regex.max_quantifier {
res.write_string(' {${tk.rep_min:3d},MAX}')
} else {
if ist == regex.ist_or_branch {
res.write_string(' if false go: ${tk.rep_min:3d} if true go: ${tk.rep_max:3d}')
} else {
res.write_string(' {${tk.rep_min:3d},${tk.rep_max:3d}}')
}
if tk.greedy == true {
res.write_string('?')
}
}
res.write_string('\n')
if stop_flag {
break
}
pc1++
}
res.write_string('========================================\n')
return res.str()
}
// get_query return a string with a reconstruction of the query starting from the regex program code
pub fn (re RE) get_query() string {
mut res := strings.new_builder(re.query.len * 2)
if (re.flag & regex.f_ms) != 0 {
res.write_string('^')
}
mut i := 0
for i < re.prog.len && re.prog[i].ist != regex.ist_prog_end && re.prog[i].ist != 0 {
tk := unsafe { &re.prog[i] }
ch := tk.ist
// GROUP start
if ch == regex.ist_group_start {
if re.debug == 0 {
res.write_string('(')
} else {
if tk.group_id == -1 {
res.write_string('(?:') // non capturing group
} else {
res.write_string('#${tk.group_id}(')
}
}
for x in re.group_map.keys() {
if re.group_map[x] == (tk.group_id + 1) {
res.write_string('?P<$x>')
break
}
}
i++
continue
}
// GROUP end
if ch == regex.ist_group_end {
res.write_string(')')
}
// OR branch
if ch == regex.ist_or_branch {
res.write_string('|')
if re.debug > 0 {
res.write_string('{$tk.rep_min,$tk.rep_max}')
}
i++
continue
}
// char class
if ch == regex.ist_char_class_neg || ch == regex.ist_char_class_pos {
res.write_string('[')
if ch == regex.ist_char_class_neg {
res.write_string('^')
}
res.write_string('${re.get_char_class(i)}')
res.write_string(']')
}
// bsls char
if ch == regex.ist_bsls_char {
res.write_string('\\${tk.ch:1c}')
}
// ist_dot_char
if ch == regex.ist_dot_char {
res.write_string('.')
}
// char alone
if ch == regex.ist_simple_char {
if byte(ch) in regex.bsls_escape_list {
res.write_string('\\')
}
res.write_string('${tk.ch:c}')
}
// quantifier
if !(tk.rep_min == 1 && tk.rep_max == 1) {
if tk.rep_min == 0 && tk.rep_max == 1 {
res.write_string('?')
} else if tk.rep_min == 1 && tk.rep_max == regex.max_quantifier {
res.write_string('+')
} else if tk.rep_min == 0 && tk.rep_max == regex.max_quantifier {
res.write_string('*')
} else {
if tk.rep_max == regex.max_quantifier {
res.write_string('{$tk.rep_min,MAX}')
} else {
res.write_string('{$tk.rep_min,$tk.rep_max}')
}
if tk.greedy == true {
res.write_string('?')
}
}
}
i++
}
if (re.flag & regex.f_me) != 0 {
res.write_string('$')
}
return res.str()
}
/******************************************************************************
*
* Groups saving utilities
*
******************************************************************************/
[direct_array_access]
fn (mut re RE) group_continuous_save(g_index int) {
if re.group_csave_flag == true {
// continuous save, save until we have space
// init the first element as counter
if re.group_csave.len == 0 {
re.group_csave << 0
}
gi := g_index >> 1
start := re.groups[g_index]
end := re.groups[g_index + 1]
// check if we are simply increasing the size ot the found group
if re.group_csave.len >= 4 && gi == re.group_csave[re.group_csave.len - 3]
&& start == re.group_csave[re.group_csave.len - 2] {
re.group_csave[re.group_csave.len - 1] = end
return
}
// otherwise append a new group to the list
// increment counter
re.group_csave[0]++
// save the record
re.group_csave << (g_index >> 1) // group id
re.group_csave << re.groups[g_index] // start
re.group_csave << re.groups[g_index + 1] // end
}
}
/******************************************************************************
*
* Matching
*
******************************************************************************/
enum Match_state {
start = 0
stop
end
new_line
ist_load // load and execute instruction
ist_next // go to next instruction
ist_next_ks // go to next instruction without clenaning the state
ist_quant_p // match positive ,quantifier check
ist_quant_n // match negative, quantifier check
ist_quant_pg // match positive ,group quantifier check
ist_quant_ng // match negative ,group quantifier check
}
fn state_str(s Match_state) string {
match s {
.start { return 'start' }
.stop { return 'stop' }
.end { return 'end' }
.new_line { return 'new line' }
.ist_load { return 'ist_load' }
.ist_next { return 'ist_next' }
.ist_next_ks { return 'ist_next_ks' }
.ist_quant_p { return 'ist_quant_p' }
.ist_quant_n { return 'ist_quant_n' }
.ist_quant_pg { return 'ist_quant_pg' }
.ist_quant_ng { return 'ist_quant_ng' }
}
}
struct StateObj {
pub mut:
group_index int = -1 // group id used to know how many groups are open
match_flag bool // indicate if we are in a match condition
match_index int = -1 // index of the last match
first_match int = -1 // index of the first match
pc int = -1 // program counter
i int = -1 // source string index
char_len int // last char legth
last_dot_pc int = -1 // last dot chat pc
}
[direct_array_access]
pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) {
// result status
mut result := regex.no_match_found // function return
mut ch := rune(0) // examinated char
mut char_len := 0 // utf8 examinated char len
mut m_state := Match_state.start // start point for the matcher FSM
mut src_end := false
mut last_fnd_pc := -1
mut state := StateObj{} // actual state
mut ist := rune(0) // actual instruction
mut l_ist := rune(0) // last matched instruction
mut step_count := 0 // stats for debug
mut dbg_line := 0 // count debug line printed
re.reset()
if re.debug > 0 {
// print header
mut h_buf := strings.new_builder(32)
h_buf.write_string('flags: ')
h_buf.write_string('${re.flag:8x}'.replace(' ', '0'))
h_buf.write_string('\n')
sss := h_buf.str()
re.log_func(sss)
}
for m_state != .end {
if state.pc >= 0 && state.pc < re.prog.len {
ist = re.prog[state.pc].ist
} else if state.pc >= re.prog.len {
// println("ERROR!! PC overflow!!")
return regex.err_internal_error, state.i
}
//******************************************
// DEBUG LOG
//******************************************
if re.debug > 0 {
mut buf2 := strings.new_builder(re.cc.len + 128)
// print all the instructions
// end of the input text
if state.i >= in_txt_len {
buf2.write_string('# ${step_count:3d} END OF INPUT TEXT\n')
sss := buf2.str()
re.log_func(sss)
} else {
// print only the exe instruction
if (re.debug == 1 && m_state == .ist_load) || re.debug == 2 {
if ist == regex.ist_prog_end {
buf2.write_string('# ${step_count:3d} PROG_END\n')
} else if ist == 0 || m_state in [.start, .ist_next, .stop] {
buf2.write_string('# ${step_count:3d} s: ${state_str(m_state):12s} PC: NA\n')
} else {
ch, char_len = re.get_charb(in_txt, state.i)
buf2.write_string('# ${step_count:3d} s: ${state_str(m_state):12s} PC: ${state.pc:3d}=>')
buf2.write_string('${ist:8x}'.replace(' ', '0'))
buf2.write_string(" i,ch,len:[${state.i:3d},'${utf8_str(ch)}',$char_len] f.m:[${state.first_match:3d},${state.match_index:3d}] ")
if ist == regex.ist_simple_char {
buf2.write_string('query_ch: [${re.prog[state.pc].ch:1c}]')
} else {
if ist == regex.ist_bsls_char {
buf2.write_string('BSLS [\\${re.prog[state.pc].ch:1c}]')
} else if ist == regex.ist_prog_end {
buf2.write_string('PROG_END')
} else if ist == regex.ist_or_branch {
buf2.write_string('OR')
} else if ist == regex.ist_char_class_pos {
buf2.write_string('CHAR_CLASS_POS[${re.get_char_class(state.pc)}]')
} else if ist == regex.ist_char_class_neg {
buf2.write_string('CHAR_CLASS_NEG[${re.get_char_class(state.pc)}]')
} else if ist == regex.ist_dot_char {
buf2.write_string('DOT_CHAR')
} else if ist == regex.ist_group_start {
tmp_gi := re.prog[state.pc].group_id
tmp_gr := re.prog[re.prog[state.pc].goto_pc].group_rep
buf2.write_string('GROUP_START #:$tmp_gi rep:$tmp_gr ')
} else if ist == regex.ist_group_end {
buf2.write_string('GROUP_END #:${re.prog[state.pc].group_id} deep:$state.group_index')
}
}
if re.prog[state.pc].rep_max == regex.max_quantifier {
buf2.write_string('{${re.prog[state.pc].rep_min},MAX}:${re.prog[state.pc].rep}')
} else {
buf2.write_string('{${re.prog[state.pc].rep_min},${re.prog[state.pc].rep_max}}:${re.prog[state.pc].rep}')
}
if re.prog[state.pc].greedy == true {
buf2.write_string('?')
}
buf2.write_string(' (#$state.group_index)')
if ist == regex.ist_dot_char {
buf2.write_string(' last!')
}
buf2.write_string('\n')
}
sss2 := buf2.str()
re.log_func(sss2)
}
}
step_count++
dbg_line++
}
//******************************************
if ist == regex.ist_prog_end {
// println("HERE we end!")
break
}
// we're out of text, manage it
if state.i >= in_txt_len || m_state == .new_line {
// println("Finished text!!")
src_end = true
// manage groups
if state.group_index >= 0 && state.match_index >= 0 {
// println("End text with open groups!")
// close the groups
for state.group_index >= 0 {
tmp_pc := re.group_data[state.group_index]
re.prog[tmp_pc].group_rep++
// println("Closing group $state.group_index {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:${re.prog[tmp_pc].group_rep}")
if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min
&& re.prog[tmp_pc].group_id >= 0 {
start_i := re.group_stack[state.group_index]
re.group_stack[state.group_index] = -1
// save group results
g_index := re.prog[tmp_pc].group_id * 2
if start_i >= 0 {
re.groups[g_index] = start_i
} else {
re.groups[g_index] = 0
}
// we have fished the text, we must manage out pf bound indexes
if state.i >= in_txt_len {
state.i = in_txt_len - 1
}
re.groups[g_index + 1] = state.i
if re.groups[g_index + 1] >= in_txt_len {
// println("clamp group on stop!")
re.groups[g_index + 1] = in_txt_len - 1
}
// continuous save, save until we have space
re.group_continuous_save(g_index)
}
state.group_index--
}
}
// the text is finished and the groups closed and we are the last group, ok exit
if ist == regex.ist_group_end && re.prog[state.pc + 1].ist == regex.ist_prog_end {
// println("Last group end")
return state.first_match, state.i
}
if state.pc == -1 {
state.pc = last_fnd_pc
}
// println("Finished text!!")
// println("Instruction: ${ist:08x} pc: $state.pc")
// println("min_rep: ${re.prog[state.pc].rep_min} max_rep: ${re.prog[state.pc].rep_max} rep: ${re.prog[state.pc].rep}")
// program end
if ist == regex.ist_prog_end {
// println("Program end on end of text!")
return state.first_match, state.i
}
// we are in a last dot_ char case
if l_ist == regex.ist_dot_char {
// println("***** We have a last dot_char")
// println("PC: ${state.pc} last_dot_flag:${re.prog[state.pc].last_dot_flag}")
// println("rep: ${re.prog[state.pc].group_rep} min: ${re.prog[state.pc].rep_min} max: ${re.prog[state.pc].rep_max}")
// println("first match: ${state.first_match}")
if re.prog[state.pc].last_dot_flag == true
&& re.prog[state.pc].rep >= re.prog[state.pc].rep_min
&& re.prog[state.pc].rep <= re.prog[state.pc].rep_max {
return state.first_match, state.i
}
// println("Not fitted!!")
}
// m_state = .end
// break
// no groups open, check the last token quantifier
if ist != regex.ist_group_end && re.prog[state.pc + 1].ist == regex.ist_prog_end {
if re.prog[state.pc].rep >= re.prog[state.pc].rep_min
&& re.prog[state.pc].rep <= re.prog[state.pc].rep_max {
// println("We are in good repetition")
return state.first_match, state.i
}
}
// print("No good exit!!")
return regex.no_match_found, 0
}
// starting and init
if m_state == .start {
state.pc = -1
state.i = 0
m_state = .ist_next
continue
}
// ist_next, next instruction reseting its state
else if m_state == .ist_next {
state.pc = state.pc + 1
re.prog[state.pc].reset()
// check if we are in the program bounds
if state.pc < 0 || state.pc > re.prog.len {
// println("ERROR!! PC overflow!!")
return regex.err_internal_error, state.i
}
m_state = .ist_load
continue
}
// ist_next_ks, next instruction keeping its state
else if m_state == .ist_next_ks {
state.pc = state.pc + 1
// check if we are in the program bounds
if state.pc < 0 || state.pc > re.prog.len {
// println("ERROR!! PC overflow!!")
return regex.err_internal_error, state.i
}
m_state = .ist_load
continue
}
// load the char
ch, char_len = re.get_charb(in_txt, state.i)
// check new line if flag f_nl enabled
if (re.flag & regex.f_nl) != 0 && char_len == 1 && byte(ch) in regex.new_line_list {
m_state = .new_line
continue
}
// check if stop
else if m_state == .stop {
// we are in search mode, don't exit until the end
if ((re.flag & regex.f_src) != 0) && (ist != regex.ist_prog_end) {
last_fnd_pc = state.pc
state.pc = -1
state.i += char_len
m_state = .ist_next
re.reset_src()
state.match_index = -1
state.first_match = -1
// reset state list
re.reset()
continue
}
if ist == regex.ist_prog_end {
return state.first_match, state.i
}
// manage here dot char
if re.state_list.len > 0 {
// println("Here we are, with stop: state buffer: [${re.state_list.len}]")
state = re.state_list.pop()
state.match_flag = true
l_ist = u32(regex.ist_dot_char)
if state.first_match < 0 {
state.first_match = state.i
}
state.match_index = state.i
re.prog[state.pc].rep++ // increase repetitions
state.i += char_len
m_state = .ist_quant_p
continue
}
// exit on no match
return result, 0
}
// ist_load
else if m_state == .ist_load {
// program end
if ist == regex.ist_prog_end {
// if we are in match exit well
if state.group_index >= 0 && state.match_index >= 0 {
state.group_index = -1
}
m_state = .stop
continue
}
// check GROUP start, no quantifier is checkd for this token!!
else if ist == regex.ist_group_start {
state.group_index++
re.group_data[state.group_index] = re.prog[state.pc].goto_pc // save where is ist_group_end, we will use it for escape
re.group_stack[state.group_index] = state.i // index where we start to manage
// println("group_index $state.group_index rep ${re.prog[re.prog[state.pc].goto_pc].group_rep}")
m_state = .ist_next
continue
}
// check GROUP end
else if ist == regex.ist_group_end {
// we are in matching streak
// println("Group END!! last ist: ${l_ist:08x}")
if state.match_index >= 0 {
// restore txt index stack and save the group data
// println("g.id: ${re.prog[state.pc].group_id} group_index: ${state.group_index}")
if state.group_index >= 0 && re.prog[state.pc].group_id >= 0 {
start_i := re.group_stack[state.group_index]
// save group results
g_index := re.prog[state.pc].group_id * 2
if start_i >= 0 {
re.groups[g_index] = start_i
} else {
re.groups[g_index] = 0
}
re.groups[g_index + 1] = state.i
if g_index > 0 && re.groups[g_index] <= re.groups[g_index - 1] {
re.groups[g_index] = re.groups[g_index - 1]
}
if re.groups[g_index + 1] >= in_txt_len {
// println("clamp group!")
re.groups[g_index + 1] = in_txt_len - 1
}
// println("GROUP ${re.prog[state.pc].group_id} END [${re.groups[g_index]}, ${re.groups[g_index+1]}] i: $state.i in_txt_len: $in_txt_len")
// continuous save, save until we have space
re.group_continuous_save(g_index)
}
re.prog[state.pc].group_rep++ // increase repetitions
// println("GROUP $group_index END ${re.prog[state.pc].group_rep}")
m_state = .ist_quant_pg
continue
}
m_state = .ist_quant_ng
continue
}
// check OR
else if ist == regex.ist_or_branch {
if state.match_index >= 0 {
state.pc = re.prog[state.pc].rep_max
// println("ist_or_branch True pc: $state.pc")
} else {
state.pc = re.prog[state.pc].rep_min
// println("ist_or_branch False pc: $state.pc")
}
re.prog[state.pc].reset()
m_state = .ist_load
continue
}
// check ist_dot_char
else if ist == regex.ist_dot_char {
// println("ist_dot_char rep: ${re.prog[state.pc].rep}")
// check next token to be false
mut next_check_flag := false
// if we are done with max go on dot char are dedicated case!!
if re.prog[state.pc].rep >= re.prog[state.pc].rep_max {
re.state_list.pop()
m_state = .ist_next
continue
}
if re.prog[state.pc].dot_check_pc >= 0
&& re.prog[state.pc].rep >= re.prog[state.pc].rep_min {
// load the char
// ch_t, _ := re.get_charb(in_txt, state.i+char_len)
ch_t := ch
chk_pc := re.prog[state.pc].dot_check_pc
// simple char
if re.prog[chk_pc].ist == regex.ist_simple_char {
if re.prog[chk_pc].ch == ch_t {
next_check_flag = true
}
// println("Check [ist_simple_char] [${re.prog[chk_pc].ch}]==[${ch_t:c}] => $next_check_flag")
}
// char char_class
else if re.prog[chk_pc].ist == regex.ist_char_class_pos
|| re.prog[chk_pc].ist == regex.ist_char_class_neg {
mut cc_neg := false
if re.prog[chk_pc].ist == regex.ist_char_class_neg {
cc_neg = true
}
mut cc_res := re.check_char_class(chk_pc, ch_t)
if cc_neg {
cc_res = !cc_res
}
next_check_flag = cc_res
// println("Check [ist_char_class] => $next_check_flag")
}
// check bsls
else if re.prog[chk_pc].ist == regex.ist_bsls_char {
next_check_flag = re.prog[chk_pc].validator(byte(ch_t))
// println("Check [ist_bsls_char] => $next_check_flag")
}
}
// check if we must continue or pass to the next IST
if next_check_flag == true && re.prog[state.pc + 1].ist != regex.ist_prog_end {
// println("save the state!!")
mut dot_state := StateObj{
group_index: state.group_index
match_flag: state.match_flag
match_index: state.match_index
first_match: state.first_match
pc: state.pc
i: state.i + char_len
char_len: char_len
last_dot_pc: state.pc
}
// if we are mananging a .* stay on the same char on return
if re.prog[state.pc].rep_min == 0 {
dot_state.i -= char_len
}
re.state_list << dot_state
m_state = .ist_quant_n
// println("dot_char stack len: ${re.state_list.len}")
continue
}
state.match_flag = true
l_ist = u32(regex.ist_dot_char)
if state.first_match < 0 {
state.first_match = state.i
}
state.match_index = state.i
re.prog[state.pc].rep++ // increase repetitions
state.i += char_len
m_state = .ist_quant_p
continue
}
// char class IST
else if ist == regex.ist_char_class_pos || ist == regex.ist_char_class_neg {
state.match_flag = false
mut cc_neg := false
if ist == regex.ist_char_class_neg {
cc_neg = true
}
mut cc_res := re.check_char_class(state.pc, ch)
if cc_neg {
cc_res = !cc_res
}
if cc_res {
state.match_flag = true
l_ist = u32(regex.ist_char_class_pos)
if state.first_match < 0 {
state.first_match = state.i
}
state.match_index = state.i
re.prog[state.pc].rep++ // increase repetitions
state.i += char_len // next char
m_state = .ist_quant_p
continue
}
m_state = .ist_quant_n
continue
}
// check bsls
else if ist == regex.ist_bsls_char {
state.match_flag = false
tmp_res := re.prog[state.pc].validator(byte(ch))
// println("BSLS in_ch: ${ch:c} res: $tmp_res")
if tmp_res {
state.match_flag = true
l_ist = u32(regex.ist_bsls_char)
if state.first_match < 0 {
state.first_match = state.i
}
state.match_index = state.i
re.prog[state.pc].rep++ // increase repetitions
state.i += char_len // next char
m_state = .ist_quant_p
continue
}
m_state = .ist_quant_n
continue
}
// simple char IST
else if ist == regex.ist_simple_char {
// println("ist_simple_char")
state.match_flag = false
if re.prog[state.pc].ch == ch {
state.match_flag = true
l_ist = regex.ist_simple_char
if state.first_match < 0 {
state.first_match = state.i
}
// println("state.match_index: ${state.match_index}")
state.match_index = state.i
re.prog[state.pc].rep++ // increase repetitions
state.i += char_len // next char
m_state = .ist_quant_p
continue
}
m_state = .ist_quant_n
continue
}
// UNREACHABLE
// println("PANIC2!! state: $m_state")
return regex.err_internal_error, state.i
}
/***********************************
* Quantifier management
***********************************/
// ist_quant_ng => quantifier negative test on group
else if m_state == .ist_quant_ng {
// we are finished here
if state.group_index < 0 {
// println("Early stop!")
result = regex.no_match_found
m_state = .stop
continue
}
tmp_pc := re.group_data[state.group_index] // PC to the end of the group token
rep := re.prog[tmp_pc].group_rep // use a temp variable
re.prog[tmp_pc].group_rep = 0 // clear the repetitions
// println(".ist_quant_ng group_pc_end: $tmp_pc rep: $rep")
if rep >= re.prog[tmp_pc].rep_min {
// println("ist_quant_ng GROUP CLOSED OK group_index: $state.group_index")
state.i = re.group_stack[state.group_index]
state.pc = tmp_pc
state.group_index--
m_state = .ist_next
continue
} else if re.prog[tmp_pc].next_is_or {
// println("ist_quant_ng OR Negative branch")
state.i = re.group_stack[state.group_index]
state.pc = re.prog[tmp_pc + 1].rep_min - 1
state.group_index--
m_state = .ist_next
continue
} else if rep > 0 && rep < re.prog[tmp_pc].rep_min {
// println("ist_quant_ng UNDER THE MINIMUM g.i: $state.group_index")
// check if we are inside a group, if yes exit from the nested groups
if state.group_index > 0 {
state.group_index--
state.pc = tmp_pc
m_state = .ist_quant_ng //.ist_next
continue
}
if state.group_index == 0 {
state.group_index--
state.pc = tmp_pc // TEST
m_state = .ist_next
continue
}
result = regex.no_match_found
m_state = .stop
continue
} else if rep == 0 && rep < re.prog[tmp_pc].rep_min {
// println("ist_quant_ng c_zero UNDER THE MINIMUM g.i: $state.group_index")
if state.group_index > 0 {
state.group_index--
state.pc = tmp_pc
m_state = .ist_quant_ng //.ist_next
continue
}
result = regex.no_match_found
m_state = .stop
continue
}
// println("DO NOT STAY HERE!! {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:$rep")
// UNREACHABLE
return regex.err_internal_error, state.i
}
// ist_quant_pg => quantifier positive test on group
else if m_state == .ist_quant_pg {
// println(".ist_quant_pg")
mut tmp_pc := state.pc
if state.group_index >= 0 {
tmp_pc = re.group_data[state.group_index]
}
rep := re.prog[tmp_pc].group_rep
if rep < re.prog[tmp_pc].rep_min {
// println("ist_quant_pg UNDER RANGE")
state.pc = re.prog[tmp_pc].goto_pc
m_state = .ist_next
continue
} else if rep == re.prog[tmp_pc].rep_max {
// println("ist_quant_pg MAX RANGE")
re.prog[tmp_pc].group_rep = 0 // clear the repetitions
state.group_index--
m_state = .ist_next
continue
} else if rep >= re.prog[tmp_pc].rep_min {
// println("ist_quant_pg IN RANGE group_index:$state.group_index")
// check greedy flag, if true exit on minimum
if re.prog[tmp_pc].greedy == true {
re.prog[tmp_pc].group_rep = 0 // clear the repetitions
state.group_index--
m_state = .ist_next
continue
}
state.pc = re.prog[tmp_pc].goto_pc - 1
state.group_index--
m_state = .ist_next
continue
}
// UNREACHABLE
// println("PANIC3!! state: $m_state")
return regex.err_internal_error, state.i
}
// ist_quant_n => quantifier negative test on token
else if m_state == .ist_quant_n {
rep := re.prog[state.pc].rep
// println("Here!! PC $state.pc is_next_or: ${re.prog[state.pc].next_is_or}")
// zero quantifier * or ?
if rep == 0 && re.prog[state.pc].rep_min == 0 {
// println("ist_quant_n c_zero RANGE MIN")
m_state = .ist_next // go to next ist
continue
}
// match + or *
else if rep >= re.prog[state.pc].rep_min {
// println("ist_quant_n MATCH RANGE")
m_state = .ist_next
continue
}
// check the OR if present
if re.prog[state.pc].next_is_or {
// println("OR present on failing")
state.match_index = -1
m_state = .ist_next
continue
}
// we are in a group manage no match from here
if state.group_index >= 0 {
// println("ist_quant_n FAILED insied a GROUP group_index:$state.group_index")
m_state = .ist_quant_ng
continue
}
// no other options
// println("ist_quant_n no_match_found")
result = regex.no_match_found
m_state = .stop
continue
// return no_match_found, 0
}
// ist_quant_p => quantifier positive test on token
else if m_state == .ist_quant_p {
// println("Here .ist_quant_p")
// exit on first match
if (re.flag & regex.f_efm) != 0 {
return state.i, state.i + 1
}
rep := re.prog[state.pc].rep
// println(rep)
// under range
if rep > 0 && rep < re.prog[state.pc].rep_min {
// println("ist_quant_p UNDER RANGE")
m_state = .ist_load // continue the loop
continue
}
// range ok, continue loop
else if rep >= re.prog[state.pc].rep_min && rep < re.prog[state.pc].rep_max {
// println("ist_quant_p IN RANGE")
// check greedy flag, if true exit on minimum
if re.prog[state.pc].greedy == true {
m_state = .ist_next
continue
}
m_state = .ist_load
continue
}
// max reached
else if rep == re.prog[state.pc].rep_max {
// println("ist_quant_p MAX RANGE")
m_state = .ist_next
continue
}
}
// UNREACHABLE
// println("PANIC4!! state: $m_state")
return regex.err_internal_error, state.i
}
// println("Check end of text!")
// Check the results
if state.match_index >= 0 {
if state.group_index < 0 {
if re.prog[state.pc].ist == regex.ist_prog_end {
// println("program ended!!")
if (re.flag & regex.f_src) != 0 {
// println("find return")
return state.first_match, state.i
} else {
// println("Here!!")
return 0, state.i
}
}
// println("No Group here, natural end [$state.first_match,$state.i] state: ${state_str(m_state)} ist: $ist pgr_end: $re.prog.len")
if re.prog[state.pc + 1].ist == regex.ist_prog_end
|| re.prog[state.pc].ist == regex.ist_prog_end {
rep := re.prog[state.pc].rep
// println("rep: $rep re.prog[state.pc].rep_min: ${re.prog[state.pc].rep_min} re.prog[state.pc].rep_max: ${re.prog[state.pc].rep_max}")
if rep >= re.prog[state.pc].rep_min && rep <= re.prog[state.pc].rep_max {
return state.first_match, state.i
}
// println("Program not finished! ")
return regex.no_match_found, 0
}
if src_end {
// println("program end")
return state.first_match, state.i
}
// print("No match found!!")
return regex.no_match_found, 0
} else {
// println("Group match! OK")
// println("first_match: $state.first_match, i: $state.i")
// println("Skip last group")
return state.first_match, state.i
// return state.first_match,re.group_stack[state.group_index--]
}
}
// println("no_match_found, natural end")
return regex.no_match_found, 0
}