regix: fixes and optimizations

pull/3615/head
penguindark 2020-01-31 02:29:54 +01:00 committed by GitHub
parent 23dd6e2294
commit de832375b7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 36 additions and 40 deletions

View File

@ -1,4 +1,4 @@
# V RegEx (Regular expression) 0.9c # V RegEx (Regular expression) 0.9d
[TOC] [TOC]

View File

@ -246,7 +246,7 @@ mut:
cc_index int = -1 cc_index int = -1
// counters for quantifier check (repetitions) // counters for quantifier check (repetitions)
rep int = 0 rep int = 0
// validator function pointer // validator function pointer
validator fn (byte) bool validator fn (byte) bool
@ -257,9 +257,10 @@ mut:
goto_pc int = -1 // jump to this PC if is needed goto_pc int = -1 // jump to this PC if is needed
// OR flag for the token // OR flag for the token
next_is_or bool = false // true if the next token is an OR next_is_or bool = false // true if the next token is an OR
} }
[inline]
fn (tok mut Token) reset() { fn (tok mut Token) reset() {
tok.rep = 0 tok.rep = 0
} }
@ -279,6 +280,7 @@ pub const (
// behaviour modifier flags // behaviour modifier flags
//F_OR = 0x00010000 // the OR work with concatenation like PCRE //F_OR = 0x00010000 // the OR work with concatenation like PCRE
F_SRC = 0x00020000 // search mode enabled
) )
struct StateDotObj{ struct StateDotObj{
@ -324,8 +326,8 @@ pub mut:
} }
// Reset RE object // Reset RE object
//[inline]
fn (re mut RE) reset(){ fn (re mut RE) reset(){
//re.group_count = 0
re.cc_index = 0 re.cc_index = 0
mut i := 0 mut i := 0
@ -345,6 +347,18 @@ fn (re mut RE) reset(){
} }
} }
// reset for search mode fail
// gcc bug, dont use [inline] or go 5 time slower
fn (re mut RE) reset_src(){
mut i := 0
for i < re.prog.len {
re.prog[i].group_rep = 0 // clear repetition of the group
re.prog[i].rep = 0 // clear repetition of the token
i++
}
re.state_stack_index = -1
}
pub fn (re RE) get_group(group_name string) (int, int) { pub fn (re RE) get_group(group_name string) (int, int) {
if group_name in re.group_map { if group_name in re.group_map {
tmp_index := re.group_map[group_name]-1 tmp_index := re.group_map[group_name]-1
@ -1654,6 +1668,18 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
// check if stop // check if stop
if m_state == .stop { if m_state == .stop {
// we are in search mode, don't exit until the end
if re.flag & F_SRC != 0 && ist != IST_PROG_END {
pc = -1
i += char_len
m_state = .ist_next
re.reset_src()
state.match_index = -1
first_match = -1
continue
}
// if we are in restore state ,do it and restart // if we are in restore state ,do it and restart
//C.printf("re.state_stack_index %d\n",re.state_stack_index ) //C.printf("re.state_stack_index %d\n",re.state_stack_index )
if re.state_stack_index >=0 && re.state_stack[re.state_stack_index].pc >= 0 { if re.state_stack_index >=0 && re.state_stack[re.state_stack_index].pc >= 0 {
@ -2187,42 +2213,12 @@ pub fn (re mut RE) match_string(in_txt string) (int,int) {
// find try to find the first match in the input string // find try to find the first match in the input string
pub fn (re mut RE) find(in_txt string) (int,int) { pub fn (re mut RE) find(in_txt string) (int,int) {
mut i := 0
mut start := -1
mut end := -1
old_flag := re.flag old_flag := re.flag
re.flag |= F_SRC // enable search mode
for i < in_txt.len { start, end := re.match_base(in_txt.str, in_txt.len)
re.flag = old_flag
// test only the first part of the query string if start >= 0 && end > start {
re.flag |= F_EFM // set to exit on the first token match return start,end
mut tmp_end := i+re.query.len
if tmp_end > in_txt.len { tmp_end = in_txt.len }
tmp_txt := string{ str: in_txt.str+i, len: tmp_end-i }
start, end = re.match_base(tmp_txt.str, tmp_txt.len)
if start >= 0 && end > start {
// test a complete match
re.flag = old_flag
tmp_txt1 := string{ str: in_txt.str+i , len: in_txt.len-i }
start, end = re.match_base(tmp_txt1.str, tmp_txt1.len)
if start >= 0 && end > start {
if (re.flag & F_MS) != 0 && (i+start) > 0 {
return NO_MATCH_FOUND, 0
}
if (re.flag & F_ME) != 0 && (i+end) < in_txt.len {
return NO_MATCH_FOUND, 0
}
return i+start, i+end
}
}
i++
if re.flag == F_MS && i>0 {
return NO_MATCH_FOUND, 0
}
} }
return NO_MATCH_FOUND, 0 return NO_MATCH_FOUND, 0
} }