regex 0.9b 'replace' and 'find_all' added
parent
e87e5e33a7
commit
25e7ceeef0
|
@ -1,6 +1,6 @@
|
||||||
/**********************************************************************
|
/**********************************************************************
|
||||||
*
|
*
|
||||||
* regex 0.9a
|
* regex 0.9b
|
||||||
*
|
*
|
||||||
* Copyright (c) 2019 Dario Deledda. All rights reserved.
|
* Copyright (c) 2019 Dario Deledda. All rights reserved.
|
||||||
* Use of this source code is governed by an MIT license
|
* Use of this source code is governed by an MIT license
|
||||||
|
@ -15,9 +15,10 @@
|
||||||
*
|
*
|
||||||
**********************************************************************/
|
**********************************************************************/
|
||||||
module regex
|
module regex
|
||||||
|
import strings
|
||||||
|
|
||||||
pub const(
|
pub const(
|
||||||
V_REGEX_VERSION = "0.9a" // regex module version
|
V_REGEX_VERSION = "0.9b" // regex module version
|
||||||
|
|
||||||
MAX_CODE_LEN = 256 // default small base code len for the regex programs
|
MAX_CODE_LEN = 256 // default small base code len for the regex programs
|
||||||
MAX_QUANTIFIER = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
|
MAX_QUANTIFIER = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
|
||||||
|
@ -196,6 +197,22 @@ pub fn (re RE) get_parse_error_string(err int) string {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// utf8_str converts a utf8 sequence, packed in a u32, to a printable string.
// The four bytes of `ch` are scanned from the most significant one to the
// least significant one: zero bytes are skipped, every other byte is
// appended to the result as a single character.
|
||||||
|
[inline]
|
||||||
|
fn utf8_str(ch u32) string {
|
||||||
|
mut i := 4
|
||||||
|
mut res := ""
|
||||||
|
for i > 0 {
|
||||||
|
v := byte((ch >> ((i-1)*8)) & 0xFF) // byte number i-1 of ch, highest byte first
|
||||||
|
if v != 0{
|
||||||
|
res += "${v:1c}"
|
||||||
|
}
|
||||||
|
i--
|
||||||
|
}
|
||||||
|
return res
|
||||||
|
}
|
||||||
|
|
||||||
// simple_log default log function
|
// simple_log default log function
|
||||||
fn simple_log(txt string) {
|
fn simple_log(txt string) {
|
||||||
C.fprintf(C.stdout, "%s",txt.str)
|
C.fprintf(C.stdout, "%s",txt.str)
|
||||||
|
@ -1013,191 +1030,152 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
|
||||||
|
|
||||||
// get_code return the compiled code as regex string, note: may be different from the source!
|
// get_code return the compiled code as regex string, note: may be different from the source!
|
||||||
pub fn (re RE) get_code() string {
|
pub fn (re RE) get_code() string {
|
||||||
mut result := ""
|
|
||||||
|
|
||||||
// use the best buffer possible
|
|
||||||
mut tmp_len := 256+128
|
|
||||||
if tmp_len < re.cc.len+128 {
|
|
||||||
tmp_len = re.cc.len+128
|
|
||||||
}
|
|
||||||
// some memory buffer
|
|
||||||
buf1 := [byte(0)].repeat(tmp_len)
|
|
||||||
buf := &buf1[0]
|
|
||||||
|
|
||||||
mut buf_ptr := buf
|
|
||||||
mut pc1 := 0
|
mut pc1 := 0
|
||||||
C.sprintf(buf_ptr, "========================================\nv RegEx compiler v%s output:\n", V_REGEX_VERSION)
|
mut res := strings.new_builder(re.cc.len*2*re.prog.len)
|
||||||
result += tos_clone(buf)
|
res.write("========================================\nv RegEx compiler v $V_REGEX_VERSION output:\n")
|
||||||
|
|
||||||
mut stop_flag := false
|
mut stop_flag := false
|
||||||
|
|
||||||
for pc1 <= re.prog.len {
|
for pc1 <= re.prog.len {
|
||||||
buf_ptr = buf
|
res.write("PC:${pc1:3d}")
|
||||||
C.sprintf(buf_ptr, "PC:%3d ist:%08x ",pc1, re.prog[pc1].ist)
|
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
res.write(" ist: ")
|
||||||
|
res.write("${re.prog[pc1].ist:8x}".replace(" ","0") )
|
||||||
|
res.write(" ")
|
||||||
ist :=re.prog[pc1].ist
|
ist :=re.prog[pc1].ist
|
||||||
if ist == IST_BSLS_CHAR {
|
if ist == IST_BSLS_CHAR {
|
||||||
C.sprintf(buf_ptr, "[\\%c] BSLS", re.prog[pc1].v_ch)
|
res.write("[\\${re.prog[pc1].v_ch:1c}] BSLS")
|
||||||
} else if ist == IST_PROG_END {
|
} else if ist == IST_PROG_END {
|
||||||
C.sprintf(buf_ptr, "PROG_END")
|
res.write("PROG_END")
|
||||||
stop_flag = true
|
stop_flag = true
|
||||||
} else if ist == IST_OR_BRANCH {
|
} else if ist == IST_OR_BRANCH {
|
||||||
C.sprintf(buf_ptr, "OR ")
|
res.write("OR ")
|
||||||
} else if ist == IST_CHAR_CLASS_POS {
|
} else if ist == IST_CHAR_CLASS_POS {
|
||||||
C.sprintf(buf_ptr, "[%s] CHAR_CLASS_POS", re.get_char_class(pc1))
|
res.write("[${re.get_char_class(pc1)}] CHAR_CLASS_POS")
|
||||||
} else if ist == IST_CHAR_CLASS_NEG {
|
} else if ist == IST_CHAR_CLASS_NEG {
|
||||||
C.sprintf(buf_ptr, "[^] CHAR_CLASS_NEG[%s]", re.get_char_class(pc1))
|
res.write("[^${re.get_char_class(pc1)}] CHAR_CLASS_NEG")
|
||||||
} else if ist == IST_DOT_CHAR {
|
} else if ist == IST_DOT_CHAR {
|
||||||
C.sprintf(buf_ptr, ". DOT_CHAR")
|
res.write(". DOT_CHAR")
|
||||||
} else if ist == IST_GROUP_START {
|
} else if ist == IST_GROUP_START {
|
||||||
C.sprintf(buf_ptr, "( GROUP_START #:%d", re.prog[pc1].group_id)
|
res.write("( GROUP_START #:${re.prog[pc1].group_id}")
|
||||||
} else if ist == IST_GROUP_END {
|
} else if ist == IST_GROUP_END {
|
||||||
C.sprintf(buf_ptr, ") GROUP_END #:%d", re.prog[pc1].group_id)
|
res.write(") GROUP_END #:${re.prog[pc1].group_id}")
|
||||||
} else if ist & SIMPLE_CHAR_MASK == 0 {
|
} else if ist & SIMPLE_CHAR_MASK == 0 {
|
||||||
C.sprintf(buf_ptr, "[%c] query_ch", ist & IST_SIMPLE_CHAR)
|
res.write("[${ist & IST_SIMPLE_CHAR:1c}] query_ch")
|
||||||
}
|
}
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
|
||||||
|
|
||||||
if re.prog[pc1].rep_max == MAX_QUANTIFIER {
|
if re.prog[pc1].rep_max == MAX_QUANTIFIER {
|
||||||
C.sprintf(buf_ptr, " {%3d,MAX}",re.prog[pc1].rep_min)
|
res.write(" {${re.prog[pc1].rep_min:3d},MAX}")
|
||||||
}else{
|
}else{
|
||||||
if ist == IST_OR_BRANCH {
|
if ist == IST_OR_BRANCH {
|
||||||
C.sprintf(buf_ptr, " if false go: %3d if true go: %3d", re.prog[pc1].rep_min, re.prog[pc1].rep_max)
|
res.write(" if false go: ${re.prog[pc1].rep_min:3d} if true go: ${re.prog[pc1].rep_max:3d}")
|
||||||
} else {
|
} else {
|
||||||
C.sprintf(buf_ptr, " {%3d,%3d}", re.prog[pc1].rep_min, re.prog[pc1].rep_max)
|
res.write(" {${re.prog[pc1].rep_min:3d},${re.prog[pc1].rep_max:3d}}")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
res.write("\n")
|
||||||
C.sprintf(buf_ptr, "\n")
|
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
|
||||||
result += tos_clone(buf)
|
|
||||||
if stop_flag {
|
if stop_flag {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
pc1++
|
pc1++
|
||||||
}
|
}
|
||||||
|
|
||||||
buf_ptr = buf
|
res.write("========================================\n")
|
||||||
C.sprintf(buf_ptr, "========================================\n")
|
return res.str()
|
||||||
|
|
||||||
result += tos_clone(buf)
|
|
||||||
return result
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// get_query return a string with a reconstruction of the query starting from the regex program code
|
// get_query return a string with a reconstruction of the query starting from the regex program code
|
||||||
|
|
||||||
pub fn (re RE) get_query() string {
|
pub fn (re RE) get_query() string {
|
||||||
// use the best buffer possible
|
mut res := strings.new_builder(re.query.len*2)
|
||||||
buf1 := [byte(0)].repeat(re.cc.len*2)
|
|
||||||
buf := &buf1[0]
|
|
||||||
mut buf_ptr := buf
|
|
||||||
|
|
||||||
if (re.flag & F_MS) != 0 {
|
if (re.flag & F_MS) != 0 {
|
||||||
C.sprintf(buf_ptr, "^")
|
res.write("^")
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
mut i := 0
|
mut i := 0
|
||||||
for i < re.prog.len && re.prog[i].ist != IST_PROG_END && re.prog[i].ist != 0{
|
for i < re.prog.len && re.prog[i].ist != IST_PROG_END && re.prog[i].ist != 0{
|
||||||
ch := re.prog[i].ist
|
ch := re.prog[i].ist
|
||||||
|
|
||||||
//C.printf("ty: %08x\n", ch)
|
|
||||||
|
|
||||||
// GROUP start
|
// GROUP start
|
||||||
if ch == IST_GROUP_START {
|
if ch == IST_GROUP_START {
|
||||||
if re.debug == 0 {
|
if re.debug == 0 {
|
||||||
C.sprintf(buf_ptr, "(")
|
res.write("(")
|
||||||
} else {
|
} else {
|
||||||
C.sprintf(buf_ptr, "#%d(", re.prog[i].group_id)
|
res.write("#${re.prog[i].group_id}(")
|
||||||
}
|
}
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
|
||||||
i++
|
i++
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// GROUP end
|
// GROUP end
|
||||||
if ch == IST_GROUP_END {
|
if ch == IST_GROUP_END {
|
||||||
C.sprintf(buf_ptr, ")")
|
res.write(")")
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// OR branch
|
// OR branch
|
||||||
if ch == IST_OR_BRANCH {
|
if ch == IST_OR_BRANCH {
|
||||||
C.sprintf(buf_ptr, "|")
|
res.write("|")
|
||||||
if re.debug > 0 {
|
if re.debug > 0 {
|
||||||
C.sprintf(buf_ptr, "{%d,%d}", re.prog[i].rep_min, re.prog[i].rep_max)
|
res.write("{${re.prog[i].rep_min},${re.prog[i].rep_max}}")
|
||||||
}
|
}
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
|
||||||
i++
|
i++
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// char class
|
// char class
|
||||||
if ch == IST_CHAR_CLASS_NEG || ch == IST_CHAR_CLASS_POS {
|
if ch == IST_CHAR_CLASS_NEG || ch == IST_CHAR_CLASS_POS {
|
||||||
C.sprintf(buf_ptr, "[")
|
res.write("[")
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
|
||||||
|
|
||||||
if ch == IST_CHAR_CLASS_NEG {
|
if ch == IST_CHAR_CLASS_NEG {
|
||||||
C.sprintf(buf_ptr, "^")
|
res.write("^")
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
|
||||||
}
|
}
|
||||||
|
res.write("${re.get_char_class(i)}")
|
||||||
C.sprintf(buf_ptr,"%s", re.get_char_class(i))
|
res.write("]")
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
|
||||||
|
|
||||||
C.sprintf(buf_ptr, "]")
|
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// bsls char
|
// bsls char
|
||||||
if ch == IST_BSLS_CHAR {
|
if ch == IST_BSLS_CHAR {
|
||||||
C.sprintf(buf_ptr, "\\%c", re.prog[i].v_ch)
|
res.write("\\${re.prog[i].v_ch:1c}")
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// IST_DOT_CHAR
|
// IST_DOT_CHAR
|
||||||
if ch == IST_DOT_CHAR {
|
if ch == IST_DOT_CHAR {
|
||||||
C.sprintf(buf_ptr, ".")
|
res.write(".")
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// char alone
|
// char alone
|
||||||
if ch & SIMPLE_CHAR_MASK == 0 {
|
if ch & SIMPLE_CHAR_MASK == 0 {
|
||||||
if byte(ch) in BSLS_ESCAPE_LIST {
|
if byte(ch) in BSLS_ESCAPE_LIST {
|
||||||
C.sprintf(buf_ptr, "\\")
|
res.write("\\")
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
|
||||||
}
|
}
|
||||||
C.sprintf(buf_ptr, "%c", re.prog[i].ist)
|
res.write("${re.prog[i].ist:c}")
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// quantifier
|
// quantifier
|
||||||
if !(re.prog[i].rep_min == 1 && re.prog[i].rep_max == 1) {
|
if !(re.prog[i].rep_min == 1 && re.prog[i].rep_max == 1) {
|
||||||
if re.prog[i].rep_min == 0 && re.prog[i].rep_max == 1 {
|
if re.prog[i].rep_min == 0 && re.prog[i].rep_max == 1 {
|
||||||
C.sprintf(buf_ptr, "?")
|
res.write("?")
|
||||||
} else if re.prog[i].rep_min == 1 && re.prog[i].rep_max == MAX_QUANTIFIER {
|
} else if re.prog[i].rep_min == 1 && re.prog[i].rep_max == MAX_QUANTIFIER {
|
||||||
C.sprintf(buf_ptr, "+")
|
res.write("+")
|
||||||
} else if re.prog[i].rep_min == 0 && re.prog[i].rep_max == MAX_QUANTIFIER {
|
} else if re.prog[i].rep_min == 0 && re.prog[i].rep_max == MAX_QUANTIFIER {
|
||||||
C.sprintf(buf_ptr, "*")
|
res.write("*")
|
||||||
} else {
|
} else {
|
||||||
if re.prog[i].rep_max == MAX_QUANTIFIER {
|
if re.prog[i].rep_max == MAX_QUANTIFIER {
|
||||||
C.sprintf(buf_ptr, "{%d,MAX}", re.prog[i].rep_min)
|
res.write("{${re.prog[i].rep_min},MAX}")
|
||||||
} else {
|
} else {
|
||||||
C.sprintf(buf_ptr, "{%d,%d}", re.prog[i].rep_min, re.prog[i].rep_max)
|
res.write("{${re.prog[i].rep_min},${re.prog[i].rep_max}}")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
i++
|
i++
|
||||||
}
|
}
|
||||||
if (re.flag & F_ME) != 0 {
|
if (re.flag & F_ME) != 0 {
|
||||||
C.sprintf(buf_ptr, "$")
|
res.write("$")
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
|
||||||
}
|
}
|
||||||
res := tos_clone(buf)
|
|
||||||
|
|
||||||
return res
|
return res.str()
|
||||||
}
|
}
|
||||||
|
|
||||||
/******************************************************************************
|
/******************************************************************************
|
||||||
|
@ -1269,9 +1247,11 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
|
|
||||||
if re.debug>0 {
|
if re.debug>0 {
|
||||||
// print header
|
// print header
|
||||||
h_buf := [byte(0)].repeat(64)
|
mut h_buf := strings.new_builder(32)
|
||||||
C.sprintf(&h_buf[0], "flags: %08x\n",re.flag)
|
h_buf.write("flags: ")
|
||||||
re.log_func(tos_clone(&h_buf[0]))
|
h_buf.write("${re.flag:8x}".replace(" ","0"))
|
||||||
|
h_buf.write("\n")
|
||||||
|
re.log_func(h_buf.str())
|
||||||
}
|
}
|
||||||
|
|
||||||
for m_state != .end {
|
for m_state != .end {
|
||||||
|
@ -1279,7 +1259,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
if pc >= 0 && pc < re.prog.len {
|
if pc >= 0 && pc < re.prog.len {
|
||||||
ist = re.prog[pc].ist
|
ist = re.prog[pc].ist
|
||||||
}else if pc >= re.prog.len {
|
}else if pc >= re.prog.len {
|
||||||
C.printf("ERROR!! PC overflow!!\n")
|
//C.printf("ERROR!! PC overflow!!\n")
|
||||||
return ERR_INTERNAL_ERROR, i
|
return ERR_INTERNAL_ERROR, i
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1287,86 +1267,69 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
// DEBUG LOG
|
// DEBUG LOG
|
||||||
//******************************************
|
//******************************************
|
||||||
if re.debug>0 {
|
if re.debug>0 {
|
||||||
// use the best buffer possible
|
mut buf2 := strings.new_builder(re.cc.len+128)
|
||||||
mut tmp_len := 256
|
|
||||||
if tmp_len < re.cc.len+128 {
|
|
||||||
tmp_len = re.cc.len+128
|
|
||||||
}
|
|
||||||
|
|
||||||
// some memory buffer
|
// print all the instructions
|
||||||
buf1 := [byte(0)].repeat(tmp_len)
|
|
||||||
buf := &buf1[0]
|
|
||||||
|
|
||||||
// print all the instructions
|
|
||||||
mut buf_ptr := buf
|
|
||||||
|
|
||||||
// end of the input text
|
// end of the input text
|
||||||
if i >= in_txt_len {
|
if i >= in_txt_len {
|
||||||
C.sprintf(buf_ptr, "# %3d END OF INPUT TEXT\n",step_count)
|
buf2.write("# ${step_count:3d} END OF INPUT TEXT\n")
|
||||||
re.log_func(tos_clone(buf))
|
re.log_func(buf2.str())
|
||||||
}else{
|
}else{
|
||||||
|
|
||||||
// print only the exe istruction
|
// print only the exe istruction
|
||||||
if (re.debug == 1 && m_state == .ist_load) ||
|
if (re.debug == 1 && m_state == .ist_load) ||
|
||||||
re.debug == 2
|
re.debug == 2
|
||||||
{
|
{
|
||||||
|
|
||||||
if ist == IST_PROG_END {
|
if ist == IST_PROG_END {
|
||||||
C.sprintf(buf_ptr, "# %3d PROG_END\n",step_count)
|
buf2.write("# ${step_count:3d} PROG_END\n")
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
|
||||||
}
|
}
|
||||||
else if ist == 0 || m_state in [.start,.ist_next,.stop] {
|
else if ist == 0 || m_state in [.start,.ist_next,.stop] {
|
||||||
C.sprintf(buf_ptr, "# %3d s: %12s PC: NA\n",step_count, state_str(m_state).str)
|
buf2.write("# ${step_count:3d} s: ${state_str(m_state):12s} PC: NA\n")
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
|
||||||
}else{
|
}else{
|
||||||
ch, char_len = get_charb(in_txt,i)
|
ch, char_len = get_charb(in_txt,i)
|
||||||
|
|
||||||
tmp_bl:=[byte(ch >> 24), byte((ch >> 16) & 0xFF), byte((ch >> 8) & 0xFF), byte(ch & 0xFF), 0]
|
buf2.write("# ${step_count:3d} s: ${state_str(m_state):12s} PC: ${pc:3d}=>")
|
||||||
tmp_un_ch := byteptr(&tmp_bl[4-char_len])
|
buf2.write("${ist:8x}".replace(" ","0"))
|
||||||
|
buf2.write(" i,ch,len:[${i:3d},'${utf8_str(ch)}',${char_len}] f.m:[${first_match:3d},${state.match_index:3d}] ")
|
||||||
C.sprintf(buf_ptr, "# %3d s: %12s PC: %3d=>%08x i,ch,len:[%3d,'%s',%d] f.m:[%3d,%3d] ",
|
|
||||||
step_count, state_str(m_state).str , pc, ist, i, tmp_un_ch, char_len, first_match,state.match_index)
|
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
|
||||||
|
|
||||||
if ist & SIMPLE_CHAR_MASK == 0 {
|
if ist & SIMPLE_CHAR_MASK == 0 {
|
||||||
if char_len < 4 {
|
if char_len < 4 {
|
||||||
C.sprintf(buf_ptr, "query_ch: [%c]", ist & IST_SIMPLE_CHAR)
|
tmp_c := ist & IST_SIMPLE_CHAR
|
||||||
|
buf2.write("query_ch: [${tmp_c:1c}]")
|
||||||
} else {
|
} else {
|
||||||
C.sprintf(buf_ptr, "query_ch: [%c]", ist | SIMPLE_CHAR_MASK)
|
tmp_c := ist | IST_SIMPLE_CHAR
|
||||||
|
buf2.write("query_ch: [${tmp_c:1c}]")
|
||||||
}
|
}
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
|
||||||
} else {
|
} else {
|
||||||
if ist == IST_BSLS_CHAR {
|
if ist == IST_BSLS_CHAR {
|
||||||
C.sprintf(buf_ptr, "BSLS [\\%c]",re.prog[pc].v_ch)
|
buf2.write("BSLS [\\${re.prog[pc].v_ch:1c}]")
|
||||||
} else if ist == IST_PROG_END {
|
} else if ist == IST_PROG_END {
|
||||||
C.sprintf(buf_ptr, "PROG_END")
|
buf2.write("PROG_END")
|
||||||
} else if ist == IST_OR_BRANCH {
|
} else if ist == IST_OR_BRANCH {
|
||||||
C.sprintf(buf_ptr, "OR")
|
buf2.write("OR")
|
||||||
} else if ist == IST_CHAR_CLASS_POS {
|
} else if ist == IST_CHAR_CLASS_POS {
|
||||||
C.sprintf(buf_ptr, "CHAR_CLASS_POS[%s]",re.get_char_class(pc))
|
buf2.write("CHAR_CLASS_POS[${re.get_char_class(pc)}]")
|
||||||
} else if ist == IST_CHAR_CLASS_NEG {
|
} else if ist == IST_CHAR_CLASS_NEG {
|
||||||
C.sprintf(buf_ptr, "CHAR_CLASS_NEG[%s]",re.get_char_class(pc))
|
buf2.write("CHAR_CLASS_NEG[${re.get_char_class(pc)}]")
|
||||||
} else if ist == IST_DOT_CHAR {
|
} else if ist == IST_DOT_CHAR {
|
||||||
C.sprintf(buf_ptr, "DOT_CHAR")
|
buf2.write("DOT_CHAR")
|
||||||
} else if ist == IST_GROUP_START {
|
} else if ist == IST_GROUP_START {
|
||||||
C.sprintf(buf_ptr, "GROUP_START #:%d rep:%d ",re.prog[pc].group_id, re.prog[re.prog[pc].goto_pc].group_rep)
|
tmp_gi :=re.prog[pc].group_id
|
||||||
|
tmp_gr := re.prog[re.prog[pc].goto_pc].group_rep
|
||||||
|
buf2.write("GROUP_START #:${tmp_gi} rep:${tmp_gr} ")
|
||||||
} else if ist == IST_GROUP_END {
|
} else if ist == IST_GROUP_END {
|
||||||
C.sprintf(buf_ptr, "GROUP_END #:%d deep:%d ",re.prog[pc].group_id, group_index)
|
buf2.write("GROUP_END #:${re.prog[pc].group_id} deep:${group_index} ")
|
||||||
}
|
}
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
|
||||||
}
|
}
|
||||||
if re.prog[pc].rep_max == MAX_QUANTIFIER {
|
if re.prog[pc].rep_max == MAX_QUANTIFIER {
|
||||||
C.sprintf(buf_ptr, "{%d,MAX}:%d",re.prog[pc].rep_min,re.prog[pc].rep)
|
buf2.write("{${re.prog[pc].rep_min},MAX}:${re.prog[pc].rep}")
|
||||||
} else {
|
} else {
|
||||||
C.sprintf(buf_ptr, "{%d,%d}:%d",re.prog[pc].rep_min,re.prog[pc].rep_max,re.prog[pc].rep)
|
buf2.write("{${re.prog[pc].rep_min},${re.prog[pc].rep_max}}:${re.prog[pc].rep}")
|
||||||
}
|
}
|
||||||
buf_ptr += vstrlen(buf_ptr)
|
buf2.write(" (#${group_index})\n")
|
||||||
C.sprintf(buf_ptr, " (#%d)\n",group_index)
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
re.log_func(buf2.str())
|
||||||
re.log_func(tos_clone(buf))
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
step_count++
|
step_count++
|
||||||
|
@ -1438,7 +1401,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
re.prog[pc].reset()
|
re.prog[pc].reset()
|
||||||
// check if we are in the program bounds
|
// check if we are in the program bounds
|
||||||
if pc < 0 || pc > re.prog.len {
|
if pc < 0 || pc > re.prog.len {
|
||||||
C.printf("ERROR!! PC overflow!!\n")
|
//C.printf("ERROR!! PC overflow!!\n")
|
||||||
return ERR_INTERNAL_ERROR, i
|
return ERR_INTERNAL_ERROR, i
|
||||||
}
|
}
|
||||||
m_state = .ist_load
|
m_state = .ist_load
|
||||||
|
@ -1450,7 +1413,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
pc = pc + 1
|
pc = pc + 1
|
||||||
// check if we are in the program bounds
|
// check if we are in the program bounds
|
||||||
if pc < 0 || pc > re.prog.len {
|
if pc < 0 || pc > re.prog.len {
|
||||||
C.printf("ERROR!! PC overflow!!\n")
|
//C.printf("ERROR!! PC overflow!!\n")
|
||||||
return ERR_INTERNAL_ERROR, i
|
return ERR_INTERNAL_ERROR, i
|
||||||
}
|
}
|
||||||
m_state = .ist_load
|
m_state = .ist_load
|
||||||
|
@ -1999,3 +1962,48 @@ pub fn (re mut RE) find(in_txt string) (int,int) {
|
||||||
}
|
}
|
||||||
return NO_MATCH_FOUND, 0
|
return NO_MATCH_FOUND, 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// find_all finds all the non overlapping occurrences of the match pattern.
// It returns a flat array of indexes laid out as [start0, end0, start1, end1, ...],
// every index being relative to the beginning of `in_txt`.
// The array is empty when no occurrence is found.
// NOTE(review): the indexes look like byte offsets (the expected values in the
// find_all test data, which contains 'è', are consistent with that) - confirm.
|
||||||
|
pub fn (re mut RE) find_all(in_txt string) []int {
|
||||||
|
mut i := 0 // offset in in_txt where the next search starts
|
||||||
|
mut res := []int
|
||||||
|
mut ls := -1 // absolute start index of the last match stored in res
|
||||||
|
for i < in_txt.len {
|
||||||
|
s,e := re.find(in_txt[i..]) // s and e are relative to the slice, not to in_txt
|
||||||
|
if s >= 0 && e > s && i+s > ls { // keep only non empty matches starting after the last stored one
|
||||||
|
//println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}] ls:$ls")
|
||||||
|
res << i+s
|
||||||
|
|
||||||
|
res << i+e
|
||||||
|
|
||||||
|
ls = i+s
|
||||||
|
|
||||||
|
i = i+e // restart the search right after the end of this match
|
||||||
|
continue
|
||||||
|
} else {
|
||||||
|
i++ // nothing usable found from here: retry one position forward
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
return res
|
||||||
|
}
|
||||||
|
|
||||||
|
// replace returns a string where the matches are replaced with the replace string.
// The matches are the ones reported by find_all; when find_all reports no
// match, `in_txt` is returned as it is.
|
||||||
|
pub fn (re mut RE) replace(in_txt string, repl string) string {
|
||||||
|
pos := re.find_all(in_txt) // flat list of start/end index pairs
|
||||||
|
if pos.len > 0 {
|
||||||
|
mut res := ""
|
||||||
|
mut i := 0
|
||||||
|
|
||||||
|
mut s1 := 0 // start of the text not yet copied to res
|
||||||
|
mut e1 := in_txt.len
|
||||||
|
|
||||||
|
for i < pos.len {
|
||||||
|
e1 = pos[i] // start of the current match
|
||||||
|
res += in_txt[s1..e1] + repl // text before the match, then the replacement
|
||||||
|
s1 = pos[i+1] // end of the current match
|
||||||
|
i += 2
|
||||||
|
}
|
||||||
|
|
||||||
|
res += in_txt[s1..] // text after the last match
|
||||||
|
return res
|
||||||
|
}
|
||||||
|
return in_txt
|
||||||
|
}
|
||||||
|
|
|
@ -1,10 +1,15 @@
|
||||||
import regex
|
import regex
|
||||||
|
|
||||||
|
/******************************************************************************
|
||||||
|
*
|
||||||
|
* Test section
|
||||||
|
*
|
||||||
|
******************************************************************************/
|
||||||
struct TestItem {
|
struct TestItem {
|
||||||
src string
|
src string
|
||||||
q string
|
q string
|
||||||
s int = 0
|
s int = 0
|
||||||
e int = 0
|
e int = 0
|
||||||
}
|
}
|
||||||
|
|
||||||
const(
|
const(
|
||||||
|
@ -77,7 +82,113 @@ match_test_suite = [
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// TestItemFa is a test case for find_all
struct TestItemFa {
|
||||||
|
src string // text passed to find_all
|
||||||
|
q string // regex query compiled with regex.regex
|
||||||
|
r []int // expected array of indexes returned by find_all
|
||||||
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
match_test_suite_fa = [
|
||||||
|
|
||||||
|
// find_all tests
|
||||||
|
|
||||||
|
TestItemFa{
|
||||||
|
"oggi pippo è andato a casa di pluto ed ha trovato pippo",
|
||||||
|
r"p[iplut]+o",
|
||||||
|
[5, 10, 31, 36, 51, 56]
|
||||||
|
},
|
||||||
|
TestItemFa{
|
||||||
|
"oggi pibao è andato a casa di pbababao ed ha trovato pibabababao",
|
||||||
|
r"(pi?(ba)+o)",
|
||||||
|
[5, 10, 31, 39, 54, 65]
|
||||||
|
},
|
||||||
|
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestItemRe is a test case for replace
struct TestItemRe {
|
||||||
|
src string // text passed to replace
|
||||||
|
q string // regex query compiled with regex.regex
|
||||||
|
rep string // replace string passed to replace
|
||||||
|
r string // expected string returned by replace
|
||||||
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
match_test_suite_re = [
|
||||||
|
|
||||||
|
// replace tests
|
||||||
|
|
||||||
|
TestItemRe{
|
||||||
|
"oggi pibao è andato a casa di pbababao ed ha trovato pibabababao",
|
||||||
|
r"(pi?(ba)+o)",
|
||||||
|
"CIAO",
|
||||||
|
"oggi CIAO è andato a casa di CIAO ed ha trovato CIAO"
|
||||||
|
},
|
||||||
|
TestItemRe{
|
||||||
|
"Today is a good day and tomorrow will be for sure.",
|
||||||
|
r"[Tt]o\w+",
|
||||||
|
"CIAO",
|
||||||
|
"CIAO is a good day and CIAO will be for sure."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
fn test_regex(){
|
fn test_regex(){
|
||||||
|
// check find_all
|
||||||
|
for c,to in match_test_suite_fa{
|
||||||
|
// debug print
|
||||||
|
//println("#$c [$to.src] q[$to.q] $to.r")
|
||||||
|
|
||||||
|
mut re, re_err, err_pos := regex.regex(to.q)
|
||||||
|
if re_err == regex.COMPILE_OK {
|
||||||
|
res := re.find_all(to.src)
|
||||||
|
if res.len != to.r.len {
|
||||||
|
println("ERROR: find_all, array of different size.")
|
||||||
|
assert false
|
||||||
|
}
|
||||||
|
|
||||||
|
for c1,i in res {
|
||||||
|
if i != to.r[c1] {
|
||||||
|
println("ERROR: find_all, different indexes.")
|
||||||
|
assert false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
println("query: $to.q")
|
||||||
|
lc := "-".repeat(err_pos-1)
|
||||||
|
println("err : $lc^")
|
||||||
|
err_str := re.get_parse_error_string(re_err)
|
||||||
|
println("ERROR: $err_str")
|
||||||
|
assert false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// check replace
|
||||||
|
for c,to in match_test_suite_re{
|
||||||
|
// debug print
|
||||||
|
//println("#$c [$to.src] q[$to.q] $to.r")
|
||||||
|
|
||||||
|
mut re, re_err, err_pos := regex.regex(to.q)
|
||||||
|
if re_err == regex.COMPILE_OK {
|
||||||
|
res := re.replace(to.src,to.rep)
|
||||||
|
if res != to.r {
|
||||||
|
println("ERROR: replace.")
|
||||||
|
assert false
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
println("query: $to.q")
|
||||||
|
lc := "-".repeat(err_pos-1)
|
||||||
|
println("err : $lc^")
|
||||||
|
err_str := re.get_parse_error_string(re_err)
|
||||||
|
println("ERROR: $err_str")
|
||||||
|
assert false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// check match and find
|
||||||
for c,to in match_test_suite {
|
for c,to in match_test_suite {
|
||||||
// debug print
|
// debug print
|
||||||
//println("#$c [$to.src] q[$to.q] $to.s")
|
//println("#$c [$to.src] q[$to.q] $to.s")
|
||||||
|
@ -128,11 +239,9 @@ fn test_regex(){
|
||||||
if start != to.s || end != to.e {
|
if start != to.s || end != to.e {
|
||||||
println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
|
println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
|
||||||
println("ERROR!")
|
println("ERROR!")
|
||||||
C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
|
//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
|
||||||
assert false
|
assert false
|
||||||
break
|
break
|
||||||
} else {
|
|
||||||
assert true
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// rerun to test consistency
|
// rerun to test consistency
|
||||||
|
@ -147,7 +256,7 @@ fn test_regex(){
|
||||||
} else {
|
} else {
|
||||||
println("query: $to.q")
|
println("query: $to.q")
|
||||||
lc := "-".repeat(err_pos-1)
|
lc := "-".repeat(err_pos-1)
|
||||||
println("err : $lc")
|
println("err : $lc^")
|
||||||
err_str := re.get_parse_error_string(re_err)
|
err_str := re.get_parse_error_string(re_err)
|
||||||
println("ERROR: $err_str")
|
println("ERROR: $err_str")
|
||||||
assert false
|
assert false
|
||||||
|
@ -155,3 +264,4 @@ fn test_regex(){
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue