regex: new options returning interface to the regex module (#6062)

pull/6093/head
Maciej Obarski 2020-08-08 08:04:12 +02:00 committed by GitHub
parent 664c26ab4b
commit 2a4ef2acbd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 268 additions and 274 deletions

View File

@ -1,4 +1,4 @@
# V RegEx (Regular expression) 0.9d
# V RegEx (Regular expression) 0.9g
[TOC]
@ -137,7 +137,7 @@ The "capture groups" are store as couple of index in the field `groups` that is
```v
text := "cpaz cpapaz cpapapaz"
query:= r"(c(pa)+z ?)+"
re, _, _ := regex.regex(query)
mut re := regex.regex_opt(query) or { panic(err) }
println(re.get_query())
// #0(c#1(pa)+z ?)+ // #0 and #1 are the ids of the groups, are shown if re.debug is 1 or 2
@ -155,8 +155,6 @@ for gi < re.groups.len {
// groups captured
// 0 :[cpapapaz]
// 1 :[pa]
```
**note:** *to show the `group id number` in the result of the `get_query()` the flag `debug` of the RE object must be `1` or `2`*
@ -187,48 +185,41 @@ fn example2() {
text := "tst: 01,23,45 ,56, 78"
query:= r".*:(\s*\d+[\s,]*)+"
mut re := regex.new_regex()
mut re := new() or { panic(err) }
//re.debug = 2
re.group_csave = [-1].repeat(3*20+1) // we expect max 20 records
re_err, err_pos := re.compile(query)
if re_err == regex.COMPILE_OK {
q_str := re.get_query()
println("Query: $q_str")
start, end := re.match_string(text)
if start < 0 {
println("ERROR : ${re.get_parse_error_string(start)}, $start")
} else {
println("found in [$start, $end] => [${text[start..end]}]")
}
re.compile_opt(query) or { println(err) return }
// groups capture
mut gi := 0
for gi < re.groups.len {
if re.groups[gi] >= 0 {
println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]")
}
gi += 2
}
q_str := re.get_query()
println("Query: $q_str")
// continuous saving
gi = 0
println("num: ${re.group_csave[0]}")
for gi < re.group_csave[0] {
id := re.group_csave[1+gi*3]
st := re.group_csave[1+gi*3+1]
en := re.group_csave[1+gi*3+2]
println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]")
gi++
}
} else {
println("query: $query")
lc := "-".repeat(err_pos)
println("err : $lc^")
err_str := re.get_parse_error_string(re_err)
println("ERROR: $err_str")
}
start, end := re.match_string(text)
if start < 0 {
println("ERROR : ${re.get_parse_error_string(start)}, $start")
} else {
println("found in [$start, $end] => [${text[start..end]}]")
}
// groups capture
mut gi := 0
for gi < re.groups.len {
if re.groups[gi] >= 0 {
println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]")
}
gi += 2
}
// continuous saving
gi = 0
println("num: ${re.group_csave[0]}")
for gi < re.group_csave[0] {
id := re.group_csave[1+gi*3]
st := re.group_csave[1+gi*3+1]
en := re.group_csave[1+gi*3+2]
println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]")
gi++
}
}
```
@ -261,73 +252,65 @@ Have a look at the example for the use of them.
example:
```v
import regex
fn main() {
test_regex()
text := "http://www.ciao.mondo/hello/pippo12_/pera.html"
query:= r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+"
mut re := new_regex()
mut re := new()
re.debug = 2
// must provide an array of the right size if want the continuos saving of the groups
re.group_csave = [-1].repeat(3*20+1)
re_err, err_pos := re.compile(query)
if re_err == COMPILE_OK {
q_str := re.get_query()
println("O.Query: $query")
println("Query : $q_str")
re.debug = 0
start, end := re.match_string(text)
if start < 0 {
err_str := re.get_parse_error_string(start)
println("ERROR : $err_str, $start")
} else {
text1 := text[start..end]
println("found in [$start, $end] => [$text1]")
}
re.compile_opt(query) or { println(err) return }
// groups
mut gi := 0
for gi < re.groups.len {
if re.groups[gi] >= 0 {
println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]")
}
gi += 2
}
// continuous saving
gi = 0
println("num of group item saved: ${re.group_csave[0]}")
for gi < re.group_csave[0] {
id := re.group_csave[1+gi*3]
st := re.group_csave[1+gi*3+1]
en := re.group_csave[1+gi*3+2]
println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]")
gi++
}
println("raw array: ${re.group_csave[0..gi*3+2-1]}")
q_str := re.get_query()
println("O.Query: $query")
println("Query : $q_str")
re.debug = 0
start, end := re.match_string(text)
if start < 0 {
err_str := re.get_parse_error_string(start)
println("ERROR : $err_str, $start")
} else {
text1 := text[start..end]
println("found in [$start, $end] => [$text1]")
}
// named capturing groups
println("named capturing groups:")
for g_name in re.group_map.keys() {
s,e := re.get_group(g_name)
if s >= 0 && e > s {
println("'${g_name}':[$s, $e] => '${text[s..e]}'")
} else {
println("Group [${g_name}] doesn't exist.")
}
}
} else {
println("query: $query")
lc := "-".repeat(err_pos)
println("err : $lc^")
err_str := re.get_parse_error_string(re_err)
println("ERROR: $err_str")
}
// groups
mut gi := 0
for gi < re.groups.len {
if re.groups[gi] >= 0 {
println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]")
}
gi += 2
}
// continuous saving
gi = 0
println("num of group item saved: ${re.group_csave[0]}")
for gi < re.group_csave[0] {
id := re.group_csave[1+gi*3]
st := re.group_csave[1+gi*3+1]
en := re.group_csave[1+gi*3+2]
println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]")
gi++
}
println("raw array: ${re.group_csave[0..gi*3+2-1]}")
// named capturing groups
println("named capturing groups:")
for g_name in re.group_map.keys() {
s,e := re.get_group(g_name)
if s >= 0 && e > s {
println("'${g_name}':[$s, $e] => '${text[s..e]}'")
} else {
println("Group [${g_name}] doesn't exist.")
}
}
}
```
@ -360,7 +343,7 @@ It is possible to set some flags in the regex parser that change the behavior of
```v
// example of flag settings
mut re := regex.new_regex()
mut re := regex.new()
re.flag = regex.F_BIN
```
@ -382,22 +365,22 @@ These functions are helper that create the `RE` struct, a `RE` struct can be cre
```v
// regex create a regex object from the query string and compile it
pub fn regex(in_query string) (RE,int,int)
pub fn regex_opt(in_query string) ?RE
```
#### **Base initializer**
```v
// new_regex create a REgex of small size, usually sufficient for ordinary use
pub fn new_regex() RE
pub fn new() RE
// new_regex_by_size create a REgex of large size, mult specify the scale factor of the memory that will be allocated
pub fn new_regex_by_size(mult int) RE
pub fn new_by_size(mult int) RE
```
After a base initializer is used, the regex expression must be compiled with:
```v
// compile return (return code, index) where index is the index of the error in the query string if return code is an error code
pub fn (re mut RE) compile(in_txt string) (int,int)
// compile compiles the REgex returning an error if the compilation fails
pub fn (re mut RE) compile_opt(in_txt string) ?
```
### Operative Functions
@ -428,20 +411,9 @@ the following example code show how to visualize the syntax errors in the compil
```v
query:= r"ciao da ab[ab-]" // there is an error, a range not closed!!
mut re := new_regex()
mut re := new()
// re_err ==> is the return value, if < 0 it is an error
// re_pos ==> if re_err < 0, re_pos is the error index in the query string
re_err, err_pos := re.compile(query)
// print the error if one happen
if re_err != COMPILE_OK {
println("query: $query")
lc := "-".repeat(err_pos)
println("err : $lc^")
err_str := re.get_parse_error_string(re_err) // get the error string
println("ERROR: $err_str")
}
re.compile_opt(query) or { println(err) }
// output!!
@ -543,7 +515,7 @@ fn custom_print(txt string) {
println("my log: $txt")
}
mut re := new_regex()
mut re := new()
re.log_func = custom_print // every debug output from now will call this function
```
@ -571,38 +543,29 @@ tests = [
fn example() {
for c,tst in tests {
mut re := regex.new_regex()
re_err, err_pos := re.compile(tst.query)
if re_err == regex.COMPILE_OK {
mut re := regex.new()
re.compile_opt(tst.query) or { println(err) continue }
// print the query parsed with the groups ids
re.debug = 1 // set debug on at minimum level
println("#${c:2d} query parsed: ${re.get_query()}")
re.debug = 0
// do the match
start, end := re.match_string(tst.source)
if start >= 0 && end > start {
println("#${c:2d} found in: [$start, $end] => [${tst.source[start..end]}]")
}
// print the groups
mut gi := 0
for gi < re.groups.len {
if re.groups[gi] >= 0 {
println("group ${gi/2:2d} :[${tst.source[re.groups[gi]..re.groups[gi+1]]}]")
}
gi += 2
}
println("")
} else {
// print the compile error
println("query: $tst.query")
lc := "-".repeat(err_pos-1)
println("err : $lc^")
err_str := re.get_parse_error_string(re_err)
println("ERROR: $err_str")
}
// print the query parsed with the groups ids
re.debug = 1 // set debug on at minimum level
println("#${c:2d} query parsed: ${re.get_query()}")
re.debug = 0
// do the match
start, end := re.match_string(tst.source)
if start >= 0 && end > start {
println("#${c:2d} found in: [$start, $end] => [${tst.source[start..end]}]")
}
// print the groups
mut gi := 0
for gi < re.groups.len {
if re.groups[gi] >= 0 {
println("group ${gi/2:2d} :[${tst.source[re.groups[gi]..re.groups[gi+1]]}]")
}
gi += 2
}
println("")
}
}

View File

@ -1,6 +1,6 @@
/*
regex 0.9e
regex 0.9g
Copyright (c) 2019-2020 Dario Deledda. All rights reserved.
Use of this source code is governed by an MIT license
@ -19,7 +19,7 @@ module regex
import strings
pub const(
v_regex_version = "0.9e" // regex module version
v_regex_version = "0.9g" // regex module version
max_code_len = 256 // default small base code len for the regex programs
max_quantifier = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
@ -912,7 +912,12 @@ fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
// main compiler
//
// compile return (return code, index) where index is the index of the error in the query string if return code is an error code
[deprecated]
pub fn (mut re RE) compile(in_txt string) (int,int) {
return re.impl_compile(in_txt)
}
fn (mut re RE) impl_compile(in_txt string) (int,int) {
mut i := 0 // input string index
mut pc := 0 // program counter
mut tmp_code := u32(0)
@ -2187,6 +2192,7 @@ Public functions
//
// regex create a regex object from the query string
[deprecated]
pub fn regex(in_query string) (RE,int,int){
mut re := RE{}
re.prog = [Token{}].repeat(in_query.len+1)
@ -2198,12 +2204,17 @@ pub fn regex(in_query string) (RE,int,int){
}
// new_regex create a RE of small size, usually sufficient for ordinary use
[deprecated]
pub fn new_regex() RE {
return new_regex_by_size(1)
return impl_new_regex_by_size(1)
}
// new_regex_by_size create a RE of large size, mult specify the scale factor of the memory that will be allocated
[deprecated]
pub fn new_regex_by_size(mult int) RE {
return impl_new_regex_by_size(mult)
}
fn impl_new_regex_by_size(mult int) RE {
mut re := RE{}
re.prog = [Token{}].repeat(max_code_len*mult) // max program length, default 256 istructions
re.cc = [CharClass{}].repeat(max_code_len*mult) // char class list

View File

@ -0,0 +1,34 @@
module regex
import strings
// compile_opt compile RE pattern string
pub fn (mut re RE) compile_opt(pattern string) ? {
re_err,err_pos := re.impl_compile(pattern)
if re_err != compile_ok {
mut err_msg := strings.new_builder(300)
err_msg.write("query: $pattern\n")
line := "-".repeat(err_pos)
err_msg.write("err : ${line}^\n")
err_str := re.get_parse_error_string(re_err)
err_msg.write("ERROR: $err_str\n")
return error_with_code(err_msg.str(), re_err)
}
}
// new_regex create a RE of small size, usually sufficient for ordinary use
pub fn new() RE {
return impl_new_regex_by_size(1)
}
// new_regex_by_size create a RE of large size, mult specify the scale factor of the memory that will be allocated
pub fn new_by_size(mult int) RE {
return impl_new_regex_by_size(mult)
}
// regex_opt create new RE object from RE pattern string
pub fn regex_opt(pattern string) ?RE {
mut re := new()
re.compile_opt(pattern)?
return re
}

View File

@ -175,48 +175,52 @@ fn test_regex(){
// debug print
//println("#$c [$to.src] q[$to.q] ($to.s, $to.e)")
mut re, re_err, _ := regex.regex(to.q)
mut re := regex.regex_opt(to.q) or {
eprintln('err: $err')
assert false
continue
}
re.group_csave = [-1].repeat(3*20+1)
if re_err == regex.compile_ok {
start, end := re.match_string(to.src)
start, end := re.match_string(to.src)
mut tmp_str := ""
if start >= 0 && end > start{
tmp_str = to.src[start..end]
}
mut tmp_str := ""
if start >= 0 && end > start{
tmp_str = to.src[start..end]
}
if start != to.s || end != to.e {
println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
println("ERROR!")
//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
assert false
break
}
if start != to.s || end != to.e {
println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
println("ERROR!")
//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
assert false
continue
}
// check cgroups
if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] {
println("Capturing group len error!")
// check cgroups
if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] {
println("Capturing group len error!")
assert false
continue
}
// check captured groups
mut ln := re.group_csave[0]*3
for ln > 0 {
if re.group_csave[ln] != to.cg[ln] {
assert false
}
ln--
}
// check captured groups
mut ln := re.group_csave[0]*3
for ln > 0 {
if re.group_csave[ln] != to.cg[ln] {
assert false
}
ln--
// check named captured groups
for k in to.cgn.keys() {
if to.cgn[k] != (re.group_map[k]-1) { // we have -1 because the map not found is 0, in groups we start from 0 and we store using +1
println("Named capturing group error! [$k]")
assert false
continue
}
// check named captured groups
for k in to.cgn.keys() {
if to.cgn[k] != (re.group_map[k]-1) { // we have -1 because the map not found is 0, in groups we start from 0 and we store using +1
println("Named capturing group error! [$k]")
assert false
}
}
}
}
@ -225,29 +229,27 @@ fn test_regex(){
// debug print
//println("#$c [$to.src] q[$to.q] $to.r")
mut re, re_err, err_pos := regex.regex(to.q)
if re_err == regex.compile_ok {
res := re.find_all(to.src)
if res.len != to.r.len {
println("ERROR: find_all, array of different size.")
assert false
}
for c1,i in res {
if i != to.r[c1] {
println("ERROR: find_all, different indexes.")
assert false
}
}
} else {
println("query: $to.q")
lc := "-".repeat(err_pos-1)
println("err : $lc^")
err_str := re.get_parse_error_string(re_err)
println("ERROR: $err_str")
mut re := regex.regex_opt(to.q) or {
eprintln('err: $err')
assert false
continue
}
res := re.find_all(to.src)
if res.len != to.r.len {
println("ERROR: find_all, array of different size.")
assert false
continue
}
for c1,i in res {
if i != to.r[c1] {
println("ERROR: find_all, different indexes.")
assert false
continue
}
}
}
// check replace
@ -255,97 +257,81 @@ fn test_regex(){
// debug print
//println("#$c [$to.src] q[$to.q] $to.r")
mut re, re_err, err_pos := regex.regex(to.q)
if re_err == regex.compile_ok {
res := re.replace(to.src,to.rep)
if res != to.r {
println("ERROR: replace.")
assert false
}
} else {
println("query: $to.q")
lc := "-".repeat(err_pos-1)
println("err : $lc^")
err_str := re.get_parse_error_string(re_err)
println("ERROR: $err_str")
mut re := regex.regex_opt(to.q) or {
eprintln('err: $err')
assert false
continue
}
res := re.replace(to.src,to.rep)
if res != to.r {
println("ERROR: replace.")
assert false
continue
}
}
// check match and find
for c,to in match_test_suite {
// debug print
//println("#$c [$to.src] q[$to.q] $to.s")
println("#$c [$to.src] q[$to.q] $to.s $to.e")
// test the find
if to.s > 0 {
mut re, re_err, err_pos := regex.regex(to.q)
if re_err == regex.compile_ok {
//q_str := re.get_query()
//println("Query: $q_str")
start,end := re.find(to.src)
if start != to.s || end != to.e {
err_str := re.get_parse_error_string(start)
println("ERROR : $err_str")
assert false
} else {
//tmp_str := text[start..end]
//println("found in [$start, $end] => [$tmp_str]")
assert true
}
} else {
println("query: $to.q")
lc := "-".repeat(err_pos-1)
println("err : $lc^")
err_str := re.get_parse_error_string(re_err)
println("ERROR: $err_str")
mut re := regex.regex_opt(to.q) or {
eprintln('err: $err')
assert false
continue
}
// q_str := re.get_query()
// println("Query: $q_str")
start,end := re.find(to.src)
if start != to.s || end != to.e {
err_str := re.get_parse_error_string(start)
println("ERROR : $err_str")
assert false
} else {
//tmp_str := text[start..end]
//println("found in [$start, $end] => [$tmp_str]")
assert true
}
continue
}
// test the match
mut re := regex.new_regex()
mut re := regex.new()
//re.debug = true
re_err,err_pos := re.compile(to.q)
if re_err == regex.compile_ok {
//println("#$c [$to.src] q[$to.q]")
start, end := re.match_string(to.src)
mut tmp_str := ""
if start >= 0 && end > start{
tmp_str = to.src[start..end]
}
if start != to.s || end != to.e {
println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
println("ERROR!")
//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
assert false
break
}
// rerun to test consistency
tmp_str1 := to.src.clone()
start1, end1 := re.match_string(tmp_str1)
if start1 != start || end1 != end {
println("two run ERROR!!")
assert false
break
}
} else {
println("query: $to.q")
lc := "-".repeat(err_pos-1)
println("err : $lc^")
err_str := re.get_parse_error_string(re_err)
println("ERROR: $err_str")
re.compile_opt(to.q) or {
eprintln('err: $err')
assert false
break
continue
}
//println("#$c [$to.src] q[$to.q]")
start, end := re.match_string(to.src)
mut tmp_str := ""
if start >= 0 && end > start{
tmp_str = to.src[start..end]
}
if start != to.s || end != to.e {
println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
println("ERROR!")
//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
assert false
continue
}
// rerun to test consistency
tmp_str1 := to.src.clone()
start1, end1 := re.match_string(tmp_str1)
if start1 != start || end1 != end {
println("two run ERROR!!")
assert false
continue
}
}
}