regex 0.9c
parent
d5f6e37c65
commit
25fabac059
|
@ -0,0 +1,411 @@
|
||||||
|
# V RegEx (Regular expression) 0.9c
|
||||||
|
|
||||||
|
[TOC]
|
||||||
|
|
||||||
|
## introduction
|
||||||
|
|
||||||
|
Write here the introduction
|
||||||
|
|
||||||
|
## Basic assumption
|
||||||
|
|
||||||
|
In this release, some assumptions were made while writing the code, and they are valid for all the features.
|
||||||
|
|
||||||
|
1. The matching stops at the end of the string, not at newline chars.
|
||||||
|
2. The basic elements of this regex engine are the tokens; in a query string a simple char is a token. The token is the atomic unit of this regex engine.
|
||||||
|
|
||||||
|
## Match positional limiter
|
||||||
|
|
||||||
|
The module supports the following features:
|
||||||
|
|
||||||
|
- `$` `^` delimiter
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
`^` (Caret.) Matches the start of the string
|
||||||
|
|
||||||
|
`$` Matches the end of the string
|
||||||
|
|
||||||
|
## Tokens
|
||||||
|
|
||||||
|
The tokens are the atomic units used by this regex engine, and a token can be one of the following:
|
||||||
|
|
||||||
|
### Simple char
|
||||||
|
|
||||||
|
this token is a simple single character like `a`.
|
||||||
|
|
||||||
|
### Char class (cc)
|
||||||
|
|
||||||
|
The cc matches all the chars specified inside it; it is delimited by square brackets `[ ]`.
|
||||||
|
|
||||||
|
the sequence of chars in the class is evaluated with an OR operation.
|
||||||
|
|
||||||
|
For example, the cc `[abc]` matches any char that is `a` or `b` or `c`, but doesn't match `C` or `z`.
|
||||||
|
|
||||||
|
Inside a cc it is possible to specify a "range" of chars, for example `[ad-f]` is equivalent to writing `[adef]`.
|
||||||
|
|
||||||
|
A cc can have several ranges at the same time, like `[a-zA-Z0-9]`, which matches all the lowercase, uppercase and numeric chars.
|
||||||
|
|
||||||
|
It is possible to negate the cc using the caret char at the start of the cc, like `[^abc]`, which matches every char that is not `a` or `b` or `c`.
|
||||||
|
|
||||||
|
A cc can contain meta-chars like: `[a-z\d]` that match all the lowercase latin chars `a-z` and all the digits `\d`.
|
||||||
|
|
||||||
|
It is possible to mix all the properties of the char class together.
|
||||||
|
|
||||||
|
### Meta-chars
|
||||||
|
|
||||||
|
A meta-char is specified by a back slash before a char like `\w` in this case the meta-char is `w`.
|
||||||
|
|
||||||
|
A meta-char can match different type of chars.
|
||||||
|
|
||||||
|
* `\w` match an alphanumeric char `[a-zA-Z0-9]`
|
||||||
|
* `\W` match a non alphanumeric char
|
||||||
|
* `\d` match a digit `[0-9]`
|
||||||
|
* `\D` match a non digit
|
||||||
|
* `\s` match a space char, one of `[' ','\t','\n','\r','\v','\f']`
|
||||||
|
* `\S` match a non space char
|
||||||
|
* `\a` match only a lowercase char `[a-z]`
|
||||||
|
* `\A` match only an uppercase char `[A-Z]`
|
||||||
|
|
||||||
|
### Quantifier
|
||||||
|
|
||||||
|
Each token can have a quantifier that specifies how many times the token can or must be matched.
|
||||||
|
|
||||||
|
**Short quantifier**
|
||||||
|
|
||||||
|
- `?` match 0 or 1 time, `a?b` match both `ab` or `b`
|
||||||
|
- `+` match at minimum 1 time, `a+` match both `aaa` or `a`
|
||||||
|
- `*` match 0 or more time, `a*b` match both `aaab` or `ab` or `b`
|
||||||
|
|
||||||
|
**Long quantifier**
|
||||||
|
|
||||||
|
- `{x}` match exactly x time, `a{2}` match `aa` but doesn't match `aaa` or `a`
|
||||||
|
- `{min,}` match at minimum min time, `a{2,}` match `aaa` or `aa` but doesn't match `a`
|
||||||
|
- `{,max}` match at least 1 and maximum max time, `a{,2}` match `a` and `aa` but doesn't match `aaa`
|
||||||
|
- `{min,max}` match from min times to max times, `a{2,3}` match `aa` and `aaa` but doesn't match `a` or `aaaa`
|
||||||
|
|
||||||
|
A long quantifier may have a `greedy` flag, which is the `?` char after the brackets: `{2,4}?` means match the minimum possible number of tokens, in this case 2.
|
||||||
|
|
||||||
|
### dot char
|
||||||
|
|
||||||
|
The dot is a particular meta-char that matches "any char"; it is simpler to explain it with an example:
|
||||||
|
|
||||||
|
Suppose we have `abccc ddeef` as the string to parse with the regex; the following table shows the query strings and the result of parsing the source string.
|
||||||
|
|
||||||
|
| query string | result |
|
||||||
|
| ------------ | ------ |
|
||||||
|
| `.*c` | `abc` |
|
||||||
|
| `.*dd` | `abccc dd` |
|
||||||
|
| `ab.*e` | `abccc dde` |
|
||||||
|
| `ab.{3} .*e` | `abccc dde` |
|
||||||
|
|
||||||
|
the dot char match any char until the next token match is satisfied.
|
||||||
|
|
||||||
|
### OR token
|
||||||
|
|
||||||
|
The token `|` is a logical OR operation between two consecutive tokens: `a|b` matches a char that is `a` or `b`.
|
||||||
|
|
||||||
|
The OR token can work in a "chained way": `a|(b)|cd ` tests first `a`; if the char is not `a` then it tests the group `(b)`, and if the group doesn't match it tests the token `c`.
|
||||||
|
|
||||||
|
**note: The OR work at token level! It doesn't work at concatenation level!**
|
||||||
|
|
||||||
|
A query string like `abc|bde` is not equal to `(abc)|(bde)`!!
|
||||||
|
|
||||||
|
The OR work only on `c|b` not at char concatenation level.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
### Groups
|
||||||
|
|
||||||
|
Groups are a method to create complex patterns with repetition of blocks of token.
|
||||||
|
|
||||||
|
The groups are delimited by round brackets `( )`; groups can be nested and can have a quantifier like all the other tokens.
|
||||||
|
|
||||||
|
`c(pa)+z` match `cpapaz` or `cpaz` or `cpapapaz` .
|
||||||
|
|
||||||
|
`(c(pa)+z ?)+` match `cpaz cpapaz cpapapaz` or `cpapaz`
|
||||||
|
|
||||||
|
Let's analyze this last case: first we have the group 0, which is the outermost pair of round brackets `(...)+`; this group has a quantifier `+` that says to match its content at least one time.
|
||||||
|
|
||||||
|
Then we have a simple char token `c` and a second group, number 1, `(pa)+`; this group tries to match the sequence `pa` at least one time, as specified by the `+` quantifier.
|
||||||
|
|
||||||
|
Then we have another simple token `z` and another simple token ` ?`, which is the space char (ASCII code 32) with the `?` quantifier, meaning this char is matched 0 or 1 times.
|
||||||
|
|
||||||
|
This explains why the `(c(pa)+z ?)+` query string can match `cpaz cpapaz cpapapaz` .
|
||||||
|
|
||||||
|
In this implementation the groups are capturing groups that means that the last result for each group can be retrieved from the `RE` struct.
|
||||||
|
|
||||||
|
The captured groups are stored as pairs of indexes in the field `groups`, which is an `[]int`: each captured group occupies two consecutive entries, its start index and its end index.
|
||||||
|
|
||||||
|
**example:**
|
||||||
|
|
||||||
|
```v
|
||||||
|
text := "cpaz cpapaz cpapapaz"
|
||||||
|
query:= r"(c(pa)+z ?)+"
|
||||||
|
re, _, _ := regex.regex(query)
|
||||||
|
|
||||||
|
println(re.get_query())
|
||||||
|
// #0(c#1(pa)+z ?)+ // #0 and #1 are the ids of the groups, are shown if re.debug is 1 or 2
|
||||||
|
|
||||||
|
start, end := re.match_string(text)
|
||||||
|
// [start=0, end=20] match => [cpaz cpapaz cpapapaz]
|
||||||
|
|
||||||
|
mut gi := 0
|
||||||
|
for gi < re.groups.len {
|
||||||
|
if re.groups[gi] >= 0 {
|
||||||
|
println("${gi/2} :[${text[re.groups[gi]..re.groups[gi+1]]}]")
|
||||||
|
}
|
||||||
|
gi += 2
|
||||||
|
}
|
||||||
|
// groups captured
|
||||||
|
// 0 :[cpapapaz]
|
||||||
|
// 1 :[pa]
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
**note:** *to show the `group id number` in the result of the `get_query()` the flag `debug` of the RE object must be `1` or `2`*
|
||||||
|
|
||||||
|
## Flags
|
||||||
|
|
||||||
|
It is possible to set some flag in the regex parser that change the behavior of the parser itself.
|
||||||
|
|
||||||
|
```v
|
||||||
|
// example of flag settings
|
||||||
|
mut re := regex.new_regex()
|
||||||
|
re.flag = regex.F_BIN
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
- `F_BIN`: parse a string as bytes, utf-8 management disabled.
|
||||||
|
|
||||||
|
- `F_EFM`: exit on the first char match in the query, used by the find function
|
||||||
|
- `F_MS`: match only if the index of the start match is 0, same as `^` at the start of query string
|
||||||
|
- `F_ME`: match only if the end index of the match is the last char of the input string, same as `$` end of query string
|
||||||
|
- `F_NL`: stop the matching if found a new line char `\n` or `\r`
|
||||||
|
|
||||||
|
## Functions
|
||||||
|
|
||||||
|
### Initializer
|
||||||
|
|
||||||
|
These functions are helpers that create the `RE` struct; the struct can also be created manually if you need it.
|
||||||
|
|
||||||
|
**Simplified initializer**
|
||||||
|
|
||||||
|
```v
|
||||||
|
// regex create a regex object from the query string and compile it
|
||||||
|
pub fn regex(in_query string) (RE,int,int)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Base initializer**
|
||||||
|
|
||||||
|
```v
|
||||||
|
// new_regex create a REgex of small size, usually sufficient for ordinary use
|
||||||
|
pub fn new_regex() RE
|
||||||
|
|
||||||
|
// new_regex_by_size create a REgex of large size, mult specify the scale factor of the memory that will be allocated
|
||||||
|
pub fn new_regex_by_size(mult int) RE
|
||||||
|
```
|
||||||
|
After using a base initializer, the regex expression must be compiled with:
|
||||||
|
```v
|
||||||
|
// compile return (return code, index) where index is the index of the error in the query string if return code is an error code
|
||||||
|
pub fn (re mut RE) compile(in_txt string) (int,int)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Functions
|
||||||
|
|
||||||
|
These are the operative functions
|
||||||
|
|
||||||
|
```v
|
||||||
|
// match_string try to match the input string, return start and end index if found else start is -1
|
||||||
|
pub fn (re mut RE) match_string(in_txt string) (int,int)
|
||||||
|
|
||||||
|
// find try to find the first match in the input string, return start and end index if found else start is -1
|
||||||
|
pub fn (re mut RE) find(in_txt string) (int,int)
|
||||||
|
|
||||||
|
// find all the non overlapping occurrences of the match pattern, return a list of start end indexes
|
||||||
|
pub fn (re mut RE) find_all(in_txt string) []int
|
||||||
|
|
||||||
|
// replace return a string where the matches are replaced with the replace string, only non overlapped match are used
|
||||||
|
pub fn (re mut RE) replace(in_txt string, repl string) string
|
||||||
|
```
|
||||||
|
|
||||||
|
## Debugging
|
||||||
|
|
||||||
|
This module has few small utilities to help the writing of regex expressions.
|
||||||
|
|
||||||
|
**Syntax errors highlight**
|
||||||
|
|
||||||
|
The following example code shows how to visualize the syntax errors in the compiling phase:
|
||||||
|
|
||||||
|
```v
|
||||||
|
query:= r"ciao da ab[ab-]" // there is an error, a range not closed
|
||||||
|
mut re := new_regex()
|
||||||
|
|
||||||
|
// re_err ==> is the return value, if < 0 it is an error
|
||||||
|
// err_pos ==> if re_err < 0, err_pos is the error index in the query string
|
||||||
|
re_err, err_pos := re.compile(query)
|
||||||
|
|
||||||
|
// print the error if one happen
|
||||||
|
if re_err != COMPILE_OK {
|
||||||
|
println("query: $query")
|
||||||
|
lc := "-".repeat(err_pos)
|
||||||
|
println("err : $lc^")
|
||||||
|
err_str := re.get_parse_error_string(re_err) // get the error string
|
||||||
|
println("ERROR: $err_str")
|
||||||
|
}
|
||||||
|
|
||||||
|
// output!!
|
||||||
|
|
||||||
|
//query: ciao da ab[ab-]
|
||||||
|
//err : ----------^
|
||||||
|
//ERROR: ERR_SYNTAX_ERROR
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
**Compiled code**
|
||||||
|
|
||||||
|
It is possible to view the compiled code by calling the function `get_code()`; the result will be something like this:
|
||||||
|
|
||||||
|
```
|
||||||
|
========================================
|
||||||
|
v RegEx compiler v 0.9c output:
|
||||||
|
PC: 0 ist: 7fffffff [a] query_ch { 1, 1}
|
||||||
|
PC: 1 ist: 7fffffff [b] query_ch { 1,MAX}
|
||||||
|
PC: 2 ist: 88000000 PROG_END { 0, 0}
|
||||||
|
========================================
|
||||||
|
```
|
||||||
|
|
||||||
|
`PC`:`int` is the program counter or step of execution, each single step is a token
|
||||||
|
|
||||||
|
`ist`:`hex` is the token instruction id
|
||||||
|
|
||||||
|
`[a]` is the char used by the token
|
||||||
|
|
||||||
|
`query_ch` is the type of token
|
||||||
|
|
||||||
|
`{m,n}` is the quantifier; the greedy flag `?` will be shown if present in the token
|
||||||
|
|
||||||
|
**Log debug**
|
||||||
|
|
||||||
|
The log debugger allows printing the status of the regex parser while the parser is running.
|
||||||
|
|
||||||
|
It is possible to have two different level of debug: 1 is normal while 2 is verbose.
|
||||||
|
|
||||||
|
here an example:
|
||||||
|
|
||||||
|
*normal*
|
||||||
|
|
||||||
|
list only the token instruction with the values
|
||||||
|
|
||||||
|
```
|
||||||
|
// re.debug = 1 // log level normal
|
||||||
|
flags: 00000000
|
||||||
|
# 2 s: ist_load PC: 0=>7fffffff i,ch,len:[ 0,'a',1] f.m:[ -1, -1] query_ch: [a]{1,1}:0 (#-1)
|
||||||
|
# 5 s: ist_load PC: 1=>7fffffff i,ch,len:[ 1,'b',1] f.m:[ 0, 0] query_ch: [b]{2,3}:0? (#-1)
|
||||||
|
# 7 s: ist_load PC: 1=>7fffffff i,ch,len:[ 2,'b',1] f.m:[ 0, 1] query_ch: [b]{2,3}:1? (#-1)
|
||||||
|
# 10 PROG_END
|
||||||
|
```
|
||||||
|
|
||||||
|
*verbose*
|
||||||
|
|
||||||
|
list all the instruction and states of the parser
|
||||||
|
|
||||||
|
```
|
||||||
|
flags: 00000000
|
||||||
|
# 0 s: start PC: NA
|
||||||
|
# 1 s: ist_next PC: NA
|
||||||
|
# 2 s: ist_load PC: 0=>7fffffff i,ch,len:[ 0,'a',1] f.m:[ -1, -1] query_ch: [a]{1,1}:0 (#-1)
|
||||||
|
# 3 s: ist_quant_p PC: 0=>7fffffff i,ch,len:[ 1,'b',1] f.m:[ 0, 0] query_ch: [a]{1,1}:1 (#-1)
|
||||||
|
# 4 s: ist_next PC: NA
|
||||||
|
# 5 s: ist_load PC: 1=>7fffffff i,ch,len:[ 1,'b',1] f.m:[ 0, 0] query_ch: [b]{2,3}:0? (#-1)
|
||||||
|
# 6 s: ist_quant_p PC: 1=>7fffffff i,ch,len:[ 2,'b',1] f.m:[ 0, 1] query_ch: [b]{2,3}:1? (#-1)
|
||||||
|
# 7 s: ist_load PC: 1=>7fffffff i,ch,len:[ 2,'b',1] f.m:[ 0, 1] query_ch: [b]{2,3}:1? (#-1)
|
||||||
|
# 8 s: ist_quant_p PC: 1=>7fffffff i,ch,len:[ 3,'b',1] f.m:[ 0, 2] query_ch: [b]{2,3}:2? (#-1)
|
||||||
|
# 9 s: ist_next PC: NA
|
||||||
|
# 10 PROG_END
|
||||||
|
# 11 PROG_END
|
||||||
|
```
|
||||||
|
|
||||||
|
the column have the following meaning:
|
||||||
|
|
||||||
|
`# 2` number of actual steps from the start of parsing
|
||||||
|
|
||||||
|
`s: ist_next` state of the present step
|
||||||
|
|
||||||
|
`PC: 1` program counter of the step
|
||||||
|
|
||||||
|
`=>7fffffff ` hex code of the instruction
|
||||||
|
|
||||||
|
`i,ch,len:[ 0,'a',1]` `i` index in the source string, `ch` the char parsed, `len` the length in byte of the char parsed
|
||||||
|
|
||||||
|
`f.m:[ 0, 1]` `f` index of the first match in the source string, `m` index that is actual matching
|
||||||
|
|
||||||
|
`query_ch: [b]` token in use and its char
|
||||||
|
|
||||||
|
`{2,3}:1?` quantifier `{min,max}`, `:1` is the actual counter of repetition, `?` is the greedy flag if present
|
||||||
|
|
||||||
|
## Example code
|
||||||
|
|
||||||
|
Here is a simple piece of code that performs some basic string matches
|
||||||
|
|
||||||
|
```v
|
||||||
|
struct TestObj {
|
||||||
|
source string // source string to parse
|
||||||
|
query string // regex query string
|
||||||
|
s int // expected match start index
|
||||||
|
e int // expected match end index
|
||||||
|
}
|
||||||
|
const (
|
||||||
|
tests = [
|
||||||
|
TestObj{"this is a good.",r"this (\w+) a",0,9},
|
||||||
|
TestObj{"this,these,those. over",r"(th[eio]se?[,. ])+",0,17},
|
||||||
|
TestObj{"test1@post.pip.com, pera",r"[\w]+@([\w]+\.)+\w+",0,18},
|
||||||
|
TestObj{"cpapaz ole. pippo,",r".*c.+ole.*pi",0,14},
|
||||||
|
TestObj{"adce aabe",r"(a(ab)+)|(a(dc)+)e",0,4},
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
fn example() {
|
||||||
|
for c,tst in tests {
|
||||||
|
mut re := regex.new_regex()
|
||||||
|
re_err, err_pos := re.compile(tst.query)
|
||||||
|
if re_err == regex.COMPILE_OK {
|
||||||
|
|
||||||
|
// print the query parsed with the groups ids
|
||||||
|
re.debug = 1 // set debug on at minimum level
|
||||||
|
println("#${c:2d} query parsed: ${re.get_query()}")
|
||||||
|
re.debug = 0
|
||||||
|
|
||||||
|
// do the match
|
||||||
|
start, end := re.match_string(tst.source)
|
||||||
|
if start >= 0 && end > start {
|
||||||
|
println("#${c:2d} found in: [$start, $end] => [${tst.source[start..end]}]")
|
||||||
|
}
|
||||||
|
|
||||||
|
// print the groups
|
||||||
|
mut gi := 0
|
||||||
|
for gi < re.groups.len {
|
||||||
|
if re.groups[gi] >= 0 {
|
||||||
|
println("group ${gi/2:2d} :[${tst.source[re.groups[gi]..re.groups[gi+1]]}]")
|
||||||
|
}
|
||||||
|
gi += 2
|
||||||
|
}
|
||||||
|
println("")
|
||||||
|
} else {
|
||||||
|
// print the compile error
|
||||||
|
println("query: $tst.query")
|
||||||
|
lc := "-".repeat(err_pos-1)
|
||||||
|
println("err : $lc^")
|
||||||
|
err_str := re.get_parse_error_string(re_err)
|
||||||
|
println("ERROR: $err_str")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
example()
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
More example code is available in the test code for the `regex` module: `vlib/regex/regex_test.v`.
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
/**********************************************************************
|
/**********************************************************************
|
||||||
*
|
*
|
||||||
* regex 0.9b
|
* regex 0.9c
|
||||||
*
|
*
|
||||||
* Copyright (c) 2019 Dario Deledda. All rights reserved.
|
* Copyright (c) 2019 Dario Deledda. All rights reserved.
|
||||||
* Use of this source code is governed by an MIT license
|
* Use of this source code is governed by an MIT license
|
||||||
|
@ -18,7 +18,7 @@ module regex
|
||||||
import strings
|
import strings
|
||||||
|
|
||||||
pub const(
|
pub const(
|
||||||
V_REGEX_VERSION = "0.9b" // regex module version
|
V_REGEX_VERSION = "0.9c" // regex module version
|
||||||
|
|
||||||
MAX_CODE_LEN = 256 // default small base code len for the regex programs
|
MAX_CODE_LEN = 256 // default small base code len for the regex programs
|
||||||
MAX_QUANTIFIER = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
|
MAX_QUANTIFIER = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
|
||||||
|
@ -47,7 +47,6 @@ const(
|
||||||
//*************************************
|
//*************************************
|
||||||
// regex program instructions
|
// regex program instructions
|
||||||
//*************************************
|
//*************************************
|
||||||
SIMPLE_CHAR_MASK = u32(0x80000000) // single char mask
|
|
||||||
IST_SIMPLE_CHAR = u32(0x7FFFFFFF) // single char instruction, 31 bit available to char
|
IST_SIMPLE_CHAR = u32(0x7FFFFFFF) // single char instruction, 31 bit available to char
|
||||||
|
|
||||||
// char class 11 0100 AA xxxxxxxx
|
// char class 11 0100 AA xxxxxxxx
|
||||||
|
@ -88,9 +87,11 @@ fn utf8util_char_len(b byte) int {
|
||||||
|
|
||||||
// get_char get a char from position i and return an u32 with the unicode code
|
// get_char get a char from position i and return an u32 with the unicode code
|
||||||
[inline]
|
[inline]
|
||||||
fn get_char(in_txt string, i int) (u32,int) {
|
fn (re RE) get_char(in_txt string, i int) (u32,int) {
|
||||||
// ascii 8 bit
|
// ascii 8 bit
|
||||||
if in_txt.str[i] & 0x80 == 0 {
|
if (re.flag & F_BIN) !=0 ||
|
||||||
|
in_txt.str[i] & 0x80 == 0
|
||||||
|
{
|
||||||
return u32(in_txt.str[i]), 1
|
return u32(in_txt.str[i]), 1
|
||||||
}
|
}
|
||||||
// unicode char
|
// unicode char
|
||||||
|
@ -106,9 +107,11 @@ fn get_char(in_txt string, i int) (u32,int) {
|
||||||
|
|
||||||
// get_charb get a char from position i and return an u32 with the unicode code
|
// get_charb get a char from position i and return an u32 with the unicode code
|
||||||
[inline]
|
[inline]
|
||||||
fn get_charb(in_txt byteptr, i int) (u32,int) {
|
fn (re RE) get_charb(in_txt byteptr, i int) (u32,int) {
|
||||||
// ascii 8 bit
|
// ascii 8 bit
|
||||||
if in_txt[i] & 0x80 == 0 {
|
if (re.flag & F_BIN) !=0 ||
|
||||||
|
in_txt[i] & 0x80 == 0
|
||||||
|
{
|
||||||
return u32(in_txt[i]), 1
|
return u32(in_txt[i]), 1
|
||||||
}
|
}
|
||||||
// unicode char
|
// unicode char
|
||||||
|
@ -215,8 +218,7 @@ fn utf8_str(ch u32) string {
|
||||||
|
|
||||||
// simple_log default log function
|
// simple_log default log function
|
||||||
fn simple_log(txt string) {
|
fn simple_log(txt string) {
|
||||||
C.fprintf(C.stdout, "%s",txt.str)
|
print(txt)
|
||||||
C.fflush(stdout)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/******************************************************************************
|
/******************************************************************************
|
||||||
|
@ -228,9 +230,14 @@ struct Token{
|
||||||
mut:
|
mut:
|
||||||
ist u32 = u32(0)
|
ist u32 = u32(0)
|
||||||
|
|
||||||
|
// char
|
||||||
|
ch u32 = u32(0)// char of the token if any
|
||||||
|
ch_len byte = byte(0) // char len
|
||||||
|
|
||||||
// Quantifiers / branch
|
// Quantifiers / branch
|
||||||
rep_min int = 0 // used also for jump next in the OR branch [no match] pc jump
|
rep_min int = 0 // used also for jump next in the OR branch [no match] pc jump
|
||||||
rep_max int = 0 // used also for jump next in the OR branch [ match] pc jump
|
rep_max int = 0 // used also for jump next in the OR branch [ match] pc jump
|
||||||
|
greedy bool = false // greedy quantifier flag
|
||||||
|
|
||||||
// Char class
|
// Char class
|
||||||
cc_index int = -1
|
cc_index int = -1
|
||||||
|
@ -240,15 +247,14 @@ mut:
|
||||||
|
|
||||||
// validator function pointer and control char
|
// validator function pointer and control char
|
||||||
validator fn (byte) bool
|
validator fn (byte) bool
|
||||||
v_ch u32 = u32(0) // debug, helper for recreate the query string
|
|
||||||
|
|
||||||
// groups variables
|
// groups variables
|
||||||
group_rep int = 0 // repetition of the group
|
group_rep int = 0 // repetition of the group
|
||||||
group_id int = -1 // id of the group
|
group_id int = -1 // id of the group
|
||||||
goto_pc int = -1 // jump to this PC if is needed
|
goto_pc int = -1 // jump to this PC if is needed
|
||||||
|
|
||||||
// OR flag for the token
|
// OR flag for the token
|
||||||
next_is_or bool = false // true if the next token is an OR
|
next_is_or bool = false // true if the next token is an OR
|
||||||
}
|
}
|
||||||
|
|
||||||
fn (tok mut Token) reset() {
|
fn (tok mut Token) reset() {
|
||||||
|
@ -262,13 +268,14 @@ fn (tok mut Token) reset() {
|
||||||
******************************************************************************/
|
******************************************************************************/
|
||||||
pub const (
|
pub const (
|
||||||
//F_FND = 0x00000001 // check until the end of the input string, it act like a "find first match", not efficient!!
|
//F_FND = 0x00000001 // check until the end of the input string, it act like a "find first match", not efficient!!
|
||||||
//F_NL = 0x00000002 // end the match when find a new line symbol
|
|
||||||
//F_PM = 0x00000004 // partial match: if the source text finish and the match is positive until then return true
|
//F_PM = 0x00000004 // partial match: if the source text finish and the match is positive until then return true
|
||||||
|
|
||||||
|
F_NL = 0x00000002 // end the match when find a new line symbol
|
||||||
F_MS = 0x00000008 // match true only if the match is at the start of the string
|
F_MS = 0x00000008 // match true only if the match is at the start of the string
|
||||||
F_ME = 0x00000010 // match true only if the match is at the end of the string
|
F_ME = 0x00000010 // match true only if the match is at the end of the string
|
||||||
|
|
||||||
F_EFM = 0x01000000 // exit on first token matched, used by search
|
F_EFM = 0x01000000 // exit on first token matched, used by search
|
||||||
|
F_BIN = 0x02000000 // work only on bytes, ignore utf-8
|
||||||
)
|
)
|
||||||
|
|
||||||
struct StateDotObj{
|
struct StateDotObj{
|
||||||
|
@ -364,7 +371,7 @@ fn (re RE) parse_bsls(in_txt string, in_i int) (int,int){
|
||||||
|
|
||||||
for i < in_txt.len {
|
for i < in_txt.len {
|
||||||
// get our char
|
// get our char
|
||||||
char_tmp,char_len := get_char(in_txt,i)
|
char_tmp,char_len := re.get_char(in_txt,i)
|
||||||
ch := byte(char_tmp)
|
ch := byte(char_tmp)
|
||||||
|
|
||||||
if status == .start && ch == `\\` {
|
if status == .start && ch == `\\` {
|
||||||
|
@ -512,7 +519,7 @@ fn (re mut RE) parse_char_class(in_txt string, in_i int) (int, int, u32) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// get our char
|
// get our char
|
||||||
char_tmp,char_len := get_char(in_txt,i)
|
char_tmp,char_len := re.get_char(in_txt,i)
|
||||||
ch := byte(char_tmp)
|
ch := byte(char_tmp)
|
||||||
|
|
||||||
//C.printf("CC #%3d ch: %c\n",i,ch)
|
//C.printf("CC #%3d ch: %c\n",i,ch)
|
||||||
|
@ -614,11 +621,13 @@ enum Quant_parse_state {
|
||||||
min_parse,
|
min_parse,
|
||||||
comma_checked,
|
comma_checked,
|
||||||
max_parse,
|
max_parse,
|
||||||
|
greedy,
|
||||||
|
gredy_parse,
|
||||||
finish
|
finish
|
||||||
}
|
}
|
||||||
|
|
||||||
// parse_quantifier return (min, max, str_len) of a {min,max} quantifier starting after the { char
|
// parse_quantifier return (min, max, str_len) of a {min,max}? quantifier starting after the { char
|
||||||
fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int) {
|
fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
|
||||||
mut status := Quant_parse_state.start
|
mut status := Quant_parse_state.start
|
||||||
mut i := in_i
|
mut i := in_i
|
||||||
|
|
||||||
|
@ -634,7 +643,7 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int) {
|
||||||
|
|
||||||
// exit on no compatible char with {} quantifier
|
// exit on no compatible char with {} quantifier
|
||||||
if utf8util_char_len(ch) != 1 {
|
if utf8util_char_len(ch) != 1 {
|
||||||
return ERR_SYNTAX_ERROR,i,0
|
return ERR_SYNTAX_ERROR,i,0,false
|
||||||
}
|
}
|
||||||
|
|
||||||
// min parsing skip if comma present
|
// min parsing skip if comma present
|
||||||
|
@ -670,13 +679,17 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int) {
|
||||||
// single value {4}
|
// single value {4}
|
||||||
if status == .min_parse && ch == `}` {
|
if status == .min_parse && ch == `}` {
|
||||||
q_max = q_min
|
q_max = q_min
|
||||||
return q_min, q_max, i-in_i+2
|
|
||||||
|
status = .greedy
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// end without max
|
// end without max
|
||||||
if status == .comma_checked && ch == `}` {
|
if status == .comma_checked && ch == `}` {
|
||||||
q_max = MAX_QUANTIFIER
|
q_max = MAX_QUANTIFIER
|
||||||
return q_min, q_max, i-in_i+2
|
|
||||||
|
status = .greedy
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// start max parsing
|
// start max parsing
|
||||||
|
@ -696,17 +709,40 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// end the parsing
|
// finished the quantifier
|
||||||
if status == .max_parse && ch == `}` {
|
if status == .max_parse && ch == `}` {
|
||||||
return q_min, q_max, i-in_i+2
|
status = .greedy
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// check if greedy flag char ? is present
|
||||||
|
if status == .greedy {
|
||||||
|
if i+1 < in_txt.len {
|
||||||
|
i++
|
||||||
|
status = .gredy_parse
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
return q_min, q_max, i-in_i+2, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// check the greedy flag
|
||||||
|
if status == .gredy_parse {
|
||||||
|
if ch == `?` {
|
||||||
|
return q_min, q_max, i-in_i+2, true
|
||||||
|
} else {
|
||||||
|
i--
|
||||||
|
return q_min, q_max, i-in_i+2, false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// not a {} quantifier, exit
|
// not a {} quantifier, exit
|
||||||
return ERR_SYNTAX_ERROR,i,0
|
return ERR_SYNTAX_ERROR, i, 0, false
|
||||||
}
|
}
|
||||||
|
|
||||||
// not a conform {} quantifier
|
// not a conform {} quantifier
|
||||||
return ERR_SYNTAX_ERROR,i,0
|
return ERR_SYNTAX_ERROR, i, 0, false
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@ -733,7 +769,7 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
|
||||||
mut char_len := 0
|
mut char_len := 0
|
||||||
//C.printf("i: %3d ch: %c\n", i, in_txt.str[i])
|
//C.printf("i: %3d ch: %c\n", i, in_txt.str[i])
|
||||||
|
|
||||||
char_tmp,char_len = get_char(in_txt,i)
|
char_tmp,char_len = re.get_char(in_txt,i)
|
||||||
|
|
||||||
//
|
//
|
||||||
// check special cases: $ ^
|
// check special cases: $ ^
|
||||||
|
@ -848,13 +884,14 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
|
||||||
}
|
}
|
||||||
|
|
||||||
`{` {
|
`{` {
|
||||||
min,max,tmp := re.parse_quantifier(in_txt, i+1)
|
min, max, tmp, greedy := re.parse_quantifier(in_txt, i+1)
|
||||||
// it is a quantifier
|
// it is a quantifier
|
||||||
if min >= 0 {
|
if min >= 0 {
|
||||||
//C.printf("{%d,%d}\n str:[%s]\n",min,max,in_txt[i..i+tmp])
|
//C.printf("{%d,%d}\n str:[%s] greedy: %d\n", min, max, in_txt[i..i+tmp], greedy)
|
||||||
i = i + tmp
|
i = i + tmp
|
||||||
re.prog[pc-1].rep_min = min
|
re.prog[pc-1].rep_min = min
|
||||||
re.prog[pc-1].rep_max = max
|
re.prog[pc-1].rep_max = max
|
||||||
|
re.prog[pc-1].greedy = greedy
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
@ -879,7 +916,7 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// IST_CHAR_CLASS
|
// IST_CHAR_CLASS_*
|
||||||
if char_len==1 && pc >= 0{
|
if char_len==1 && pc >= 0{
|
||||||
if byte(char_tmp) == `[` {
|
if byte(char_tmp) == `[` {
|
||||||
cc_index,tmp,cc_type := re.parse_char_class(in_txt, i+1)
|
cc_index,tmp,cc_type := re.parse_char_class(in_txt, i+1)
|
||||||
|
@ -912,14 +949,14 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
|
||||||
re.prog[pc].rep_min = 1
|
re.prog[pc].rep_min = 1
|
||||||
re.prog[pc].rep_max = 1
|
re.prog[pc].rep_max = 1
|
||||||
re.prog[pc].validator = BSLS_VALIDATOR_ARRAY[bsls_index].validator
|
re.prog[pc].validator = BSLS_VALIDATOR_ARRAY[bsls_index].validator
|
||||||
re.prog[pc].v_ch = BSLS_VALIDATOR_ARRAY[bsls_index].ch
|
re.prog[pc].ch = BSLS_VALIDATOR_ARRAY[bsls_index].ch
|
||||||
pc = pc + 1
|
pc = pc + 1
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
// this is an escape char, skip the bsls and continue as a normal char
|
// this is an escape char, skip the bsls and continue as a normal char
|
||||||
else if bsls_index == NO_MATCH_FOUND {
|
else if bsls_index == NO_MATCH_FOUND {
|
||||||
i += char_len
|
i += char_len
|
||||||
char_tmp,char_len = get_char(in_txt,i)
|
char_tmp,char_len = re.get_char(in_txt,i)
|
||||||
// continue as simple char
|
// continue as simple char
|
||||||
}
|
}
|
||||||
// if not an escape or a bsls char then it is an error (at least for now!)
|
// if not an escape or a bsls char then it is an error (at least for now!)
|
||||||
|
@ -930,8 +967,9 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// IST_SIMPLE_CHAR
|
// IST_SIMPLE_CHAR
|
||||||
tmp_code = (tmp_code | char_tmp) & IST_SIMPLE_CHAR
|
re.prog[pc].ist = IST_SIMPLE_CHAR
|
||||||
re.prog[pc].ist = tmp_code
|
re.prog[pc].ch = char_tmp
|
||||||
|
re.prog[pc].ch_len = char_len
|
||||||
re.prog[pc].rep_min = 1
|
re.prog[pc].rep_min = 1
|
||||||
re.prog[pc].rep_max = 1
|
re.prog[pc].rep_max = 1
|
||||||
//C.printf("char: %c\n",char_tmp)
|
//C.printf("char: %c\n",char_tmp)
|
||||||
|
@ -1044,7 +1082,7 @@ pub fn (re RE) get_code() string {
|
||||||
res.write(" ")
|
res.write(" ")
|
||||||
ist :=re.prog[pc1].ist
|
ist :=re.prog[pc1].ist
|
||||||
if ist == IST_BSLS_CHAR {
|
if ist == IST_BSLS_CHAR {
|
||||||
res.write("[\\${re.prog[pc1].v_ch:1c}] BSLS")
|
res.write("[\\${re.prog[pc1].ch:1c}] BSLS")
|
||||||
} else if ist == IST_PROG_END {
|
} else if ist == IST_PROG_END {
|
||||||
res.write("PROG_END")
|
res.write("PROG_END")
|
||||||
stop_flag = true
|
stop_flag = true
|
||||||
|
@ -1060,8 +1098,8 @@ pub fn (re RE) get_code() string {
|
||||||
res.write("( GROUP_START #:${re.prog[pc1].group_id}")
|
res.write("( GROUP_START #:${re.prog[pc1].group_id}")
|
||||||
} else if ist == IST_GROUP_END {
|
} else if ist == IST_GROUP_END {
|
||||||
res.write(") GROUP_END #:${re.prog[pc1].group_id}")
|
res.write(") GROUP_END #:${re.prog[pc1].group_id}")
|
||||||
} else if ist & SIMPLE_CHAR_MASK == 0 {
|
} else if ist == IST_SIMPLE_CHAR {
|
||||||
res.write("[${ist & IST_SIMPLE_CHAR:1c}] query_ch")
|
res.write("[${re.prog[pc1].ch:1c}] query_ch")
|
||||||
}
|
}
|
||||||
|
|
||||||
if re.prog[pc1].rep_max == MAX_QUANTIFIER {
|
if re.prog[pc1].rep_max == MAX_QUANTIFIER {
|
||||||
|
@ -1072,6 +1110,9 @@ pub fn (re RE) get_code() string {
|
||||||
} else {
|
} else {
|
||||||
res.write(" {${re.prog[pc1].rep_min:3d},${re.prog[pc1].rep_max:3d}}")
|
res.write(" {${re.prog[pc1].rep_min:3d},${re.prog[pc1].rep_max:3d}}")
|
||||||
}
|
}
|
||||||
|
if re.prog[pc1].greedy == true {
|
||||||
|
res.write("?")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
res.write("\n")
|
res.write("\n")
|
||||||
if stop_flag {
|
if stop_flag {
|
||||||
|
@ -1136,7 +1177,7 @@ pub fn (re RE) get_query() string {
|
||||||
|
|
||||||
// bsls char
|
// bsls char
|
||||||
if ch == IST_BSLS_CHAR {
|
if ch == IST_BSLS_CHAR {
|
||||||
res.write("\\${re.prog[i].v_ch:1c}")
|
res.write("\\${re.prog[i].ch:1c}")
|
||||||
}
|
}
|
||||||
|
|
||||||
// IST_DOT_CHAR
|
// IST_DOT_CHAR
|
||||||
|
@ -1145,11 +1186,11 @@ pub fn (re RE) get_query() string {
|
||||||
}
|
}
|
||||||
|
|
||||||
// char alone
|
// char alone
|
||||||
if ch & SIMPLE_CHAR_MASK == 0 {
|
if ch == IST_SIMPLE_CHAR {
|
||||||
if byte(ch) in BSLS_ESCAPE_LIST {
|
if byte(ch) in BSLS_ESCAPE_LIST {
|
||||||
res.write("\\")
|
res.write("\\")
|
||||||
}
|
}
|
||||||
res.write("${re.prog[i].ist:c}")
|
res.write("${re.prog[i].ch:c}")
|
||||||
}
|
}
|
||||||
|
|
||||||
// quantifier
|
// quantifier
|
||||||
|
@ -1166,6 +1207,9 @@ pub fn (re RE) get_query() string {
|
||||||
} else {
|
} else {
|
||||||
res.write("{${re.prog[i].rep_min},${re.prog[i].rep_max}}")
|
res.write("{${re.prog[i].rep_min},${re.prog[i].rep_max}}")
|
||||||
}
|
}
|
||||||
|
if re.prog[i].greedy == true {
|
||||||
|
res.write("?")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1187,10 +1231,11 @@ enum match_state{
|
||||||
start = 0,
|
start = 0,
|
||||||
stop,
|
stop,
|
||||||
end,
|
end,
|
||||||
|
new_line,
|
||||||
|
|
||||||
ist_load, // load and execute istruction
|
ist_load, // load and execute instruction
|
||||||
ist_next, // go to next istruction
|
ist_next, // go to next instruction
|
||||||
ist_next_ks, // go to next istruction without clenaning the state
|
ist_next_ks, // go to next instruction without cleaning the state
|
||||||
ist_quant_p, // match positive ,quantifier check
|
ist_quant_p, // match positive ,quantifier check
|
||||||
ist_quant_n, // match negative, quantifier check
|
ist_quant_n, // match negative, quantifier check
|
||||||
ist_quant_pg, // match positive ,group quantifier check
|
ist_quant_pg, // match positive ,group quantifier check
|
||||||
|
@ -1202,6 +1247,7 @@ fn state_str(s match_state) string {
|
||||||
.start { return "start" }
|
.start { return "start" }
|
||||||
.stop { return "stop" }
|
.stop { return "stop" }
|
||||||
.end { return "end" }
|
.end { return "end" }
|
||||||
|
.new_line { return "new line" }
|
||||||
|
|
||||||
.ist_load { return "ist_load" }
|
.ist_load { return "ist_load" }
|
||||||
.ist_next { return "ist_next" }
|
.ist_next { return "ist_next" }
|
||||||
|
@ -1277,7 +1323,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
re.log_func(buf2.str())
|
re.log_func(buf2.str())
|
||||||
}else{
|
}else{
|
||||||
|
|
||||||
// print only the exe istruction
|
// print only the exe instruction
|
||||||
if (re.debug == 1 && m_state == .ist_load) ||
|
if (re.debug == 1 && m_state == .ist_load) ||
|
||||||
re.debug == 2
|
re.debug == 2
|
||||||
{
|
{
|
||||||
|
@ -1287,23 +1333,17 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
else if ist == 0 || m_state in [.start,.ist_next,.stop] {
|
else if ist == 0 || m_state in [.start,.ist_next,.stop] {
|
||||||
buf2.write("# ${step_count:3d} s: ${state_str(m_state):12s} PC: NA\n")
|
buf2.write("# ${step_count:3d} s: ${state_str(m_state):12s} PC: NA\n")
|
||||||
}else{
|
}else{
|
||||||
ch, char_len = get_charb(in_txt,i)
|
ch, char_len = re.get_charb(in_txt,i)
|
||||||
|
|
||||||
buf2.write("# ${step_count:3d} s: ${state_str(m_state):12s} PC: ${pc:3d}=>")
|
buf2.write("# ${step_count:3d} s: ${state_str(m_state):12s} PC: ${pc:3d}=>")
|
||||||
buf2.write("${ist:8x}".replace(" ","0"))
|
buf2.write("${ist:8x}".replace(" ","0"))
|
||||||
buf2.write(" i,ch,len:[${i:3d},'${utf8_str(ch)}',${char_len}] f.m:[${first_match:3d},${state.match_index:3d}] ")
|
buf2.write(" i,ch,len:[${i:3d},'${utf8_str(ch)}',${char_len}] f.m:[${first_match:3d},${state.match_index:3d}] ")
|
||||||
|
|
||||||
if ist & SIMPLE_CHAR_MASK == 0 {
|
if ist == IST_SIMPLE_CHAR {
|
||||||
if char_len < 4 {
|
buf2.write("query_ch: [${re.prog[pc].ch:1c}]")
|
||||||
tmp_c := ist & IST_SIMPLE_CHAR
|
|
||||||
buf2.write("query_ch: [${tmp_c:1c}]")
|
|
||||||
} else {
|
|
||||||
tmp_c := ist | IST_SIMPLE_CHAR
|
|
||||||
buf2.write("query_ch: [${tmp_c:1c}]")
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
if ist == IST_BSLS_CHAR {
|
if ist == IST_BSLS_CHAR {
|
||||||
buf2.write("BSLS [\\${re.prog[pc].v_ch:1c}]")
|
buf2.write("BSLS [\\${re.prog[pc].ch:1c}]")
|
||||||
} else if ist == IST_PROG_END {
|
} else if ist == IST_PROG_END {
|
||||||
buf2.write("PROG_END")
|
buf2.write("PROG_END")
|
||||||
} else if ist == IST_OR_BRANCH {
|
} else if ist == IST_OR_BRANCH {
|
||||||
|
@ -1327,6 +1367,9 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
} else {
|
} else {
|
||||||
buf2.write("{${re.prog[pc].rep_min},${re.prog[pc].rep_max}}:${re.prog[pc].rep}")
|
buf2.write("{${re.prog[pc].rep_min},${re.prog[pc].rep_max}}:${re.prog[pc].rep}")
|
||||||
}
|
}
|
||||||
|
if re.prog[pc].greedy == true {
|
||||||
|
buf2.write("?")
|
||||||
|
}
|
||||||
buf2.write(" (#${group_index})\n")
|
buf2.write(" (#${group_index})\n")
|
||||||
}
|
}
|
||||||
re.log_func(buf2.str())
|
re.log_func(buf2.str())
|
||||||
|
@ -1338,7 +1381,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
//******************************************
|
//******************************************
|
||||||
|
|
||||||
// we're out of text, manage it
|
// we're out of text, manage it
|
||||||
if i >= in_txt_len {
|
if i >= in_txt_len || m_state == .new_line {
|
||||||
|
|
||||||
// manage groups
|
// manage groups
|
||||||
if group_index >= 0 && state.match_index >= 0 {
|
if group_index >= 0 && state.match_index >= 0 {
|
||||||
|
@ -1376,7 +1419,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
// manage IST_DOT_CHAR
|
// manage IST_DOT_CHAR
|
||||||
if re.state_stack_index >= 0 {
|
if re.state_stack_index >= 0 {
|
||||||
//C.printf("DOT CHAR text end management!\n")
|
//C.printf("DOT CHAR text end management!\n")
|
||||||
// if DOT CHAR is not the last istruction and we are still going, then no match!!
|
// if DOT CHAR is not the last instruction and we are still going, then no match!!
|
||||||
if pc < re.prog.len && re.prog[pc+1].ist != IST_PROG_END {
|
if pc < re.prog.len && re.prog[pc+1].ist != IST_PROG_END {
|
||||||
return NO_MATCH_FOUND,0
|
return NO_MATCH_FOUND,0
|
||||||
}
|
}
|
||||||
|
@ -1395,7 +1438,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// ist_next, next istruction reseting its state
|
// ist_next, next instruction resetting its state
|
||||||
if m_state == .ist_next {
|
if m_state == .ist_next {
|
||||||
pc = pc + 1
|
pc = pc + 1
|
||||||
re.prog[pc].reset()
|
re.prog[pc].reset()
|
||||||
|
@ -1408,7 +1451,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// ist_next_ks, next istruction keeping its state
|
// ist_next_ks, next instruction keeping its state
|
||||||
if m_state == .ist_next_ks {
|
if m_state == .ist_next_ks {
|
||||||
pc = pc + 1
|
pc = pc + 1
|
||||||
// check if we are in the program bounds
|
// check if we are in the program bounds
|
||||||
|
@ -1421,7 +1464,13 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// load the char
|
// load the char
|
||||||
ch, char_len = get_charb(in_txt,i)
|
ch, char_len = re.get_charb(in_txt,i)
|
||||||
|
|
||||||
|
// check new line if flag F_NL enabled
|
||||||
|
if (re.flag & F_NL) != 0 && char_len == 1 && byte(ch) in NEW_LINE_LIST {
|
||||||
|
m_state = .new_line
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
// check if stop
|
// check if stop
|
||||||
if m_state == .stop {
|
if m_state == .stop {
|
||||||
|
@ -1547,7 +1596,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
m_state = .ist_next
|
m_state = .ist_next
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
// IST_DOT_CHAR is the last istruction, get all
|
// IST_DOT_CHAR is the last instruction, get all
|
||||||
else {
|
else {
|
||||||
//C.printf("We are the last one!\n")
|
//C.printf("We are the last one!\n")
|
||||||
pc--
|
pc--
|
||||||
|
@ -1613,12 +1662,11 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// simple char IST
|
// simple char IST
|
||||||
else if ist & IST_SIMPLE_CHAR != 0 {
|
else if ist == IST_SIMPLE_CHAR {
|
||||||
//C.printf("IST_SIMPLE_CHAR\n")
|
//C.printf("IST_SIMPLE_CHAR\n")
|
||||||
state.match_flag = false
|
state.match_flag = false
|
||||||
|
|
||||||
if (char_len<4 && ist == ch) ||
|
if re.prog[pc].ch == ch
|
||||||
(char_len == 4 && (ist | SIMPLE_CHAR_MASK) == ch )
|
|
||||||
{
|
{
|
||||||
state.match_flag = true
|
state.match_flag = true
|
||||||
|
|
||||||
|
@ -1749,6 +1797,15 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
}
|
}
|
||||||
else if rep >= re.prog[tmp_pc].rep_min {
|
else if rep >= re.prog[tmp_pc].rep_min {
|
||||||
//C.printf("ist_quant_pg IN RANGE group_index:%d\n", group_index)
|
//C.printf("ist_quant_pg IN RANGE group_index:%d\n", group_index)
|
||||||
|
|
||||||
|
// check greedy flag, if true exit on minimum
|
||||||
|
if re.prog[tmp_pc].greedy == true {
|
||||||
|
re.prog[tmp_pc].group_rep = 0 // clear the repetitions
|
||||||
|
group_index--
|
||||||
|
m_state = .ist_next
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
pc = re.prog[tmp_pc].goto_pc - 1
|
pc = re.prog[tmp_pc].goto_pc - 1
|
||||||
group_index--
|
group_index--
|
||||||
m_state = .ist_next
|
m_state = .ist_next
|
||||||
|
@ -1832,6 +1889,13 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
|
||||||
// range ok, continue loop
|
// range ok, continue loop
|
||||||
else if rep >= re.prog[pc].rep_min && rep < re.prog[pc].rep_max {
|
else if rep >= re.prog[pc].rep_min && rep < re.prog[pc].rep_max {
|
||||||
//C.printf("ist_quant_p IN RANGE\n")
|
//C.printf("ist_quant_p IN RANGE\n")
|
||||||
|
|
||||||
|
// check greedy flag, if true exit on minimum
|
||||||
|
if re.prog[pc].greedy == true {
|
||||||
|
m_state = .ist_next
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
m_state = .ist_load
|
m_state = .ist_load
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
|
@ -65,6 +65,7 @@ match_test_suite = [
|
||||||
TestItem{"cpapaz ole. pippo,",r".*c.+ole.*pi",0,14},
|
TestItem{"cpapaz ole. pippo,",r".*c.+ole.*pi",0,14},
|
||||||
TestItem{"cpapaz ole. pipipo,",r".*c.+ole.*p([ip])+o",0,18},
|
TestItem{"cpapaz ole. pipipo,",r".*c.+ole.*p([ip])+o",0,18},
|
||||||
TestItem{"cpapaz ole. pipipo",r"^.*c.+ol?e.*p([ip])+o$",0,18},
|
TestItem{"cpapaz ole. pipipo",r"^.*c.+ol?e.*p([ip])+o$",0,18},
|
||||||
|
TestItem{"abbb",r"ab{2,3}?",0,3},
|
||||||
|
|
||||||
// negative
|
// negative
|
||||||
TestItem{"zthis ciao",r"((t[hieo]+se?)\s*)+",-1,0},
|
TestItem{"zthis ciao",r"((t[hieo]+se?)\s*)+",-1,0},
|
||||||
|
|
Loading…
Reference in New Issue