builtin: add `s.match_glob(wildcard_pattern)`

pull/12908/head
Delyan Angelov 2021-12-20 14:15:51 +02:00
parent 7c85c2ab1f
commit f81654e3a7
No known key found for this signature in database
GPG Key ID: 66886C0F12D595ED
2 changed files with 212 additions and 0 deletions

View File

@ -1753,3 +1753,112 @@ pub fn (s string) strip_margin_custom(del byte) string {
return ret.vstring_with_len(count)
}
}
// match_glob matches the string, with a Unix shell-style wildcard pattern.
// NB: wildcard patterns are NOT the same as regular expressions.
// They are much simpler, and do not support captures, alternation, etc.
// The special characters used in shell-style wildcards are:
// `*` - matches any sequence of characters, including the empty one
// `?` - matches any single character
// `[seq]` - matches any of the characters in the sequence
// `[^seq]` - matches any character that is NOT in the sequence
// Any other character in `pattern`, is matched 1:1 to the corresponding
// character in `name`, including / and \.
// You can wrap the meta-characters in brackets too, i.e. `[?]` matches `?`
// in the string, and `[*]` matches `*` in the string.
// The comparison is done byte by byte; `?` and a bracket expression each
// consume exactly one byte of the string. A bracket expression that is
// missing its closing `]` is treated as if it ended at the end of `pattern`.
// The whole string has to be matched by the whole pattern.
// Example: assert 'ABCD'.match_glob('AB*')
// Example: assert 'ABCD'.match_glob('*D')
// Example: assert 'ABCD'.match_glob('*B*')
// Example: assert !'ABCD'.match_glob('AB')
[direct_array_access]
pub fn (name string) match_glob(pattern string) bool {
	// Initial port based on https://research.swtch.com/glob.go
	// See also https://research.swtch.com/glob
	mut px := 0
	mut nx := 0
	// (next_px, next_nx) is the restart point recorded by the most recent `*`.
	// next_nx == 0 means that no `*` has been seen so far.
	mut next_px := 0
	mut next_nx := 0
	plen := pattern.len
	nlen := name.len
	for px < plen || nx < nlen {
		if px < plen {
			c := pattern[px]
			match c {
				`?` {
					// single-character wildcard
					if nx < nlen {
						px++
						nx++
						continue
					}
				}
				`*` {
					// zero-or-more-character wildcard
					// Try to match at nx.
					// If that doesn't work out, restart at nx+1 next.
					next_px = px
					next_nx = nx + 1
					px++
					continue
				}
				`[` {
					// A bracket expression must consume one byte of `name`.
					// If there is none left, or the byte is rejected by the set,
					// this is an ordinary mismatch, handled by the restart logic below.
					if nx < nlen {
						wanted_c := name[nx]
						mut is_inverted := false
						mut inner_match := false
						mut inner_idx := px + 1
						if inner_idx < plen && pattern[inner_idx] == `^` {
							is_inverted = true
							inner_idx++
						}
						// Scan the members of the set, up to the closing `]`,
						// or up to the end of the pattern for an unterminated set.
						for inner_idx < plen && pattern[inner_idx] != `]` {
							if pattern[inner_idx] == wanted_c {
								inner_match = true
							}
							inner_idx++
						}
						// `[seq]` accepts on a hit, `[^seq]` accepts on a miss.
						if inner_match != is_inverted {
							// continue right after the closing `]`
							px = inner_idx + 1
							nx++
							continue
						}
					}
				}
				else {
					// an ordinary character
					if nx < nlen && name[nx] == c {
						px++
						nx++
						continue
					}
				}
			}
		}
		if 0 < next_nx && next_nx <= nlen {
			// A mismatch, try restarting:
			px = next_px
			nx = next_nx
			continue
		}
		return false
	}
	// Matched all of `pattern` to all of `name`
	return true
}

View File

@ -0,0 +1,103 @@
import time
// test_match_glob_on_empty_string checks that the empty string is matched
// by the empty pattern, and is not matched by a literal one.
fn test_match_glob_on_empty_string() {
	empty := ''
	assert empty.match_glob('')
	assert !empty.match_glob('x')
}
// test_match_glob_on_x checks literal matching of a single `x`,
// and a leading/trailing `*` against a run of `x` characters.
fn test_match_glob_on_x() {
	single := 'x'
	assert !single.match_glob('')
	assert single.match_glob('x')
	triple := 'xxx'
	for wildcard in ['*x', 'x*'] {
		assert triple.match_glob(wildcard)
	}
}
// test_match_glob_on_abc checks literals, `*` and `?` against the string 'abc'.
fn test_match_glob_on_abc() {
	subject := 'abc'
	// patterns that have to match the whole of 'abc'
	accepted := ['*', 'abc', 'abc*', '*c', '*bc', '*abc', 'a*', 'a*c', 'ab?', 'a??', '???']
	for wildcard in accepted {
		assert subject.match_glob(wildcard)
	}
	// patterns that are too short, or that end/start with the wrong character
	rejected := ['', 'ab', '*b', 'b*', '??', '?']
	for wildcard in rejected {
		assert !subject.match_glob(wildcard)
	}
}
// test_match_glob_on_a checks the wildcards against a one character string.
fn test_match_glob_on_a() {
	subject := 'a'
	for wildcard in ['a', '?', '*', 'a*', '*a'] {
		assert subject.match_glob(wildcard)
	}
	// two `?` need two characters
	assert !subject.match_glob('??')
}
// test_match_glob_with_any_charset_patterns checks `[seq]` sets,
// with the wanted character at different positions inside the set.
fn test_match_glob_with_any_charset_patterns() {
	subject := 'axbxcxdxe'
	accepted := ['*c[xyz]d*', '*c[yxz]d*', '*c[zyx]d*', '*dx[QeW]', '*dx[QeW]*', '*bx[QcW]*']
	for wildcard in accepted {
		assert subject.match_glob(wildcard)
	}
	// '*bx[QcW]' leaves the tail of the string unmatched; 'zx' never occurs
	rejected := ['*bx[QcW]', '*zx[QeW]', '*zx[QeW]*']
	for wildcard in rejected {
		assert !subject.match_glob(wildcard)
	}
}
// test_match_glob_with_none_of_charset_patterns checks inverted `[^seq]` sets.
fn test_match_glob_with_none_of_charset_patterns() {
	subject := 'axbxcxdxe'
	// the `x` after `c` is not in the excluded set
	assert subject.match_glob('*c[^XYZ]d*')
	// the `x` after `c` is excluded, wherever it is placed in the set
	for wildcard in ['*c[^xYZ]d*', '*c[^YxZ]d*', '*c[^YZx]d*'] {
		assert !subject.match_glob(wildcard)
	}
}
// test_match_glob_with_escaped_metachars checks that `?` and `*` are
// matched literally, when they are placed inside a bracket set.
fn test_match_glob_with_escaped_metachars() {
	question_pattern := '*x[?]c*'
	assert 'axbx?cxdxe'.match_glob(question_pattern)
	assert !'axbxXcxdxe'.match_glob(question_pattern)
	// a literal `*` after `x`
	star_after_x := 'zaxbx*cxdxez'
	for wildcard in ['*x[Q*W]c*', '*x[QW*]c*', '*bx[*QW]c*'] {
		assert star_after_x.match_glob(wildcard)
	}
	// a literal `*` after `W`, at every position inside the set
	star_after_w := 'zaxbW*cxdxez'
	for wildcard in ['*W[*nmk]c*', '*W[n*mk]c*', '*W[nm*k]c*', '*W[nmk*]c*'] {
		assert star_after_w.match_glob(wildcard)
	}
}
// test_match_glob_with_complex_patterns checks patterns that
// combine several `*` and `?` wildcards with literal characters.
fn test_match_glob_with_complex_patterns() {
	subject := 'axbxcxdxe'
	assert subject.match_glob('*xdx*')
	assert !subject.match_glob('*xzx*')
	many_stars := 'a*b*c*d*e*'
	assert subject.match_glob(many_stars)
	assert 'axbxcxdxexxx'.match_glob(many_stars)
	// the same pattern, against strings that differ only in their last character
	mixed := 'a*b?c*x'
	assert 'abxbbxdbxebxczzx'.match_glob(mixed)
	assert !'abxbbxdbxebxczzy'.match_glob(mixed)
}
// test_match_glob_search_is_linear checks that patterns with many `*`
// wildcards are rejected quickly, on a long input that can not match them.
fn test_match_glob_search_is_linear() {
// NB: these are pathological cases, when matches are performed
// using the exponential recursive approach, that can take many
// seconds, even minutes, but take usually only microseconds,
// using the linear approach from https://research.swtch.com/glob
// that does not backtrack.
// The input is 500 `a` characters, while each pattern ends with a `b`,
// so no match is possible.
long_a := 'a'.repeat(500)
// The stopwatch is created once, and is not restarted between the checks,
// so the second limit applies to the time elapsed since its creation.
sw := time.new_stopwatch()
assert !long_a.match_glob('a*a*a*a*b')
assert sw.elapsed().milliseconds() < 10
assert !long_a.match_glob('a*a*a*a*a*a*a*a*a*b')
assert sw.elapsed().milliseconds() < 10
}