forked from Elara6331/pcre
Implement Callouts
This commit is contained in:
parent
daf2d2e697
commit
23f1d484df
135
pcre.go
135
pcre.go
@ -27,7 +27,11 @@ type Regexp struct {
|
|||||||
mtx *sync.Mutex
|
mtx *sync.Mutex
|
||||||
expr string
|
expr string
|
||||||
re uintptr
|
re uintptr
|
||||||
|
mctx uintptr
|
||||||
tls *libc.TLS
|
tls *libc.TLS
|
||||||
|
|
||||||
|
calloutMtx *sync.Mutex
|
||||||
|
callout *func(tls *libc.TLS, cbptr, data uintptr) int32
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compile runs CompileOpts with no options.
|
// Compile runs CompileOpts with no options.
|
||||||
@ -73,10 +77,12 @@ func CompileOpts(pattern string, options CompileOption) (*Regexp, error) {
|
|||||||
|
|
||||||
// Create regexp instance
|
// Create regexp instance
|
||||||
regex := Regexp{
|
regex := Regexp{
|
||||||
expr: pattern,
|
expr: pattern,
|
||||||
mtx: &sync.Mutex{},
|
mtx: &sync.Mutex{},
|
||||||
re: r,
|
re: r,
|
||||||
tls: tls,
|
mctx: lib.Xpcre2_match_context_create_8(tls, 0),
|
||||||
|
tls: tls,
|
||||||
|
calloutMtx: &sync.Mutex{},
|
||||||
}
|
}
|
||||||
|
|
||||||
// Make sure resources are freed if GC collects the
|
// Make sure resources are freed if GC collects the
|
||||||
@ -298,7 +304,7 @@ func (r *Regexp) FindStringIndex(s string) []int {
|
|||||||
// FinAllString is the String version of FindAll
|
// FinAllString is the String version of FindAll
|
||||||
func (r *Regexp) FindAllString(s string, n int) []string {
|
func (r *Regexp) FindAllString(s string, n int) []string {
|
||||||
matches := r.FindAll([]byte(s), n)
|
matches := r.FindAll([]byte(s), n)
|
||||||
|
|
||||||
out := make([]string, len(matches))
|
out := make([]string, len(matches))
|
||||||
for index, match := range matches {
|
for index, match := range matches {
|
||||||
out[index] = string(match)
|
out[index] = string(match)
|
||||||
@ -483,9 +489,12 @@ func (r *Regexp) ReplaceAllLiteralString(src, repl string) string {
|
|||||||
// between those expression matches.
|
// between those expression matches.
|
||||||
//
|
//
|
||||||
// Example:
|
// Example:
|
||||||
|
//
|
||||||
// s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5)
|
// s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5)
|
||||||
// // s: ["", "b", "b", "c", "cadaaae"]
|
// // s: ["", "b", "b", "c", "cadaaae"]
|
||||||
|
//
|
||||||
// The count determines the number of substrings to return:
|
// The count determines the number of substrings to return:
|
||||||
|
//
|
||||||
// n > 0: at most n substrings; the last substring will be the unsplit remainder.
|
// n > 0: at most n substrings; the last substring will be the unsplit remainder.
|
||||||
// n == 0: the result is nil (zero substrings)
|
// n == 0: the result is nil (zero substrings)
|
||||||
// n < 0: all substrings
|
// n < 0: all substrings
|
||||||
@ -556,6 +565,116 @@ func (r *Regexp) SubexpIndex(name string) int {
|
|||||||
return int(ret)
|
return int(ret)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type CalloutFlags uint32
|
||||||
|
|
||||||
|
const (
|
||||||
|
CalloutStartMatch = CalloutFlags(lib.DPCRE2_CALLOUT_STARTMATCH)
|
||||||
|
CalloutBacktrack = CalloutFlags(lib.DPCRE2_CALLOUT_BACKTRACK)
|
||||||
|
)
|
||||||
|
|
||||||
|
type CalloutBlock struct {
|
||||||
|
// Version contains the version number of the block format.
|
||||||
|
// The current version is 2.
|
||||||
|
Version uint32
|
||||||
|
|
||||||
|
// CalloutNumber contains the number of the callout, in the range 0-255.
|
||||||
|
// This is the number that follows "?C". For callouts with string arguments,
|
||||||
|
// this will always be zero.
|
||||||
|
CalloutNumber uint32
|
||||||
|
|
||||||
|
// CaptureTop contains the number of the highest numbered substring
|
||||||
|
// captured so far plus one. If no substrings have yet been captured,
|
||||||
|
// CaptureTop will be set to 1.
|
||||||
|
CaptureTop uint32
|
||||||
|
|
||||||
|
// CaptureLast contains the number of the last substring that was captured.
|
||||||
|
CaptureLast uint32
|
||||||
|
|
||||||
|
// Substrings contains all of the substrings captured so far.
|
||||||
|
Substrings []string
|
||||||
|
|
||||||
|
Mark string
|
||||||
|
|
||||||
|
// Subject contains the string passed to the match function.
|
||||||
|
Subject string
|
||||||
|
|
||||||
|
// StartMatch contains the offset within the subject at which the current match attempt started.
|
||||||
|
StartMatch uint
|
||||||
|
|
||||||
|
// CurrentPosition contains the offset of the current match pointer within the subject.
|
||||||
|
CurrentPosition uint
|
||||||
|
|
||||||
|
// PatternPosition contains the offset within the pattern string to the next item to be matched.
|
||||||
|
PatternPosition uint
|
||||||
|
|
||||||
|
// NextItemLength contains the length of the next item to be processed in the pattern string.
|
||||||
|
NextItemLength uint
|
||||||
|
|
||||||
|
// CalloutStringOffset contains the code unit offset to the start of the callout argument string within the original pattern string.
|
||||||
|
CalloutStringOffset uint
|
||||||
|
|
||||||
|
// CalloutString is the string for the callout. For numerical callouts, this will always be empty.
|
||||||
|
CalloutString string
|
||||||
|
|
||||||
|
// CalloutFlags contains the following flags:
|
||||||
|
// CalloutStartMatch
|
||||||
|
// This is set for the first callout after the start of matching for each new starting position in the subject.
|
||||||
|
// CalloutBacktrack
|
||||||
|
// This is set if there has been a matching backtrack since the previous callout, or since the start of matching if this is the first callout from a pcre2_match() run.
|
||||||
|
//
|
||||||
|
// Both bits are set when a backtrack has caused a "bumpalong" to a new starting position in the subject. Output
|
||||||
|
CalloutFlags CalloutFlags
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *Regexp) SetCallout(fn func(cb *CalloutBlock) int32) error {
|
||||||
|
cfn := func(tls *libc.TLS, cbptr, data uintptr) int32 {
|
||||||
|
ccb := (*lib.Tpcre2_callout_block_8)(unsafe.Pointer(cbptr))
|
||||||
|
|
||||||
|
cb := &CalloutBlock{
|
||||||
|
Version: ccb.Fversion,
|
||||||
|
CalloutNumber: ccb.Fcallout_number,
|
||||||
|
CaptureTop: ccb.Fcapture_top,
|
||||||
|
CaptureLast: ccb.Fcapture_last,
|
||||||
|
Mark: libc.GoString(ccb.Fmark),
|
||||||
|
StartMatch: uint(ccb.Fstart_match),
|
||||||
|
CurrentPosition: uint(ccb.Fcurrent_position),
|
||||||
|
PatternPosition: uint(ccb.Fpattern_position),
|
||||||
|
NextItemLength: uint(ccb.Fnext_item_length),
|
||||||
|
CalloutStringOffset: uint(ccb.Fcallout_string_offset),
|
||||||
|
CalloutFlags: CalloutFlags(ccb.Fcallout_flags),
|
||||||
|
}
|
||||||
|
|
||||||
|
subjectBytes := unsafe.Slice((*byte)(unsafe.Pointer(ccb.Fsubject)), ccb.Fsubject_length)
|
||||||
|
cb.Subject = string(subjectBytes)
|
||||||
|
|
||||||
|
calloutStrBytes := unsafe.Slice((*byte)(unsafe.Pointer(ccb.Fcallout_string)), ccb.Fcallout_string_length)
|
||||||
|
cb.CalloutString = string(calloutStrBytes)
|
||||||
|
|
||||||
|
ovecSlice := unsafe.Slice((*lib.Tsize_t)(unsafe.Pointer(ccb.Foffset_vector)), (ccb.Fcapture_top*2)-1)[2:]
|
||||||
|
for i := 0; i < len(ovecSlice); i += 2 {
|
||||||
|
if i+1 >= len(ovecSlice) {
|
||||||
|
cb.Substrings = append(cb.Substrings, cb.Subject[ovecSlice[i]:])
|
||||||
|
} else {
|
||||||
|
cb.Substrings = append(cb.Substrings, cb.Subject[ovecSlice[i]:ovecSlice[i+1]])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
x := fn(cb)
|
||||||
|
return x
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prevent callout functions from being GC'd
|
||||||
|
r.calloutMtx.Lock()
|
||||||
|
defer r.calloutMtx.Unlock()
|
||||||
|
r.callout = &cfn
|
||||||
|
|
||||||
|
ret := lib.Xpcre2_set_callout_8(r.tls, r.mctx, *(*uintptr)(unsafe.Pointer(&cfn)), 0)
|
||||||
|
if ret < 0 {
|
||||||
|
return codeToError(r.tls, ret)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// replaceBytes replaces the bytes at a given location, and returns a new
|
// replaceBytes replaces the bytes at a given location, and returns a new
|
||||||
// offset, based on how much bigger or smaller the slice got after replacement
|
// offset, based on how much bigger or smaller the slice got after replacement
|
||||||
func replaceBytes(src, repl []byte, sOff, eOff lib.Tsize_t, diff int64) (int64, []byte) {
|
func replaceBytes(src, repl []byte, sOff, eOff lib.Tsize_t, diff int64) (int64, []byte) {
|
||||||
@ -577,7 +696,7 @@ func (r *Regexp) match(b []byte, options uint32, multi bool) ([][]lib.Tsize_t, e
|
|||||||
if len(b) == 0 {
|
if len(b) == 0 {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
r.mtx.Lock()
|
r.mtx.Lock()
|
||||||
defer r.mtx.Unlock()
|
defer r.mtx.Unlock()
|
||||||
|
|
||||||
@ -600,7 +719,7 @@ func (r *Regexp) match(b []byte, options uint32, multi bool) ([][]lib.Tsize_t, e
|
|||||||
// While the offset is less than the length of the subject
|
// While the offset is less than the length of the subject
|
||||||
for offset < cSubjectLen {
|
for offset < cSubjectLen {
|
||||||
// Execute expression on subject
|
// Execute expression on subject
|
||||||
ret := lib.Xpcre2_match_8(r.tls, r.re, cSubject, cSubjectLen, offset, options, md, 0)
|
ret := lib.Xpcre2_match_8(r.tls, r.re, cSubject, cSubjectLen, offset, options, md, r.mctx)
|
||||||
if ret < 0 {
|
if ret < 0 {
|
||||||
// If no match found, break
|
// If no match found, break
|
||||||
if ret == lib.DPCRE2_ERROR_NOMATCH {
|
if ret == lib.DPCRE2_ERROR_NOMATCH {
|
||||||
@ -670,6 +789,8 @@ func (r *Regexp) Close() error {
|
|||||||
|
|
||||||
// Free the compiled code
|
// Free the compiled code
|
||||||
lib.Xpcre2_code_free_8(r.tls, r.re)
|
lib.Xpcre2_code_free_8(r.tls, r.re)
|
||||||
|
// Free the match context
|
||||||
|
lib.Xpcre2_match_context_free_8(r.tls, r.mctx)
|
||||||
// Set regular expression to null
|
// Set regular expression to null
|
||||||
r.re = 0
|
r.re = 0
|
||||||
|
|
||||||
|
55
pcre_test.go
55
pcre_test.go
@ -234,3 +234,58 @@ func TestString(t *testing.T) {
|
|||||||
t.Errorf("expected %s, got %s", expr, r.String())
|
t.Errorf("expected %s, got %s", expr, r.String())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestCallout(t *testing.T) {
|
||||||
|
const expr = `(https?)://([.\w\d]+\.[\w\d]{2,4}[\w\d?&=%/.-]*)(?C2)`
|
||||||
|
subject := "https://www.elara.ws/"
|
||||||
|
|
||||||
|
r := pcre.MustCompile(expr)
|
||||||
|
defer r.Close()
|
||||||
|
|
||||||
|
executed := false
|
||||||
|
r.SetCallout(func(cb *pcre.CalloutBlock) int32 {
|
||||||
|
executed = true
|
||||||
|
|
||||||
|
if cb.CalloutNumber != 2 {
|
||||||
|
t.Errorf("[CalloutNumber] expected %d, got %d", 2, cb.CalloutNumber)
|
||||||
|
}
|
||||||
|
|
||||||
|
if cb.CaptureTop != 3 {
|
||||||
|
t.Errorf("[CaptureTop] expected %d, got %d", 3, cb.CaptureTop)
|
||||||
|
}
|
||||||
|
|
||||||
|
if cb.CaptureLast != 2 {
|
||||||
|
t.Errorf("[CaptureLast] expected %d, got %d", 2, cb.CaptureLast)
|
||||||
|
}
|
||||||
|
|
||||||
|
if cb.Subject != subject {
|
||||||
|
t.Errorf("[Subject] expected %q, got %q", subject, cb.Subject)
|
||||||
|
}
|
||||||
|
|
||||||
|
if cb.StartMatch != 0 {
|
||||||
|
t.Errorf("[StartMatch] expected %d, got %d", 0, cb.StartMatch)
|
||||||
|
}
|
||||||
|
|
||||||
|
if cb.CurrentPosition != 21 {
|
||||||
|
t.Errorf("[CurrentPosition] expected %d, got %d", 0, cb.CurrentPosition)
|
||||||
|
}
|
||||||
|
|
||||||
|
if cb.PatternPosition != 53 {
|
||||||
|
t.Errorf("[PatternPosition] expected %d, got %d", 53, cb.PatternPosition)
|
||||||
|
}
|
||||||
|
|
||||||
|
if cb.NextItemLength != 0 {
|
||||||
|
t.Errorf("[NextItemLength] expected %d, got %d", 0, cb.NextItemLength)
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0
|
||||||
|
})
|
||||||
|
|
||||||
|
m := r.MatchString(subject)
|
||||||
|
|
||||||
|
if !executed {
|
||||||
|
t.Error("expected callout to be executed")
|
||||||
|
} else if !m {
|
||||||
|
t.Error("expected regular expression to match the string")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user