Implement Callouts

This commit is contained in:
Elara 2023-07-16 23:23:47 -07:00
parent daf2d2e697
commit 23f1d484df
2 changed files with 183 additions and 7 deletions

135
pcre.go
View File

@ -27,7 +27,11 @@ type Regexp struct {
mtx *sync.Mutex
expr string
re uintptr
mctx uintptr
tls *libc.TLS
calloutMtx *sync.Mutex
callout *func(tls *libc.TLS, cbptr, data uintptr) int32
}
// Compile runs CompileOpts with no options.
@ -73,10 +77,12 @@ func CompileOpts(pattern string, options CompileOption) (*Regexp, error) {
// Create regexp instance
regex := Regexp{
expr: pattern,
mtx: &sync.Mutex{},
re: r,
tls: tls,
expr: pattern,
mtx: &sync.Mutex{},
re: r,
mctx: lib.Xpcre2_match_context_create_8(tls, 0),
tls: tls,
calloutMtx: &sync.Mutex{},
}
// Make sure resources are freed if GC collects the
@ -298,7 +304,7 @@ func (r *Regexp) FindStringIndex(s string) []int {
// FinAllString is the String version of FindAll
func (r *Regexp) FindAllString(s string, n int) []string {
matches := r.FindAll([]byte(s), n)
out := make([]string, len(matches))
for index, match := range matches {
out[index] = string(match)
@ -483,9 +489,12 @@ func (r *Regexp) ReplaceAllLiteralString(src, repl string) string {
// between those expression matches.
//
// Example:
//
// s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5)
// // s: ["", "b", "b", "c", "cadaaae"]
//
// The count determines the number of substrings to return:
//
// n > 0: at most n substrings; the last substring will be the unsplit remainder.
// n == 0: the result is nil (zero substrings)
// n < 0: all substrings
@ -556,6 +565,116 @@ func (r *Regexp) SubexpIndex(name string) int {
return int(ret)
}
type CalloutFlags uint32
const (
CalloutStartMatch = CalloutFlags(lib.DPCRE2_CALLOUT_STARTMATCH)
CalloutBacktrack = CalloutFlags(lib.DPCRE2_CALLOUT_BACKTRACK)
)
type CalloutBlock struct {
// Version contains the version number of the block format.
// The current version is 2.
Version uint32
// CalloutNumber contains the number of the callout, in the range 0-255.
// This is the number that follows "?C". For callouts with string arguments,
// this will always be zero.
CalloutNumber uint32
// CaptureTop contains the number of the highest numbered substring
// captured so far plus one. If no substrings have yet been captured,
// CaptureTop will be set to 1.
CaptureTop uint32
// CaptureLast contains the number of the last substring that was captured.
CaptureLast uint32
// Substrings contains all of the substrings captured so far.
Substrings []string
Mark string
// Subject contains the string passed to the match function.
Subject string
// StartMatch contains the offset within the subject at which the current match attempt started.
StartMatch uint
// CurrentPosition contains the offset of the current match pointer within the subject.
CurrentPosition uint
// PatternPosition contains the offset within the pattern string to the next item to be matched.
PatternPosition uint
// NextItemLength contains the length of the next item to be processed in the pattern string.
NextItemLength uint
// CalloutStringOffset contains the code unit offset to the start of the callout argument string within the original pattern string.
CalloutStringOffset uint
// CalloutString is the string for the callout. For numerical callouts, this will always be empty.
CalloutString string
// CalloutFlags contains the following flags:
// CalloutStartMatch
// This is set for the first callout after the start of matching for each new starting position in the subject.
// CalloutBacktrack
// This is set if there has been a matching backtrack since the previous callout, or since the start of matching if this is the first callout from a pcre2_match() run.
//
// Both bits are set when a backtrack has caused a "bumpalong" to a new starting position in the subject. Output
CalloutFlags CalloutFlags
}
func (r *Regexp) SetCallout(fn func(cb *CalloutBlock) int32) error {
cfn := func(tls *libc.TLS, cbptr, data uintptr) int32 {
ccb := (*lib.Tpcre2_callout_block_8)(unsafe.Pointer(cbptr))
cb := &CalloutBlock{
Version: ccb.Fversion,
CalloutNumber: ccb.Fcallout_number,
CaptureTop: ccb.Fcapture_top,
CaptureLast: ccb.Fcapture_last,
Mark: libc.GoString(ccb.Fmark),
StartMatch: uint(ccb.Fstart_match),
CurrentPosition: uint(ccb.Fcurrent_position),
PatternPosition: uint(ccb.Fpattern_position),
NextItemLength: uint(ccb.Fnext_item_length),
CalloutStringOffset: uint(ccb.Fcallout_string_offset),
CalloutFlags: CalloutFlags(ccb.Fcallout_flags),
}
subjectBytes := unsafe.Slice((*byte)(unsafe.Pointer(ccb.Fsubject)), ccb.Fsubject_length)
cb.Subject = string(subjectBytes)
calloutStrBytes := unsafe.Slice((*byte)(unsafe.Pointer(ccb.Fcallout_string)), ccb.Fcallout_string_length)
cb.CalloutString = string(calloutStrBytes)
ovecSlice := unsafe.Slice((*lib.Tsize_t)(unsafe.Pointer(ccb.Foffset_vector)), (ccb.Fcapture_top*2)-1)[2:]
for i := 0; i < len(ovecSlice); i += 2 {
if i+1 >= len(ovecSlice) {
cb.Substrings = append(cb.Substrings, cb.Subject[ovecSlice[i]:])
} else {
cb.Substrings = append(cb.Substrings, cb.Subject[ovecSlice[i]:ovecSlice[i+1]])
}
}
x := fn(cb)
return x
}
// Prevent callout functions from being GC'd
r.calloutMtx.Lock()
defer r.calloutMtx.Unlock()
r.callout = &cfn
ret := lib.Xpcre2_set_callout_8(r.tls, r.mctx, *(*uintptr)(unsafe.Pointer(&cfn)), 0)
if ret < 0 {
return codeToError(r.tls, ret)
}
return nil
}
// replaceBytes replaces the bytes at a given location, and returns a new
// offset, based on how much bigger or smaller the slice got after replacement
func replaceBytes(src, repl []byte, sOff, eOff lib.Tsize_t, diff int64) (int64, []byte) {
@ -577,7 +696,7 @@ func (r *Regexp) match(b []byte, options uint32, multi bool) ([][]lib.Tsize_t, e
if len(b) == 0 {
return nil, nil
}
r.mtx.Lock()
defer r.mtx.Unlock()
@ -600,7 +719,7 @@ func (r *Regexp) match(b []byte, options uint32, multi bool) ([][]lib.Tsize_t, e
// While the offset is less than the length of the subject
for offset < cSubjectLen {
// Execute expression on subject
ret := lib.Xpcre2_match_8(r.tls, r.re, cSubject, cSubjectLen, offset, options, md, 0)
ret := lib.Xpcre2_match_8(r.tls, r.re, cSubject, cSubjectLen, offset, options, md, r.mctx)
if ret < 0 {
// If no match found, break
if ret == lib.DPCRE2_ERROR_NOMATCH {
@ -670,6 +789,8 @@ func (r *Regexp) Close() error {
// Free the compiled code
lib.Xpcre2_code_free_8(r.tls, r.re)
// Free the match context
lib.Xpcre2_match_context_free_8(r.tls, r.mctx)
// Set regular expression to null
r.re = 0

View File

@ -234,3 +234,58 @@ func TestString(t *testing.T) {
t.Errorf("expected %s, got %s", expr, r.String())
}
}
func TestCallout(t *testing.T) {
const expr = `(https?)://([.\w\d]+\.[\w\d]{2,4}[\w\d?&=%/.-]*)(?C2)`
subject := "https://www.elara.ws/"
r := pcre.MustCompile(expr)
defer r.Close()
executed := false
r.SetCallout(func(cb *pcre.CalloutBlock) int32 {
executed = true
if cb.CalloutNumber != 2 {
t.Errorf("[CalloutNumber] expected %d, got %d", 2, cb.CalloutNumber)
}
if cb.CaptureTop != 3 {
t.Errorf("[CaptureTop] expected %d, got %d", 3, cb.CaptureTop)
}
if cb.CaptureLast != 2 {
t.Errorf("[CaptureLast] expected %d, got %d", 2, cb.CaptureLast)
}
if cb.Subject != subject {
t.Errorf("[Subject] expected %q, got %q", subject, cb.Subject)
}
if cb.StartMatch != 0 {
t.Errorf("[StartMatch] expected %d, got %d", 0, cb.StartMatch)
}
if cb.CurrentPosition != 21 {
t.Errorf("[CurrentPosition] expected %d, got %d", 0, cb.CurrentPosition)
}
if cb.PatternPosition != 53 {
t.Errorf("[PatternPosition] expected %d, got %d", 53, cb.PatternPosition)
}
if cb.NextItemLength != 0 {
t.Errorf("[NextItemLength] expected %d, got %d", 0, cb.NextItemLength)
}
return 0
})
m := r.MatchString(subject)
if !executed {
t.Error("expected callout to be executed")
} else if !m {
t.Error("expected regular expression to match the string")
}
}