pcre/pcre.go

802 lines
21 KiB
Go

// Package pcre is a library that provides pcre2 regular expressions
// in pure Go, allowing for features such as cross-compiling.
//
// The lib directory contains source code automatically translated from
// pcre2's C source code for each supported architecture and/or OS.
// This package wraps the automatically-translated source to provide a
// safe interface as close to Go's regexp library as possible.
package pcre
import (
"os"
"runtime"
"strconv"
"sync"
"unsafe"
"go.elara.ws/pcre/lib"
"modernc.org/libc"
)
// Version reports the version string of the pcre2 sources that are
// bundled with this package.
func Version() string {
	return lib.DPACKAGE_VERSION
}
// Regexp represents a compiled pcre2 regular expression.
//
// Values are created by Compile, CompileOpts and their Must variants,
// and hold C-side resources that are released by Close.
type Regexp struct {
// mtx is locked by match and SubexpIndex while they call into pcre2.
mtx *sync.Mutex
// expr is the pattern text given to CompileOpts; String returns it.
expr string
// re is the C pointer to the compiled pattern returned by pcre2_compile.
re uintptr
// mctx is the C pointer to the match context created together with re.
mctx uintptr
// tls is the libc thread-local storage handed to the calls into lib.
tls *libc.TLS
// calloutMtx is held by SetCallout while a callout is being installed.
calloutMtx *sync.Mutex
// callout references the installed callout function so that it is not
// garbage collected while pcre2 may still call it.
callout *func(tls *libc.TLS, cbptr, data uintptr) int32
}
// Compile compiles pattern with no compile options set; it is a
// shorthand for CompileOpts(pattern, 0).
//
// Close() should be called on the returned expression
// once it is no longer needed.
func Compile(pattern string) (*Regexp, error) {
	const noOptions CompileOption = 0
	return CompileOpts(pattern, noOptions)
}
// CompileOpts compiles the provided pattern using the given options.
//
// On success the returned expression owns a libc TLS, the compiled pcre2
// code and a match context. Close() should be called on the returned
// expression once it is no longer needed; a finalizer calls Close as a
// fallback if the expression is garbage collected first.
//
// On failure a nil expression is returned together with either the error
// from allocating the C copy of the pattern or the pcre2 compile error,
// and the TLS created for the attempt is closed again.
func CompileOpts(pattern string, options CompileOption) (*Regexp, error) {
	tls := libc.NewTLS()
	// The TLS is only handed over to the Regexp on success. This defer is
	// registered first so that it runs last, after the deferred frees
	// below, which still need the TLS.
	owned := false
	defer func() {
		if !owned {
			tls.Close()
		}
	}()

	// Get C string of pattern and free it when done
	cPattern, err := libc.CString(pattern)
	if err != nil {
		return nil, err
	}
	defer libc.Xfree(tls, cPattern)

	// Allocate the C-side error value that pcre2_compile fills in
	cErr := allocError(tls)
	defer libc.Xfree(tls, cErr)

	// Get pointers to the error code and error offset inside cErr
	errPtr := addErrCodeOffset(cErr)
	errOffsetPtr := addErrOffsetOffset(cErr)

	// Convert pattern length to size_t type
	cPatLen := lib.Tsize_t(len(pattern))

	// Compile expression; a zero pointer signals a compile error
	code := lib.Xpcre2_compile_8(tls, cPattern, cPatLen, uint32(options), errPtr, errOffsetPtr, 0)
	if code == 0 {
		return nil, ptrToError(tls, cErr)
	}

	// Create regexp instance
	regex := &Regexp{
		expr:       pattern,
		mtx:        &sync.Mutex{},
		re:         code,
		mctx:       lib.Xpcre2_match_context_create_8(tls, 0),
		tls:        tls,
		calloutMtx: &sync.Mutex{},
	}

	// Make sure resources are freed if GC collects the
	// regular expression.
	runtime.SetFinalizer(regex, func(r *Regexp) error {
		return r.Close()
	})

	owned = true
	return regex, nil
}
// MustCompile is like Compile but panics with the compile error
// instead of returning it.
//
// Close() should be called on the returned expression
// once it is no longer needed.
func MustCompile(pattern string) *Regexp {
	compiled, compileErr := Compile(pattern)
	if compileErr == nil {
		return compiled
	}
	panic(compileErr)
}
// MustCompileOpts is like CompileOpts but panics with the compile
// error instead of returning it.
//
// Close() should be called on the returned expression
// once it is no longer needed.
func MustCompileOpts(pattern string, options CompileOption) *Regexp {
	compiled, compileErr := CompileOpts(pattern, options)
	if compileErr == nil {
		return compiled
	}
	panic(compileErr)
}
// Find returns the part of b covered by the leftmost match of the
// regular expression. The result shares memory with b.
// A return value of nil indicates no match.
// Find panics if the underlying match call reports an error.
func (r *Regexp) Find(b []byte) []byte {
	found, err := r.match(b, 0, false)
	switch {
	case err != nil:
		panic(err)
	case len(found) == 0:
		return nil
	}
	start, end := found[0][0], found[0][1]
	return b[start:end]
}
// FindIndex returns a two-element slice holding the start and end
// offsets in b of the leftmost match of the regular expression.
// A return value of nil indicates no match.
// FindIndex panics if the underlying match call reports an error.
func (r *Regexp) FindIndex(b []byte) []int {
	found, err := r.match(b, 0, false)
	if err != nil {
		panic(err)
	}
	if len(found) < 1 {
		return nil
	}
	loc := make([]int, 2)
	loc[0] = int(found[0][0])
	loc[1] = int(found[0][1])
	return loc
}
// FindAll returns the parts of b covered by successive matches of the
// regular expression. At most n matches are returned when n > 0, all
// matches when n < 0. The results share memory with b.
// A return value of nil indicates no match (or n == 0).
// FindAll panics if the underlying match call reports an error.
func (r *Regexp) FindAll(b []byte, n int) [][]byte {
	found, err := r.match(b, 0, true)
	if err != nil {
		panic(err)
	}
	if n == 0 || len(found) == 0 {
		return nil
	}
	limit := len(found)
	if n > 0 && n < limit {
		limit = n
	}
	out := make([][]byte, 0, limit)
	for _, m := range found[:limit] {
		out = append(out, b[m[0]:m[1]])
	}
	return out
}
// FindAllIndex returns the start and end offsets of successive matches
// of the regular expression, one two-element slice per match. At most
// n matches are returned when n > 0, all matches when n < 0.
// A return value of nil indicates no match (or n == 0).
// FindAllIndex panics if the underlying match call reports an error.
func (r *Regexp) FindAllIndex(b []byte, n int) [][]int {
	found, err := r.match(b, 0, true)
	if err != nil {
		panic(err)
	}
	if n == 0 || len(found) == 0 {
		return nil
	}
	limit := len(found)
	if n > 0 && n < limit {
		limit = n
	}
	out := make([][]int, 0, limit)
	for _, m := range found[:limit] {
		out = append(out, []int{int(m[0]), int(m[1])})
	}
	return out
}
// FindSubmatch returns a slice containing the leftmost match as the
// first element, and the submatches as the subsequent elements.
// The elements share memory with b.
//
// A submatch whose group did not take part in the match is reported
// as a nil element. A return value of nil indicates no match.
// FindSubmatch panics if the underlying match call reports an error.
func (r *Regexp) FindSubmatch(b []byte) [][]byte {
	matches, err := r.match(b, 0, false)
	if err != nil {
		panic(err)
	}
	if len(matches) == 0 {
		return nil
	}

	// pcre2 marks the offsets of groups that did not participate in the
	// match with PCRE2_UNSET, the all-ones size_t value. Slicing b with
	// those offsets would panic, so such groups are reported as nil.
	const unset = ^lib.Tsize_t(0)

	match := matches[0]
	out := make([][]byte, 0, len(match)/2)
	for i := 0; i+1 < len(match); i += 2 {
		start, end := match[i], match[i+1]
		if start == unset || end == unset {
			out = append(out, nil)
			continue
		}
		out = append(out, b[start:end])
	}
	return out
}
// FindSubmatchIndex returns the offset vector of the leftmost match
// converted to ints: the start and end offsets of the whole match
// followed by one start/end pair per submatch.
// A return value of nil indicates no match.
// FindSubmatchIndex panics if the underlying match call reports an error.
func (r *Regexp) FindSubmatchIndex(b []byte) []int {
	found, err := r.match(b, 0, false)
	if err != nil {
		panic(err)
	}
	if len(found) < 1 {
		return nil
	}
	first := found[0]
	locs := make([]int, 0, len(first))
	for i := range first {
		locs = append(locs, int(first[i]))
	}
	return locs
}
// FindAllSubmatch returns a slice of all matches and submatches
// of the regular expression. It will return no more than n matches.
// If n < 0, it will return all matches. If n == 0 or nothing matches,
// it returns nil.
//
// Each entry holds the whole match followed by its submatches; a
// submatch whose group did not take part in that match is a nil
// element. The returned byte slices share memory with b.
// FindAllSubmatch panics if the underlying match call reports an error.
func (r *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
	matches, err := r.match(b, 0, true)
	if err != nil {
		panic(err)
	}
	if len(matches) == 0 || n == 0 {
		return nil
	}
	if n > 0 && len(matches) > n {
		matches = matches[:n]
	}

	// pcre2 marks the offsets of groups that did not participate in a
	// match with PCRE2_UNSET, the all-ones size_t value. Slicing b with
	// those offsets would panic, so such groups are reported as nil.
	const unset = ^lib.Tsize_t(0)

	out := make([][][]byte, len(matches))
	for index, match := range matches {
		outMatch := make([][]byte, 0, len(match)/2)
		for i := 0; i+1 < len(match); i += 2 {
			start, end := match[i], match[i+1]
			if start == unset || end == unset {
				outMatch = append(outMatch, nil)
				continue
			}
			outMatch = append(outMatch, b[start:end])
		}
		out[index] = outMatch
	}
	return out
}
// FindAllSubmatchIndex returns, for each match of the regular
// expression, the offsets of the match and of its submatches
// converted to ints. It will return no more than n matches. If n < 0,
// it will return all matches. If n == 0 or nothing matches, it
// returns nil.
// FindAllSubmatchIndex panics if the underlying match call reports an error.
func (r *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
	found, err := r.match(b, 0, true)
	if err != nil {
		panic(err)
	}
	if n == 0 || len(found) == 0 {
		return nil
	}
	limit := len(found)
	if n > 0 && n < limit {
		limit = n
	}
	out := make([][]int, 0, limit)
	for _, vec := range found[:limit] {
		locs := make([]int, len(vec))
		for j := range vec {
			locs[j] = int(vec[j])
		}
		out = append(out, locs)
	}
	return out
}
// FindString is the string version of Find. It returns the text of
// the leftmost match, or the empty string if there is no match.
func (r *Regexp) FindString(s string) string {
	found := r.Find([]byte(s))
	return string(found)
}
// FindStringIndex is the string version of FindIndex: it converts s
// to bytes and returns whatever FindIndex returns for them.
func (r *Regexp) FindStringIndex(s string) []int {
	subject := []byte(s)
	return r.FindIndex(subject)
}
// FindAllString is the string version of FindAll. It returns the text
// of up to n successive matches (all matches if n < 0).
//
// A return value of nil indicates no match (or n == 0), mirroring
// FindAll, so callers can test the result against nil.
func (r *Regexp) FindAllString(s string, n int) []string {
	matches := r.FindAll([]byte(s), n)
	if len(matches) == 0 {
		return nil
	}
	out := make([]string, len(matches))
	for index, match := range matches {
		out[index] = string(match)
	}
	return out
}
// FindAllStringIndex is the string version of FindAllIndex: it
// converts s to bytes and returns whatever FindAllIndex returns.
func (r *Regexp) FindAllStringIndex(s string, n int) [][]int {
	subject := []byte(s)
	return r.FindAllIndex(subject, n)
}
// FindStringSubmatch is the string version of FindSubmatch. The first
// element is the text of the leftmost match and the following elements
// are the texts of its submatches.
//
// A return value of nil indicates no match, mirroring FindSubmatch, so
// the usual `if m == nil` guard before indexing the result works.
func (r *Regexp) FindStringSubmatch(s string) []string {
	matches := r.FindSubmatch([]byte(s))
	if len(matches) == 0 {
		return nil
	}
	out := make([]string, len(matches))
	for index, match := range matches {
		out[index] = string(match)
	}
	return out
}
// FindStringSubmatchIndex is the string version of FindSubmatchIndex:
// it converts s to bytes and returns whatever FindSubmatchIndex returns.
func (r *Regexp) FindStringSubmatchIndex(s string) []int {
	subject := []byte(s)
	return r.FindSubmatchIndex(subject)
}
// FindAllStringSubmatch is the string version of FindAllSubmatch. Each
// entry holds the text of one match followed by the texts of its
// submatches; at most n matches are returned (all if n < 0).
//
// A return value of nil indicates no match (or n == 0), mirroring
// FindAllSubmatch, so callers can test the result against nil.
func (r *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
	matches := r.FindAllSubmatch([]byte(s), n)
	if len(matches) == 0 {
		return nil
	}
	out := make([][]string, len(matches))
	for index, match := range matches {
		outMatch := make([]string, len(match))
		for subIndex, byteMatch := range match {
			outMatch[subIndex] = string(byteMatch)
		}
		out[index] = outMatch
	}
	return out
}
// FindAllStringSubmatchIndex is the string version of
// FindAllSubmatchIndex: it converts s to bytes and returns whatever
// FindAllSubmatchIndex returns.
func (r *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
	subject := []byte(s)
	return r.FindAllSubmatchIndex(subject, n)
}
// Match reports whether b contains a match of the regular expression,
// i.e. whether Find returns a non-nil result for b.
func (r *Regexp) Match(b []byte) bool {
	found := r.Find(b)
	return found != nil
}
// MatchString is the string version of Match: it reports whether
// Find returns a non-nil result for the bytes of s.
func (r *Regexp) MatchString(s string) bool {
	found := r.Find([]byte(s))
	return found != nil
}
// NumSubexp returns the number of parenthesized subexpressions in the
// regular expression, as reported by pcre2's capture-count pattern
// info.
func (r *Regexp) NumSubexp() int {
	count := r.patternInfo(lib.DPCRE2_INFO_CAPTURECOUNT)
	return int(count)
}
// ReplaceAll returns a copy of src, replacing matches of the
// regular expression with the replacement text repl.
//
// Inside repl, $ signs are expanded with os.Expand: $1 or ${1}
// represents the text of the first submatch and $name or ${name}
// the text of the subexpression called "name". A reference expands to
// the empty string when it names an unknown subexpression, when its
// index is zero, negative or out of range, or when the referenced
// group did not take part in the match.
//
// The result never aliases src, even when nothing matches.
// ReplaceAll panics if the underlying match call reports an error.
func (r *Regexp) ReplaceAll(src, repl []byte) []byte {
	matches, err := r.match(src, 0, true)
	if err != nil {
		panic(err)
	}

	out := make([]byte, len(src))
	copy(out, src)
	if len(matches) == 0 {
		return out
	}

	// pcre2 marks the offsets of groups that did not participate in a
	// match with PCRE2_UNSET, the all-ones size_t value.
	const unset = ^lib.Tsize_t(0)

	// The template is the same for every match; convert it only once.
	template := string(repl)

	var diff int64
	for _, match := range matches {
		expanded := os.Expand(template, func(name string) string {
			i, err := strconv.Atoi(name)
			if err != nil {
				// Not a number, so treat it as a group name
				i = r.SubexpIndex(name)
				if i == -1 {
					return ""
				}
			}

			// If the given group does not exist, return empty string.
			// Negative indices (e.g. "${-1}") are rejected here as well
			// so they can never be used to index the offset vector.
			if i <= 0 || len(match) < 2*i+2 {
				return ""
			}

			start, end := match[2*i], match[2*i+1]
			if start == unset || end == unset {
				return ""
			}
			return string(src[start:end])
		})

		// Splice the expanded replacement over the match; diff tracks how
		// far offsets into src have shifted within out so far.
		diff, out = replaceBytes(out, []byte(expanded), match[0], match[1], diff)
	}
	return out
}
// ReplaceAllFunc returns a copy of src in which all matches of the
// regular expression have been replaced by the return value of function
// repl applied to the matched byte slice. The replacement returned by
// repl is substituted directly, without using Expand.
//
// The slice passed to repl shares memory with src. The result never
// aliases src, even when nothing matches.
// ReplaceAllFunc panics if the underlying match call reports an error.
func (r *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
	matches, err := r.match(src, 0, true)
	if err != nil {
		panic(err)
	}

	out := make([]byte, len(src))
	copy(out, src)
	if len(matches) == 0 {
		return out
	}

	// diff tracks how far offsets into src have shifted within out
	// after the replacements made so far.
	var diff int64
	for _, match := range matches {
		replBytes := repl(src[match[0]:match[1]])
		diff, out = replaceBytes(out, replBytes, match[0], match[1], diff)
	}
	return out
}
// ReplaceAllLiteral returns a copy of src, replacing matches of
// the regular expression with the replacement bytes repl.
// The replacement is substituted directly, without using Expand.
//
// The result never aliases src, even when nothing matches.
// ReplaceAllLiteral panics if the underlying match call reports an error.
func (r *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
	matches, err := r.match(src, 0, true)
	if err != nil {
		panic(err)
	}

	out := make([]byte, len(src))
	copy(out, src)
	if len(matches) == 0 {
		return out
	}

	// diff tracks how far offsets into src have shifted within out
	// after the replacements made so far.
	var diff int64
	for _, match := range matches {
		diff, out = replaceBytes(out, repl, match[0], match[1], diff)
	}
	return out
}
// ReplaceAllString is the string version of ReplaceAll: src and repl
// are converted to bytes and the result is converted back to a string.
func (r *Regexp) ReplaceAllString(src, repl string) string {
	replaced := r.ReplaceAll([]byte(src), []byte(repl))
	return string(replaced)
}
// ReplaceAllStringFunc is the string version of ReplaceAllFunc: repl
// receives each matched text as a string and returns its replacement.
func (r *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
	// Adapt the string callback to the byte-slice callback expected
	// by ReplaceAllFunc.
	adapter := func(match []byte) []byte {
		return []byte(repl(string(match)))
	}
	replaced := r.ReplaceAllFunc([]byte(src), adapter)
	return string(replaced)
}
// ReplaceAllLiteralString is the string version of ReplaceAllLiteral:
// src and repl are converted to bytes and the result is converted back
// to a string.
func (r *Regexp) ReplaceAllLiteralString(src, repl string) string {
	replaced := r.ReplaceAllLiteral([]byte(src), []byte(repl))
	return string(replaced)
}
// Split cuts s at every match of the expression and returns the
// pieces of s that lie between those matches.
//
// Example:
//
//	s := pcre.MustCompile("a*").Split("abaabaccadaaae", 5)
//	// s: ["", "b", "b", "c", "cadaaae"]
//
// The count determines the number of substrings to return:
//
//	n > 0: at most n substrings; the last substring will be the unsplit remainder.
//	n == 0: the result is nil (zero substrings)
//	n < 0: all substrings
func (r *Regexp) Split(s string, n int) []string {
	switch {
	case n == 0:
		return nil
	case len(r.expr) > 0 && len(s) == 0:
		return []string{""}
	}

	locs := r.FindAllStringIndex(s, n)
	parts := make([]string, 0, len(locs))

	// pieceStart is where the next piece begins; lastMatchStart is the
	// start of the most recently consumed match.
	pieceStart, lastMatchStart := 0, 0
	for _, loc := range locs {
		// Leave room for the unsplit remainder as the n-th piece
		if n > 0 && len(parts) >= n-1 {
			break
		}

		lastMatchStart = loc[0]
		if loc[1] != 0 {
			parts = append(parts, s[pieceStart:lastMatchStart])
		}
		pieceStart = loc[1]
	}

	if lastMatchStart != len(s) {
		parts = append(parts, s[pieceStart:])
	}
	return parts
}
// String returns the pattern text that was passed in when the
// regular expression was compiled.
func (r *Regexp) String() string {
	pattern := r.expr
	return pattern
}
// SubexpIndex returns the index of the subexpression
// with the given name, or -1 if there is no subexpression
// with that name.
//
// It panics if the C copy of name cannot be allocated or if pcre2
// reports any error other than "no such substring".
func (r *Regexp) SubexpIndex(name string) int {
	r.mtx.Lock()
	defer r.mtx.Unlock()

	// Get C string of name
	cName, err := libc.CString(name)
	if err != nil {
		panic(err)
	}
	// The C string is only needed for the lookup below; free it on
	// every exit path so repeated lookups do not leak memory.
	defer libc.Xfree(r.tls, cName)

	// Get substring index from name
	ret := lib.Xpcre2_substring_number_from_name_8(r.tls, r.re, cName)

	// If no substring error returned, return -1.
	// If a different error is returned, panic.
	if ret == lib.DPCRE2_ERROR_NOSUBSTRING {
		return -1
	}
	if ret < 0 {
		panic(codeToError(r.tls, ret))
	}

	// Return the index of the subexpression
	return int(ret)
}
// CalloutFlags is the set of flag bits reported in the CalloutFlags
// field of a CalloutBlock.
type CalloutFlags uint32
const (
// CalloutStartMatch is set for the first callout after the start of
// matching for each new starting position in the subject.
CalloutStartMatch = CalloutFlags(lib.DPCRE2_CALLOUT_STARTMATCH)
// CalloutBacktrack is set if there has been a matching backtrack since
// the previous callout, or since the start of matching if this is the
// first callout from a pcre2_match() run.
CalloutBacktrack = CalloutFlags(lib.DPCRE2_CALLOUT_BACKTRACK)
)
// CalloutBlock is the Go view of pcre2's callout block. SetCallout
// builds one from the C block for every callout and passes it to the
// user-supplied callout function.
type CalloutBlock struct {
// Version contains the version number of the block format.
// The current version is 2.
Version uint32
// CalloutNumber contains the number of the callout, in the range 0-255.
// This is the number that follows "?C". For callouts with string arguments,
// this will always be zero.
CalloutNumber uint32
// CaptureTop contains the number of the highest numbered substring
// captured so far plus one. If no substrings have yet been captured,
// CaptureTop will be set to 1.
CaptureTop uint32
// CaptureLast contains the number of the last substring that was captured.
CaptureLast uint32
// Substrings contains all of the substrings captured so far.
Substrings []string
// Mark is the Go string read from the C block's mark pointer.
// NOTE(review): presumably the name of the most recently passed
// (*MARK) item; confirm against the pcre2callout documentation.
Mark string
// Subject contains the string passed to the match function.
Subject string
// StartMatch contains the offset within the subject at which the current match attempt started.
StartMatch uint
// CurrentPosition contains the offset of the current match pointer within the subject.
CurrentPosition uint
// PatternPosition contains the offset within the pattern string to the next item to be matched.
PatternPosition uint
// NextItemLength contains the length of the next item to be processed in the pattern string.
NextItemLength uint
// CalloutStringOffset contains the code unit offset to the start of the callout argument string within the original pattern string.
CalloutStringOffset uint
// CalloutString is the string for the callout. For numerical callouts, this will always be empty.
CalloutString string
// CalloutFlags contains the following flags:
// CalloutStartMatch
// This is set for the first callout after the start of matching for each new starting position in the subject.
// CalloutBacktrack
// This is set if there has been a matching backtrack since the previous callout, or since the start of matching if this is the first callout from a pcre2_match() run.
//
// Both bits are set when a backtrack has caused a "bumpalong" to a new starting position in the subject.
CalloutFlags CalloutFlags
}
// SetCallout sets a callout function that will be called at specified points in the matching operation.
// fn should return zero if it ran successfully or a non-zero integer to force an error.
// See https://www.pcre.org/current/doc/html/pcre2callout.html for more information.
//
// For every callout the C callout block is converted into a CalloutBlock
// before fn is invoked. Substrings is left empty while no substring has
// been captured yet, and a group whose offsets are unset or out of range
// is reported as an empty string so the remaining entries keep their
// positions.
//
// SetCallout returns an error if pcre2 rejects the callout.
func (r *Regexp) SetCallout(fn func(cb *CalloutBlock) int32) error {
	cfn := func(tls *libc.TLS, cbptr, data uintptr) int32 {
		ccb := (*lib.Tpcre2_callout_block_8)(unsafe.Pointer(cbptr))

		cb := &CalloutBlock{
			Version:             ccb.Fversion,
			CalloutNumber:       ccb.Fcallout_number,
			CaptureTop:          ccb.Fcapture_top,
			CaptureLast:         ccb.Fcapture_last,
			Mark:                libc.GoString(ccb.Fmark),
			StartMatch:          uint(ccb.Fstart_match),
			CurrentPosition:     uint(ccb.Fcurrent_position),
			PatternPosition:     uint(ccb.Fpattern_position),
			NextItemLength:      uint(ccb.Fnext_item_length),
			CalloutStringOffset: uint(ccb.Fcallout_string_offset),
			CalloutFlags:        CalloutFlags(ccb.Fcallout_flags),
		}

		// Copy the subject and the callout string out of C memory
		subjectBytes := unsafe.Slice((*byte)(unsafe.Pointer(ccb.Fsubject)), ccb.Fsubject_length)
		cb.Subject = string(subjectBytes)

		calloutStrBytes := unsafe.Slice((*byte)(unsafe.Pointer(ccb.Fcallout_string)), ccb.Fcallout_string_length)
		cb.CalloutString = string(calloutStrBytes)

		// The offset vector is read up to (capture_top*2)-1 entries and the
		// first pair is skipped. With capture_top == 1 (nothing captured
		// yet) that leaves nothing to read; slicing unconditionally would
		// panic, so the conversion is skipped in that case.
		ovecLen := int(ccb.Fcapture_top)*2 - 1
		if ovecLen > 2 {
			ovecSlice := unsafe.Slice((*lib.Tsize_t)(unsafe.Pointer(ccb.Foffset_vector)), ovecLen)[2:]
			subjectLen := lib.Tsize_t(len(cb.Subject))

			for i := 0; i < len(ovecSlice); i += 2 {
				// A trailing start offset without an end offset extends
				// to the end of the subject.
				start, end := ovecSlice[i], subjectLen
				if i+1 < len(ovecSlice) {
					end = ovecSlice[i+1]
				}

				// Unset groups carry offsets far beyond the subject; using
				// them to slice would panic.
				if start > end || end > subjectLen {
					cb.Substrings = append(cb.Substrings, "")
					continue
				}
				cb.Substrings = append(cb.Substrings, cb.Subject[start:end])
			}
		}

		return fn(cb)
	}

	r.calloutMtx.Lock()
	defer r.calloutMtx.Unlock()

	// Prevent callout functions from being GC'd
	r.callout = &cfn

	ret := lib.Xpcre2_set_callout_8(r.tls, r.mctx, *(*uintptr)(unsafe.Pointer(&cfn)), 0)
	if ret < 0 {
		return codeToError(r.tls, ret)
	}
	return nil
}
// replaceBytes returns a new slice in which the bytes of src between
// sOff+diff and eOff+diff have been replaced by repl, together with the
// updated offset difference.
//
// sOff and eOff are offsets into the original, unmodified subject; diff
// is how far such offsets have shifted within src because of earlier
// replacements. The returned difference is diff plus the number of
// bytes this replacement grew (or, if negative, shrank) the slice by.
//
// The result is always built in freshly allocated memory: neither src
// nor the backing array of repl is written to, even when repl has
// spare capacity.
func replaceBytes(src, repl []byte, sOff, eOff lib.Tsize_t, diff int64) (int64, []byte) {
	start := int64(sOff) + diff
	end := int64(eOff) + diff

	head, tail := src[:start], src[end:]

	out := make([]byte, 0, len(head)+len(repl)+len(tail))
	out = append(out, head...)
	out = append(out, repl...)
	out = append(out, tail...)

	return diff + int64(len(out)-len(src)), out
}
// match runs the compiled expression against b through pcre2_match and
// returns one offset vector per recorded match. Each vector is a copy of
// the pcre2 output vector: start/end offset pairs into b, the first pair
// describing the whole match and the following pairs the capture groups.
//
// options is passed to pcre2_match unchanged. If multi is false, the search
// stops after the first recorded non-empty match; if multi is true, the
// expression is re-run from the end of each match until the offset reaches
// the end of the subject or pcre2 reports no match.
//
// An empty subject yields (nil, nil) without calling pcre2. Any pcre2 error
// other than "no match" is converted with codeToError and returned. Failure
// to create the match data panics.
func (r *Regexp) match(b []byte, options uint32, multi bool) ([][]lib.Tsize_t, error) {
// &b[0] below requires at least one byte, so an empty subject is
// reported as "no match" up front.
if len(b) == 0 {
return nil, nil
}
r.mtx.Lock()
defer r.mtx.Unlock()
// Create a C pointer to the subject
sp := unsafe.Pointer(&b[0])
cSubject := uintptr(sp)
// Convert the size of the subject to a C size_t type
cSubjectLen := lib.Tsize_t(len(b))
// Create match data using the pattern to figure out the buffer size
md := lib.Xpcre2_match_data_create_from_pattern_8(r.tls, r.re, 0)
if md == 0 {
panic("error creating match data")
}
// Free the match data at the end of the function
defer lib.Xpcre2_match_data_free_8(r.tls, md)
// offset is the position in the subject at which the next search starts
var offset lib.Tsize_t
var out [][]lib.Tsize_t
// While the offset is less than the length of the subject
for offset < cSubjectLen {
// Execute expression on subject
ret := lib.Xpcre2_match_8(r.tls, r.re, cSubject, cSubjectLen, offset, options, md, r.mctx)
if ret < 0 {
// If no match found, break
if ret == lib.DPCRE2_ERROR_NOMATCH {
break
}
// Any other negative code is a real error; matches found so far
// are discarded.
return nil, codeToError(r.tls, ret)
} else {
// Get amount of pairs in output vector
pairAmt := lib.Xpcre2_get_ovector_count_8(r.tls, md)
// Get pointer to output vector
ovec := lib.Xpcre2_get_ovector_pointer_8(r.tls, md)
// Create a Go slice using the output vector as the underlying array
slice := unsafe.Slice((*lib.Tsize_t)(unsafe.Pointer(ovec)), pairAmt*2)
// Create a new slice and copy the elements from the slice
// This is required because the match data will be freed in
// a defer, and that would cause a panic every time the slice
// is used later.
matches := make([]lib.Tsize_t, len(slice))
copy(matches, slice)
// Empty matches (start == end) are handled separately. An empty match
// is recorded only if an earlier match exists and the empty match does
// not start exactly where that match ended; every other empty match,
// including any found before the first recorded match, is dropped.
// In both cases the search resumes one position past the empty match,
// and the continue skips the !multi check below, so a single-match
// search keeps going until it records a non-empty match.
if slice[0] == slice[1] && len(out) > 0 && slice[0] != out[len(out)-1][1] {
out = append(out, matches)
offset = slice[1] + 1
continue
} else if slice[0] == slice[1] {
offset = slice[1] + 1
continue
}
// Add the match to the output
out = append(out, matches)
// Set the next offset to the end index of the match
offset = matches[1]
}
// If multiple matches disabled, break
if !multi {
break
}
}
return out, nil
}
// patternInfo asks pcre2's pattern info function for the item selected
// by what and returns the 32-bit value written by that call. The
// return code of the pcre2 call itself is not inspected.
func (r *Regexp) patternInfo(what uint32) uint32 {
	var info uint32

	// pcre2 writes the requested value through this C pointer
	infoPtr := uintptr(unsafe.Pointer(&info))
	lib.Xpcre2_pattern_info_8(r.tls, r.re, what, infoPtr)

	return info
}
// Close frees the resources used by the regular expression: the
// compiled code, the match context and the libc thread-local storage.
//
// Close is idempotent. Calling it on a nil expression, or on one that
// has already been closed, does nothing. This matters because the
// finalizer installed at compile time also calls Close, so an explicit
// Close followed by garbage collection must not free anything twice.
//
// The returned error is always nil. The expression must not be used
// for matching after Close.
func (r *Regexp) Close() error {
	// A zero code pointer marks an expression that was already closed
	// (or never compiled); there is nothing left to release.
	if r == nil || r.re == 0 {
		return nil
	}

	// Close thread-local storage once the C objects have been freed
	defer r.tls.Close()

	// Free the compiled code
	lib.Xpcre2_code_free_8(r.tls, r.re)

	// Free the match context
	lib.Xpcre2_match_context_free_8(r.tls, r.mctx)

	// Clear both C pointers so a later Close is a no-op
	r.re = 0
	r.mctx = 0

	return nil
}