Compare commits

...

11 Commits

Author SHA1 Message Date
4ce849193f Add riscv64 test
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2023-08-04 20:25:57 -07:00
107b5db1fb Add CI status badge to README
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2023-07-27 09:44:57 -07:00
e90cc6feac Add Varnish test case
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
2023-07-27 09:41:27 -07:00
18912728e9 Add woodpecker config
All checks were successful
ci/woodpecker/manual/woodpecker Pipeline was successful
2023-07-27 09:36:28 -07:00
0a0008aef4 Fix typo 2023-07-26 15:53:58 -07:00
6d906d55a6 Fix panic when pcre2 returns PCRE2_UNSET 2023-07-25 23:35:29 -06:00
d1b9df80a1 Fix panic when no substrings are found 2023-07-17 07:11:35 -07:00
bde850752d Move new types to the types.go file 2023-07-17 07:00:24 -07:00
bafc40da8a Run formatter 2023-07-17 06:46:47 -07:00
c6fb3e8489 Fix comment 2023-07-17 06:38:03 -07:00
23df260dfd Fix README badge 2023-07-17 06:37:39 -07:00
7 changed files with 145 additions and 107 deletions

13
.woodpecker.yml Normal file
View File

@@ -0,0 +1,13 @@
matrix:
platform:
- linux/amd64
- linux/arm64
- linux/riscv64
steps:
test:
image: gitea.elara.ws/elara6331/golang:latest
commands:
- go test
when:
- event: push

View File

@@ -1,6 +1,7 @@
# pcre
[![Go Reference](https://pkg.go.dev/badge/go.arsenm.dev/pcre.svg)](https://pkg.go.dev/go.arsenm.dev/pcre)
[![Go Reference](https://pkg.go.dev/badge/go.elara.ws/pcre.svg)](https://pkg.go.dev/go.elara.ws/pcre)
[![status-badge](https://ci.elara.ws/api/badges/49/status.svg)](https://ci.elara.ws/49)
This package provides a CGo-free port of the PCRE2 regular expression library. The [lib](lib) directory contains source code automatically translated from PCRE2's C source. This package wraps that code and provides an interface as close as possible to Go's stdlib [regexp](https://pkg.go.dev/regexp) package
@@ -50,4 +51,4 @@ CC=/usr/bin/gcc ccgo -o pcre2_<os>_<arch>.go -pkgname lib -trace-translation-uni
- If cross-compiling, set the `CCGO_CC` variable to the path of the cross-compiler, and the `CCGO_AR` variable to the path of the cross-compiler's `ar` binary. Also, set `TARGET_GOARCH` to the GOARCH you're targeting and `TARGET_GOOS` to the OS you're targeting.
- Once the command completes, two go files will be created. One will start with `pcre2`, the other with `capi`. Copy both of these to the `lib` directory in this repo.
- Once the command completes, two go files will be created. One will start with `pcre2`, the other with `capi`. Copy both of these to the `lib` directory in this repo.

View File

@@ -108,7 +108,7 @@ func Glob(glob string) ([]string, error) {
// Join splitDir and add filepath separator. This is the directory that will be searched.
dir := filepath.Join(splitDir...)
if filepath.IsAbs(glob) {
dir = string(filepath.Separator) + dir
}

View File

@@ -39,17 +39,17 @@ func TestCompileGlob(t *testing.T) {
}
func TestGlob(t *testing.T) {
err := os.MkdirAll("pcretest/dir1", 0755)
err := os.MkdirAll("pcretest/dir1", 0o755)
if err != nil {
t.Fatal(err)
}
err = os.MkdirAll("pcretest/dir2", 0755)
err = os.MkdirAll("pcretest/dir2", 0o755)
if err != nil {
t.Fatal(err)
}
err = os.MkdirAll("pcretest/test1/dir4", 0755)
err = os.MkdirAll("pcretest/test1/dir4", 0o755)
if err != nil {
t.Fatal(err)
}
@@ -58,7 +58,7 @@ func TestGlob(t *testing.T) {
if err != nil {
t.Fatal(err)
}
err = touch("pcretest/file2")
if err != nil {
t.Fatal(err)
@@ -113,9 +113,9 @@ func TestGlob(t *testing.T) {
}
func touch(path string) error {
fl, err := os.OpenFile(path, os.O_CREATE, 0644)
fl, err := os.OpenFile(path, os.O_CREATE, 0o644)
if err != nil {
return err
}
return fl.Close()
}
}

91
pcre.go
View File

@@ -8,6 +8,7 @@
package pcre
import (
"math"
"os"
"runtime"
"strconv"
@@ -19,6 +20,8 @@ import (
"modernc.org/libc"
)
// Unset is the sentinel offset that marks a submatch with no value.
// FindSubmatch and FindAllSubmatch append nil for any group whose start
// offset equals Unset instead of slicing the input with it.
// NOTE(review): presumably mirrors PCRE2_UNSET (the all-ones size value) — confirm.
const Unset = math.MaxUint
// Version reports which release of pcre2 is embedded in this library.
func Version() string {
	return lib.DPACKAGE_VERSION
}
@@ -208,7 +211,11 @@ func (r *Regexp) FindSubmatch(b []byte) [][]byte {
out := make([][]byte, 0, len(match)/2)
for i := 0; i < len(match); i += 2 {
out = append(out, b[match[i]:match[i+1]])
if match[i] == Unset {
out = append(out, nil)
} else {
out = append(out, b[match[i]:match[i+1]])
}
}
return out
}
@@ -253,7 +260,11 @@ func (r *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
outMatch := make([][]byte, 0, len(match)/2)
for i := 0; i < len(match); i += 2 {
outMatch = append(outMatch, b[match[i]:match[i+1]])
if match[i] == Unset {
outMatch = append(outMatch, nil)
} else {
outMatch = append(outMatch, b[match[i]:match[i+1]])
}
}
out[index] = outMatch
@@ -565,67 +576,6 @@ func (r *Regexp) SubexpIndex(name string) int {
return int(ret)
}
// CalloutFlags holds the flag bits exposed through the CalloutFlags field
// of CalloutBlock.
type CalloutFlags uint32
// Flag bits that may be set in a CalloutFlags value.
const (
// CalloutStartMatch is set for the first callout after the start of
// matching for each new starting position in the subject.
CalloutStartMatch = CalloutFlags(lib.DPCRE2_CALLOUT_STARTMATCH)
// CalloutBacktrack is set if there has been a matching backtrack since the
// previous callout, or since the start of matching if this is the first
// callout from a pcre2_match() run.
CalloutBacktrack = CalloutFlags(lib.DPCRE2_CALLOUT_BACKTRACK)
)
// CalloutBlock contains the data passed to callout functions
// registered with SetCallout.
type CalloutBlock struct {
// Version contains the version number of the block format.
// The current version is 2.
Version uint32
// CalloutNumber contains the number of the callout, in the range 0-255.
// This is the number that follows "?C". For callouts with string arguments,
// this will always be zero.
CalloutNumber uint32
// CaptureTop contains the number of the highest numbered substring
// captured so far plus one. If no substrings have yet been captured,
// CaptureTop will be set to 1.
CaptureTop uint32
// CaptureLast contains the number of the last substring that was captured.
CaptureLast uint32
// Substrings contains all of the substrings captured so far.
Substrings []string
// Mark is not described in this file.
// NOTE(review): presumably the name of the most recently passed (*MARK),
// (*PRUNE), or (*THEN) item, as in pcre2_callout_block.mark — confirm.
Mark string
// Subject contains the string passed to the match function.
Subject string
// StartMatch contains the offset within the subject at which the current match attempt started.
StartMatch uint
// CurrentPosition contains the offset of the current match pointer within the subject.
CurrentPosition uint
// PatternPosition contains the offset within the pattern string to the next item to be matched.
PatternPosition uint
// NextItemLength contains the length of the next item to be processed in the pattern string.
NextItemLength uint
// CalloutStringOffset contains the code unit offset to the start of the callout argument string within the original pattern string.
CalloutStringOffset uint
// CalloutString is the string for the callout. For numerical callouts, this will always be empty.
CalloutString string
// CalloutFlags contains the following flags:
// CalloutStartMatch
// This is set for the first callout after the start of matching for each new starting position in the subject.
// CalloutBacktrack
// This is set if there has been a matching backtrack since the previous callout, or since the start of matching if this is the first callout from a pcre2_match() run.
//
// Both bits are set when a backtrack has caused a "bumpalong" to a new starting position in the subject.
CalloutFlags CalloutFlags
}
// SetCallout sets a callout function that will be called at specified points in the matching operation.
// fn should return zero if it ran successfully or a non-zero integer to force an error.
// See https://www.pcre.org/current/doc/html/pcre2callout.html for more information.
@@ -653,12 +603,15 @@ func (r *Regexp) SetCallout(fn func(cb *CalloutBlock) int32) error {
calloutStrBytes := unsafe.Slice((*byte)(unsafe.Pointer(ccb.Fcallout_string)), ccb.Fcallout_string_length)
cb.CalloutString = string(calloutStrBytes)
ovecSlice := unsafe.Slice((*lib.Tsize_t)(unsafe.Pointer(ccb.Foffset_vector)), (ccb.Fcapture_top*2)-1)[2:]
for i := 0; i < len(ovecSlice); i += 2 {
if i+1 >= len(ovecSlice) {
cb.Substrings = append(cb.Substrings, cb.Subject[ovecSlice[i]:])
} else {
cb.Substrings = append(cb.Substrings, cb.Subject[ovecSlice[i]:ovecSlice[i+1]])
ovecSlice := unsafe.Slice((*lib.Tsize_t)(unsafe.Pointer(ccb.Foffset_vector)), (ccb.Fcapture_top*2)-1)
if len(ovecSlice) > 2 {
ovecSlice = ovecSlice[2:]
for i := 0; i < len(ovecSlice); i += 2 {
if i+1 >= len(ovecSlice) {
cb.Substrings = append(cb.Substrings, cb.Subject[ovecSlice[i]:])
} else {
cb.Substrings = append(cb.Substrings, cb.Subject[ovecSlice[i]:ovecSlice[i+1]])
}
}
}

View File

@@ -4,6 +4,7 @@ import (
"strings"
"sync"
"testing"
"reflect"
"go.elara.ws/pcre"
)
@@ -289,3 +290,11 @@ func TestCallout(t *testing.T) {
t.Error("expected regular expression to match the string")
}
}
// TestVarnish checks that FindStringSubmatch reports an optional capture
// group that is absent from the subject as an empty string.
func TestVarnish(t *testing.T) {
	const subject = "1.1 varnish"
	want := []string{"varnish", ""}

	re := pcre.MustCompile(`varnish(?: \(Varnish\/([\d.]{1,250})\))?`)
	got := re.FindStringSubmatch(subject)
	if reflect.DeepEqual(got, want) {
		return
	}
	t.Errorf(`Expected ["varnish" ""], got %q`, got)
}

120
types.go
View File

@@ -6,33 +6,95 @@ type CompileOption uint32
// Compile option bits
const (
Anchored = CompileOption(lib.DPCRE2_ANCHORED)
AllowEmptyClass = CompileOption(lib.DPCRE2_ALLOW_EMPTY_CLASS)
AltBsux = CompileOption(lib.DPCRE2_ALT_BSUX)
AltCircumflex = CompileOption(lib.DPCRE2_ALT_CIRCUMFLEX)
AltVerbnames = CompileOption(lib.DPCRE2_ALT_VERBNAMES)
AutoCallout = CompileOption(lib.DPCRE2_AUTO_CALLOUT)
Caseless = CompileOption(lib.DPCRE2_CASELESS)
DollarEndOnly = CompileOption(lib.DPCRE2_DOLLAR_ENDONLY)
DotAll = CompileOption(lib.DPCRE2_DOTALL)
DupNames = CompileOption(lib.DPCRE2_DUPNAMES)
EndAnchored = CompileOption(lib.DPCRE2_ENDANCHORED)
Extended = CompileOption(lib.DPCRE2_EXTENDED)
FirstLine = CompileOption(lib.DPCRE2_FIRSTLINE)
Literal = CompileOption(lib.DPCRE2_LITERAL)
MatchInvalidUTF = CompileOption(lib.DPCRE2_MATCH_INVALID_UTF)
MactchUnsetBackref = CompileOption(lib.DPCRE2_MATCH_UNSET_BACKREF)
Multiline = CompileOption(lib.DPCRE2_MULTILINE)
NeverBackslashC = CompileOption(lib.DPCRE2_NEVER_BACKSLASH_C)
NeverUCP = CompileOption(lib.DPCRE2_NEVER_UCP)
NeverUTF = CompileOption(lib.DPCRE2_NEVER_UTF)
NoAutoCapture = CompileOption(lib.DPCRE2_NO_AUTO_CAPTURE)
NoAutoPossess = CompileOption(lib.DPCRE2_NO_AUTO_POSSESS)
NoDotStarAnchor = CompileOption(lib.DPCRE2_NO_DOTSTAR_ANCHOR)
NoStartOptimize = CompileOption(lib.DPCRE2_NO_START_OPTIMIZE)
NoUTFCheck = CompileOption(lib.DPCRE2_NO_UTF_CHECK)
UCP = CompileOption(lib.DPCRE2_UCP)
Ungreedy = CompileOption(lib.DPCRE2_UNGREEDY)
UseOffsetLimit = CompileOption(lib.DPCRE2_USE_OFFSET_LIMIT)
UTF = CompileOption(lib.DPCRE2_UTF)
Anchored = CompileOption(lib.DPCRE2_ANCHORED)
AllowEmptyClass = CompileOption(lib.DPCRE2_ALLOW_EMPTY_CLASS)
AltBsux = CompileOption(lib.DPCRE2_ALT_BSUX)
AltCircumflex = CompileOption(lib.DPCRE2_ALT_CIRCUMFLEX)
AltVerbnames = CompileOption(lib.DPCRE2_ALT_VERBNAMES)
AutoCallout = CompileOption(lib.DPCRE2_AUTO_CALLOUT)
Caseless = CompileOption(lib.DPCRE2_CASELESS)
DollarEndOnly = CompileOption(lib.DPCRE2_DOLLAR_ENDONLY)
DotAll = CompileOption(lib.DPCRE2_DOTALL)
DupNames = CompileOption(lib.DPCRE2_DUPNAMES)
EndAnchored = CompileOption(lib.DPCRE2_ENDANCHORED)
Extended = CompileOption(lib.DPCRE2_EXTENDED)
FirstLine = CompileOption(lib.DPCRE2_FIRSTLINE)
Literal = CompileOption(lib.DPCRE2_LITERAL)
MatchInvalidUTF = CompileOption(lib.DPCRE2_MATCH_INVALID_UTF)
MatchUnsetBackref = CompileOption(lib.DPCRE2_MATCH_UNSET_BACKREF)
Multiline = CompileOption(lib.DPCRE2_MULTILINE)
NeverBackslashC = CompileOption(lib.DPCRE2_NEVER_BACKSLASH_C)
NeverUCP = CompileOption(lib.DPCRE2_NEVER_UCP)
NeverUTF = CompileOption(lib.DPCRE2_NEVER_UTF)
NoAutoCapture = CompileOption(lib.DPCRE2_NO_AUTO_CAPTURE)
NoAutoPossess = CompileOption(lib.DPCRE2_NO_AUTO_POSSESS)
NoDotStarAnchor = CompileOption(lib.DPCRE2_NO_DOTSTAR_ANCHOR)
NoStartOptimize = CompileOption(lib.DPCRE2_NO_START_OPTIMIZE)
NoUTFCheck = CompileOption(lib.DPCRE2_NO_UTF_CHECK)
UCP = CompileOption(lib.DPCRE2_UCP)
Ungreedy = CompileOption(lib.DPCRE2_UNGREEDY)
UseOffsetLimit = CompileOption(lib.DPCRE2_USE_OFFSET_LIMIT)
UTF = CompileOption(lib.DPCRE2_UTF)
)
// CalloutFlags holds the flag bits exposed through the CalloutFlags field
// of CalloutBlock.
type CalloutFlags uint32
// Flag bits that may be set in a CalloutFlags value.
const (
// CalloutStartMatch is set for the first callout after the start of
// matching for each new starting position in the subject.
CalloutStartMatch = CalloutFlags(lib.DPCRE2_CALLOUT_STARTMATCH)
// CalloutBacktrack is set if there has been a matching backtrack since the
// previous callout, or since the start of matching if this is the first
// callout from a pcre2_match() run.
CalloutBacktrack = CalloutFlags(lib.DPCRE2_CALLOUT_BACKTRACK)
)
// CalloutBlock contains the data passed to callout functions.
type CalloutBlock struct {
// Version contains the version number of the block format.
// The current version is 2.
Version uint32
// CalloutNumber contains the number of the callout, in the range 0-255.
// This is the number that follows "?C". For callouts with string arguments,
// this will always be zero.
CalloutNumber uint32
// CaptureTop contains the number of the highest numbered substring
// captured so far plus one. If no substrings have yet been captured,
// CaptureTop will be set to 1.
CaptureTop uint32
// CaptureLast contains the number of the last substring that was captured.
CaptureLast uint32
// Substrings contains all of the substrings captured so far.
Substrings []string
// Mark is not described in this file.
// NOTE(review): presumably the name of the most recently passed (*MARK),
// (*PRUNE), or (*THEN) item, as in pcre2_callout_block.mark — confirm.
Mark string
// Subject contains the string passed to the match function.
Subject string
// StartMatch contains the offset within the subject at which the current match attempt started.
StartMatch uint
// CurrentPosition contains the offset of the current match pointer within the subject.
CurrentPosition uint
// PatternPosition contains the offset within the pattern string to the next item to be matched.
PatternPosition uint
// NextItemLength contains the length of the next item to be processed in the pattern string.
NextItemLength uint
// CalloutStringOffset contains the code unit offset to the start of the callout argument string within the original pattern string.
CalloutStringOffset uint
// CalloutString is the string for the callout. For numerical callouts, this will always be empty.
CalloutString string
// CalloutFlags contains the following flags:
// CalloutStartMatch
// This is set for the first callout after the start of matching for each new starting position in the subject.
// CalloutBacktrack
// This is set if there has been a matching backtrack since the previous callout, or since the start of matching if this is the first callout from a pcre2_match() run.
//
// Both bits are set when a backtrack has caused a "bumpalong" to a new starting position in the subject.
CalloutFlags CalloutFlags
}