Initial Commit

2025-02-12 19:33:11 -08:00
commit 76cbbad74a
43 changed files with 4970 additions and 0 deletions


@@ -0,0 +1,72 @@
/*
* distrohop - A utility for correlating and identifying equivalent software
* packages across different Linux distributions
*
* Copyright (C) 2025 Elara Ivy <elara@elara.ws>
*
* This file is part of distrohop.
*
* distrohop is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* distrohop is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with distrohop. If not, see <http://www.gnu.org/licenses/>.
*/
package cached
import (
"strings"
"time"
"github.com/patrickmn/go-cache"
"go.elara.ws/distrohop/internal/store"
)
var _ store.ReadOnly = (*Store)(nil)
// cacheRecord represents a single item stored in the cache
type cacheRecord struct {
results []store.TagResult
latency time.Duration
}
// Store represents a cached store that caches search results from [go.elara.ws/distrohop/internal/store.ReadOnly] instances.
// It implements [go.elara.ws/distrohop/internal/store.ReadOnly].
type Store struct {
store.ReadOnly
cache *cache.Cache
}
// New creates a new cached store with the provided cache settings and underlying store.
func New(s store.ReadOnly, exp, cleanup time.Duration) Store {
return Store{
ReadOnly: s,
cache: cache.New(exp, cleanup),
}
}
// Search retrieves cached search results for the given tags. If no cached entry exists
// for the tag set, it queries the underlying store and caches any non-empty results.
func (cs Store) Search(tags []string) ([]store.TagResult, time.Duration, error) {
cacheKey := strings.Join(tags, "\x1F")
if results, ok := cs.cache.Get(cacheKey); ok {
record := results.(cacheRecord)
return record.results, record.latency, nil
}
res, latency, err := cs.ReadOnly.Search(tags)
if err != nil {
return nil, 0, err
}
if len(res) != 0 {
cs.cache.Set(cacheKey, cacheRecord{res, latency}, cache.DefaultExpiration)
}
return res, latency, nil
}
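
The cached store is a thin wrapper, so a short usage sketch may help. The snippet below is not part of this commit; it assumes the package lives at internal/store/cached, an already-opened store.ReadOnly implementation called backing, and illustrative tag keys and expiration values.

package examples

import (
	"fmt"
	"time"

	"go.elara.ws/distrohop/internal/store"
	"go.elara.ws/distrohop/internal/store/cached"
)

// exampleCachedSearch wraps an existing read-only store in the cached store
// and runs the same search twice; the second call is served from the cache.
func exampleCachedSearch(backing store.ReadOnly) error {
	// Entries expire after 10 minutes; expired entries are swept every 15 minutes.
	cs := cached.New(backing, 10*time.Minute, 15*time.Minute)

	for i := 0; i < 2; i++ {
		results, latency, err := cs.Search([]string{"bin=vim", "desc=text editor"})
		if err != nil {
			return err
		}
		// latency is the underlying store's search time, replayed from the
		// cache record on the second iteration.
		fmt.Println(len(results), latency)
	}
	return nil
}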


@@ -0,0 +1,134 @@
/*
* distrohop - A utility for correlating and identifying equivalent software
* packages across different Linux distributions
*
* Copyright (C) 2025 Elara Ivy <elara@elara.ws>
*
* This file is part of distrohop.
*
* distrohop is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* distrohop is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with distrohop. If not, see <http://www.gnu.org/licenses/>.
*/
package combined
import (
"errors"
"fmt"
"slices"
"sync"
"time"
"github.com/cockroachdb/pebble"
"go.elara.ws/distrohop/internal/store"
"golang.org/x/sync/errgroup"
)
var _ store.ReadOnly = (*Store)(nil)
var ErrNotFound = errors.New("no such package")
// Store represents a combined store that aggregates multiple individual [go.elara.ws/distrohop/internal/store.ReadOnly] instances.
// It implements [go.elara.ws/distrohop/internal/store.ReadOnly].
type Store struct {
Stores []store.ReadOnly
}
// New creates a new combined store with the provided individual stores.
func New(stores ...store.ReadOnly) *Store {
return &Store{stores}
}
// Add adds a new store to the combined store.
func (cs *Store) Add(s store.ReadOnly) {
cs.Stores = append(cs.Stores, s)
}
// GetPkg retrieves a package by name from any of the stores in the combined store.
// If the package is not found in any store, it returns an error wrapping [ErrNotFound].
func (cs *Store) GetPkg(name string) (out store.Package, err error) {
mtx := &sync.Mutex{}
wg := &errgroup.Group{}
for _, s := range cs.Stores {
wg.Go(func() error {
if pkg, err := s.GetPkg(name); err == nil {
mtx.Lock()
out = pkg
mtx.Unlock()
} else if !errors.Is(err, pebble.ErrNotFound) {
return err
}
return nil
})
}
if err := wg.Wait(); err != nil {
return out, err
} else if out.Name == "" {
return out, fmt.Errorf("%w: %q", ErrNotFound, name)
} else {
return out, nil
}
}
// GetPkgNamesByPrefix retrieves package names that match the given prefix from all stores.
// It returns a slice of package names limited to the specified number n.
func (cs *Store) GetPkgNamesByPrefix(prefix string, n int) (out []string, err error) {
mtx := &sync.Mutex{}
wg := &errgroup.Group{}
for _, s := range cs.Stores {
wg.Go(func() error {
names, err := s.GetPkgNamesByPrefix(prefix, n)
if err != nil {
return err
}
mtx.Lock()
out = append(out, names...)
mtx.Unlock()
return nil
})
}
if err := wg.Wait(); err != nil {
return nil, err
}
slices.Sort(out)
if len(out) > n {
out = out[:n]
}
return out, nil
}
// Search searches for packages across all stores based on the provided tags.
// It returns the combined results, the summed latency of all underlying stores, and an error.
func (cs *Store) Search(tags []string) (out []store.TagResult, latency time.Duration, err error) {
mtx := &sync.Mutex{}
wg := &errgroup.Group{}
for _, s := range cs.Stores {
wg.Go(func() error {
results, dur, err := s.Search(tags)
if err != nil {
return err
}
mtx.Lock()
latency += dur
out = append(out, results...)
mtx.Unlock()
return nil
})
}
if err := wg.Wait(); err != nil {
return nil, latency, err
} else {
store.SortResults(out)
return out, latency, nil
}
}
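
A sketch of how the combined store might be assembled from several per-distribution stores and queried. The store variables, package name, and tags below are illustrative, not part of this commit.

package examples

import (
	"fmt"

	"go.elara.ws/distrohop/internal/store"
	"go.elara.ws/distrohop/internal/store/combined"
)

// exampleCombined aggregates two per-distribution stores and queries them.
func exampleCombined(arch, debian store.ReadOnly) error {
	cs := combined.New(arch, debian)

	// GetPkg fans out to every store concurrently; if no store has the
	// package, the error wraps combined.ErrNotFound.
	pkg, err := cs.GetPkg("firefox")
	if err != nil {
		return err
	}
	fmt.Println(pkg.Name, pkg.Tags)

	// Search merges and re-sorts results from all stores; the returned
	// duration is the sum of each store's individual search latency.
	results, latency, err := cs.Search([]string{"bin=firefox"})
	if err != nil {
		return err
	}
	fmt.Println(len(results), latency)
	return nil
}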

internal/store/search.go Normal file

@@ -0,0 +1,236 @@
/*
* distrohop - A utility for correlating and identifying equivalent software
* packages across different Linux distributions
*
* Copyright (C) 2025 Elara Ivy <elara@elara.ws>
*
* This file is part of distrohop.
*
* distrohop is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* distrohop is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with distrohop. If not, see <http://www.gnu.org/licenses/>.
*/
package store
import (
"encoding/gob"
"errors"
"fmt"
"regexp"
"slices"
"strings"
"sync"
"time"
"github.com/cespare/xxhash/v2"
"github.com/cockroachdb/pebble"
)
func init() {
gob.Register(&xxhash.Digest{})
}
var ErrInvalidTag = errors.New("invalid tag format")
var (
// startChars is a list of all the possible package name starting characters
startChars = [...]byte{
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
}
// iterOpts contains iterator options with bounds defined such that
// they cover all packages starting with each character defined
// in startChars
iterOpts = make([]*pebble.IterOptions, len(startChars))
// tagRegex validates the format of a given tag
tagRegex = regexp.MustCompile(`\w+=.+`)
)
func init() {
// Populate the iterOpts slice
for i, char := range startChars {
iterOpts[i] = &pebble.IterOptions{
LowerBound: []byte{char},
UpperBound: []byte{char + 1},
}
}
}
// TagResult represents the result of a tag search, including confidence and overlapping tags.
type TagResult struct {
// The confidence score for the tag match. This value will always be between 0 and 1.
Confidence float32
// A list of overlapping tags
Overlap []string
// The package associated with the tag result
Package Package
}
// Search searches for packages in the store that match the given tags.
// Each tag must be in the format "key=value", and an error is returned
// if any tag does not conform to this format. The function spawns multiple
// worker goroutines (defined by s.SearchThreads) to perform a concurrent search.
// The result is a list of [TagResult] structs representing the matching packages.
func (s *Store) Search(tags []string) ([]TagResult, time.Duration, error) {
start := time.Now()
for _, tag := range tags {
if !tagRegex.MatchString(tag) {
return nil, 0, fmt.Errorf("%w: %q", ErrInvalidTag, tag)
}
}
optsMtx := &sync.Mutex{}
opts := iterOpts
var results []TagResult
resultsMtx := &sync.Mutex{}
wg := &sync.WaitGroup{}
// Buffer the error channel so workers that fail after the first reported
// error can still send and exit rather than blocking forever.
errs := make(chan error, s.SearchThreads)
for range s.SearchThreads {
wg.Add(1)
go func() {
defer wg.Done()
for {
optsMtx.Lock()
if len(opts) == 0 {
// If we have no more options structs left,
// we can exit the goroutine
optsMtx.Unlock()
return
}
opt := opts[0]
opts = opts[1:]
optsMtx.Unlock()
found := false
if filter, err := s.GetFilter(opt.LowerBound[0]); err == nil {
for _, tag := range tags {
if filter.Lookup(unsafeBytes(tag)) {
found = true
break
}
}
} else if !errors.Is(err, pebble.ErrNotFound) {
errs <- err
return
}
// Skip the current chunk if the bloom filter
// doesn't contain any of the tags, or if it doesn't
// exist, which indicates that there are no packages
// with the starting character we're looking for.
if !found {
continue
}
// Create a new iterator that scans through the range defined in opt
iter, err := s.db.NewIter(opt)
if err != nil {
errs <- err
return
}
var out []TagResult
for iter.First(); iter.Valid(); iter.Next() {
val, err := iter.ValueAndErr()
if err != nil {
errs <- err
iter.Close()
return
}
// Convert the tag data to a string using an unsafe operation
// so that we can split it by the unit separator character
// and check if it has overlap without incurring the cost
// of copying the value for a string conversion.
//
// If we find that there's overlap, we'll copy the data
// later, before returning it.
ptags := strings.Split(unsafeString(val), "\x1F")
overlapTags, conf := overlap(tags, ptags)
if conf == 0 {
// If the confidence is zero, there's no overlap,
// so we can continue to the next value
continue
}
out = append(out, TagResult{
Confidence: conf,
Overlap: overlapTags,
Package: Package{
Name: string(iter.Key()),
// We need to do a deep copy here because we previously
// used an unsafe operation to convert the tag data to
// a string, and the values created by that will be
// invalidated when the iterator is closed.
Tags: cloneStringSlice(ptags),
},
})
}
if err := iter.Error(); err != nil {
errs <- err
iter.Close()
return
}
iter.Close()
resultsMtx.Lock()
results = append(results, out...)
resultsMtx.Unlock()
}
}()
}
done := make(chan struct{})
go func() {
wg.Wait()
close(done)
}()
select {
case err := <-errs:
if err != nil {
return nil, 0, err
}
case <-done:
SortResults(results)
return results, time.Since(start), nil
}
SortResults(results)
return results, time.Since(start), nil
}
// SortResults sorts tag results by confidence
func SortResults(results []TagResult) {
slices.SortFunc(results, func(a, b TagResult) int {
if a.Confidence < b.Confidence {
return 1
} else if a.Confidence > b.Confidence {
return -1
} else {
return strings.Compare(a.Package.Name, b.Package.Name)
}
})
}
// cloneStringSlice creates a deep copy of a slice of strings
func cloneStringSlice(s []string) []string {
out := make([]string, len(s))
for i := 0; i < len(s); i++ {
out[i] = strings.Clone(s[i])
}
return out
}
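
As a worked illustration of the confidence score: a search for three tags against a package carrying two of them yields Confidence 2/3 ≈ 0.67, with those two tags in Overlap. The sketch below (hypothetical tag keys and results, assumed to live inside this module) shows the expected call shape against an opened *Store.

package examples

import (
	"fmt"

	"go.elara.ws/distrohop/internal/store"
)

// exampleSearch runs a tag search against a single store.
func exampleSearch(s *store.Store) error {
	// Each tag must match the "key=value" shape checked by tagRegex;
	// anything else is rejected with ErrInvalidTag before any workers start.
	tags := []string{"bin=rg", "desc=grep", "lang=rust"}

	results, latency, err := s.Search(tags)
	if err != nil {
		return err
	}
	for _, r := range results {
		// A package carrying two of the three requested tags is reported with
		// Confidence 2/3 and those two tags listed in Overlap.
		fmt.Printf("%-30s %.2f %v\n", r.Package.Name, r.Confidence, r.Overlap)
	}
	fmt.Println("search took", latency)
	return nil
}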

internal/store/store.go Normal file

@@ -0,0 +1,389 @@
/*
* distrohop - A utility for correlating and identifying equivalent software
* packages across different Linux distributions
*
* Copyright (C) 2025 Elara Ivy <elara@elara.ws>
*
* This file is part of distrohop.
*
* distrohop is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* distrohop is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with distrohop. If not, see <http://www.gnu.org/licenses/>.
*/
package store
import (
"bytes"
"encoding/gob"
"encoding/json"
"errors"
"os"
"path/filepath"
"slices"
"strings"
"sync"
"time"
"unsafe"
"github.com/cespare/xxhash/v2"
"github.com/cockroachdb/pebble"
"github.com/zeebo/sbloom"
"go.elara.ws/distrohop/internal/index"
)
// ErrBlocked is returned when a store's database is being updated
var ErrBlocked = errors.New("database is being updated; please try again later")
func init() {
gob.Register(&xxhash.Digest{})
}
// Package represents a software package with a name and associated tags
type Package struct {
// The name of the package
Name string
// A list of tags associated with the package
Tags []string
}
type nopLogger struct{}
func (nopLogger) Infof(string, ...any) {}
func (nopLogger) Fatalf(string, ...any) {}
// ReadOnly represents a read-only package store
type ReadOnly interface {
GetPkg(name string) (Package, error)
GetPkgNamesByPrefix(prefix string, n int) ([]string, error)
Search(tags []string) ([]TagResult, time.Duration, error)
}
// Store represents persistent storage for package data
type Store struct {
Path string
db *pebble.DB
// blocked ensures that all other operations finish running
// before a [Store.Replace] operation, and that none can start
// until the replace operation is complete. [Store.Replace]
// takes a write lock on the RWMutex; all other operations take
// read locks via TryRLock and fail with [ErrBlocked] while the
// write lock is held.
blocked sync.RWMutex
// SearchThreads is the number of worker goroutines to be used
// for searching the database for a tag. The default is 4.
SearchThreads int
}
// Open initializes and opens a [Store] at the specified path
func Open(path string) (*Store, error) {
db, err := pebble.Open(path, &pebble.Options{Logger: nopLogger{}})
if err != nil {
return nil, err
}
return &Store{
Path: path,
db: db,
SearchThreads: 4,
}, err
}
// WriteBatch writes a batch of index records to the store.
// It merges existing tags with new ones and ensures they're unique.
func (s *Store) WriteBatch(batch map[string]index.Record, filters map[byte]*sbloom.Filter) error {
if !s.blocked.TryRLock() {
return ErrBlocked
}
defer s.blocked.RUnlock()
b := s.db.NewBatch()
defer b.Close()
for _, item := range batch {
if len(item.Name) == 0 || len(item.Tags) == 0 {
continue
}
key := unsafeBytes(item.Name)
curVal, cl, err := s.db.Get(key)
if err == pebble.ErrNotFound {
// Remove any duplicate tags
slices.Sort(item.Tags)
tags := slices.Compact(item.Tags)
// Write the new package to the database
err := b.Set(key, joinTags(item.Name[0], tags, filters), nil)
if err != nil {
return err
}
} else if err != nil {
return err
} else {
// Since the package already exists in the database, combine its existing
// tags with the ones we just got
tags := strings.Split(unsafeString(curVal), "\x1F")
tags = append(tags, item.Tags...)
// Remove any duplicate tags
slices.Sort(tags)
tags = slices.Compact(tags)
// Write the updated package to the database
err := b.Set(key, joinTags(item.Name[0], tags, filters), nil)
if err != nil {
cl.Close()
return err
}
cl.Close()
}
}
// Commit the batch to persistent storage
return b.Commit(nil)
}
// WriteFilters writes bloom filters for each package name starting character
// to the database.
func (s *Store) WriteFilters(filters map[byte]*sbloom.Filter) error {
if !s.blocked.TryRLock() {
return ErrBlocked
}
defer s.blocked.RUnlock()
for firstChar, filter := range filters {
data, err := filter.GobEncode()
if err != nil {
return err
}
err = s.db.Set([]byte{0x02, firstChar}, data, nil)
if err != nil {
return err
}
}
return nil
}
// GetFilter gets the bloom filter for the given first package name character
// from the database.
func (s *Store) GetFilter(firstChar byte) (*sbloom.Filter, error) {
if !s.blocked.TryRLock() {
return nil, ErrBlocked
}
defer s.blocked.RUnlock()
data, cl, err := s.db.Get([]byte{0x02, firstChar})
if err != nil {
return nil, err
}
defer cl.Close()
filter := &sbloom.Filter{}
err = filter.GobDecode(data)
if err != nil {
return nil, err
}
return filter, nil
}
// joinTags converts the given tags to bytes, joins them with \x1F as the separator,
// and updates the correct bloom filter for the first character of the package name.
func joinTags(firstChar byte, tags []string, filters map[byte]*sbloom.Filter) []byte {
if _, ok := filters[firstChar]; !ok {
filters[firstChar] = sbloom.NewFilter(xxhash.New(), 10)
}
out := &bytes.Buffer{}
for i, tag := range tags {
btag := unsafeBytes(tag)
filters[firstChar].Add(btag)
out.Write(btag)
if i != len(tags)-1 {
out.WriteByte(0x1F)
}
}
return out.Bytes()
}
// GetPkg retrieves a package from the store by its name
func (s *Store) GetPkg(name string) (Package, error) {
if !s.blocked.TryRLock() {
return Package{}, ErrBlocked
}
defer s.blocked.RUnlock()
data, cl, err := s.db.Get(unsafeBytes(name))
if err != nil {
return Package{}, err
}
defer cl.Close()
return Package{
Name: name,
Tags: strings.Split(string(data), "\x1F"),
}, nil
}
// GetPkgNamesByPrefix retrieves up to n package names that start with the given prefix.
func (s *Store) GetPkgNamesByPrefix(prefix string, n int) ([]string, error) {
if !s.blocked.TryRLock() {
return nil, ErrBlocked
}
defer s.blocked.RUnlock()
out := make([]string, 0, n)
iter, err := s.db.NewIter(&pebble.IterOptions{
LowerBound: unsafeBytes(prefix),
UpperBound: append(unsafeBytes(prefix[:len(prefix)-1]), prefix[len(prefix)-1]+1),
})
if err != nil {
return nil, err
}
defer iter.Close()
i := 0
for iter.First(); iter.Valid(); iter.Next() {
if i == n {
break
}
out = append(out, string(iter.Key()))
i++
}
if err := iter.Error(); err != nil {
return nil, err
}
return out, nil
}
// metaKey is the database key for repository metadata
var metaKey = []byte("\x02META")
// RepoMeta represents repository metadata
type RepoMeta struct {
ETag string
LastModified time.Time
}
// WriteMeta writes the repository metadata to the database
func (s *Store) WriteMeta(meta RepoMeta) error {
if !s.blocked.TryRLock() {
return ErrBlocked
}
defer s.blocked.RUnlock()
data, err := json.Marshal(meta)
if err != nil {
return err
}
return s.db.Set(metaKey, data, nil)
}
// GetMeta reads the repository metadata from the database
func (s *Store) GetMeta() (RepoMeta, error) {
if !s.blocked.TryRLock() {
return RepoMeta{}, ErrBlocked
}
defer s.blocked.RUnlock()
data, cl, err := s.db.Get(metaKey)
if err != nil {
return RepoMeta{}, err
}
defer cl.Close()
var meta RepoMeta
err = json.Unmarshal(data, &meta)
return meta, err
}
// Replace atomically replaces the database from s with the database from s2.
// The store is blocked during the replacement, causing any concurrent operations
// to fail with [ErrBlocked]. The replacement operation closes and moves s2's
// database, so s2 is no longer usable after this operation.
//
// This function attempts to roll back in case of partial failures. However, cleanup
// failures may result in leftover temporary files.
func (s *Store) Replace(s2 *Store) error {
// Clean up any leftover old db files. We don't need to lock at this
// point because concurrent operations are still safe to execute.
oldPath := filepath.Join(filepath.Dir(s.Path), "db-old")
if err := os.RemoveAll(oldPath); err != nil {
return err
}
// Do a write lock, which will prevent any new operations
// from executing and block until all existing operations
// complete.
s.blocked.Lock()
if err := s2.db.Close(); err != nil {
s.blocked.Unlock()
return err
}
if err := s.db.Close(); err != nil {
s.blocked.Unlock()
return err
}
if err := os.Rename(s.Path, oldPath); err != nil {
s.blocked.Unlock()
return err
}
if err := os.Rename(s2.Path, s.Path); err != nil {
s.blocked.Unlock()
return errors.Join(err, os.Rename(oldPath, s.Path))
}
db, err := pebble.Open(s.Path, &pebble.Options{Logger: nopLogger{}})
if err != nil {
s.blocked.Unlock()
return err
}
s.db = db
// We can unlock here even though there's more work to do because the replace
// operation itself is complete and concurrent operations are now safe to
// execute again.
s.blocked.Unlock()
return os.RemoveAll(oldPath)
}
// Close closes the underlying database
func (s *Store) Close() error {
if !s.blocked.TryRLock() {
return ErrBlocked
}
defer s.blocked.RUnlock()
return s.db.Close()
}
// overlap calculates the overlap between two sets of tags.
// It returns the list of overlapping tags and a confidence score.
func overlap(stags, ptags []string) ([]string, float32) {
var overlapTags []string
for _, stag := range stags {
if slices.Contains(ptags, stag) {
overlapTags = append(overlapTags, stag)
}
}
return overlapTags, float32(len(overlapTags)) / float32(len(stags))
}
// unsafeBytes converts a string to a byte slice using unsafe operations
func unsafeBytes(data string) []byte {
return unsafe.Slice(unsafe.StringData(data), len(data))
}
// unsafeString converts a byte slice to a string using unsafe operations
func unsafeString(data []byte) string {
return unsafe.String(unsafe.SliceData(data), len(data))
}
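
A sketch of the write-and-swap flow implied by this file: build a fresh store, write indexed records and their bloom filters, then atomically swap it in with Replace. The path, the record contents, and the assumption that index.Record exposes the Name and Tags fields used by WriteBatch are illustrative, not taken from this commit.

package examples

import (
	"github.com/zeebo/sbloom"

	"go.elara.ws/distrohop/internal/index"
	"go.elara.ws/distrohop/internal/store"
)

// refresh builds a replacement database, writes records and bloom filters to
// it, and atomically swaps it in for the live database.
func refresh(live *store.Store) error {
	fresh, err := store.Open("/var/lib/distrohop/db-new") // path is illustrative
	if err != nil {
		return err
	}

	// joinTags creates and fills the per-first-character bloom filters as
	// records are written, so the map can start out empty.
	filters := map[byte]*sbloom.Filter{}
	batch := map[string]index.Record{
		"neovim": {Name: "neovim", Tags: []string{"bin=nvim", "desc=text editor"}},
	}
	if err := fresh.WriteBatch(batch, filters); err != nil {
		return err
	}
	if err := fresh.WriteFilters(filters); err != nil {
		return err
	}

	// Replace blocks concurrent operations (they fail with ErrBlocked), swaps
	// the on-disk databases, and reopens live; fresh is unusable afterwards.
	return live.Replace(fresh)
}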