/* * distrohop - A utility for correlating and identifying equivalent software * packages across different Linux distributions * * Copyright (C) 2025 Elara Ivy * * This file is part of distrohop. * * distrohop is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * distrohop is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with distrohop. If not, see . */ package store import ( "encoding/gob" "errors" "fmt" "regexp" "slices" "strings" "sync" "time" "github.com/cespare/xxhash/v2" "github.com/cockroachdb/pebble" ) func init() { gob.Register(&xxhash.Digest{}) } var ErrInvalidTag = errors.New("invalid tag format") var ( // startChars is a list of all the possible package name starting characters startChars = [...]byte{ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', } // iterOpts contains iterator options with bounds defined such that // they cover all packages starting with each character defined // in startChars iterOpts = make([]*pebble.IterOptions, len(startChars)) // tagRegex validates the format of a given tag tagRegex = regexp.MustCompile(`\w+=.+`) ) func init() { // Populate the iterOpts slice for i, char := range startChars { iterOpts[i] = &pebble.IterOptions{ LowerBound: []byte{char}, UpperBound: []byte{char + 1}, } } } // TagResult represents the result of a tag search, including confidence and overlapping tags. type TagResult struct { // The confidence score for the tag match. This value will always be between 0 and 1. Confidence float32 // A list of overlapping tags Overlap []string // The package associated with the tag result Package Package } // Search searches for packages in the store that match the given tags. // Each tag must be in the format "key=value", and an error is returned // if any tag does not conform to this format. The function spawns multiple // worker goroutines (defined by s.SearchThreads) to perform a concurrent search. // The result is a list of [TagResult] structs representing the matching packages. func (s *Store) Search(tags []string) ([]TagResult, time.Duration, error) { start := time.Now() for _, tag := range tags { if !tagRegex.MatchString(tag) { return nil, 0, fmt.Errorf("%w: %q", ErrInvalidTag, tag) } } optsMtx := &sync.Mutex{} opts := iterOpts var results []TagResult resultsMtx := &sync.Mutex{} wg := &sync.WaitGroup{} errs := make(chan error) for range s.SearchThreads { wg.Add(1) go func() { defer wg.Done() for { optsMtx.Lock() if len(opts) == 0 { // If we have no more options structs left, // we can exit the goroutine optsMtx.Unlock() return } opt := opts[0] opts = opts[1:] optsMtx.Unlock() found := false if filter, err := s.GetFilter(opt.LowerBound[0]); err == nil { for _, tag := range tags { if filter.Lookup(unsafeBytes(tag)) { found = true break } } } else if !errors.Is(err, pebble.ErrNotFound) { errs <- err return } // Skip the current chunk if the bloom filter // doesn't contain any of the tags, or if it doesn't // exist, which indicates that there are no packages // with the starting character we're looking for. if !found { continue } // Create a new iterator that scans through the range defined in opt iter, err := s.db.NewIter(opt) if err != nil { errs <- err return } var out []TagResult for iter.First(); iter.Valid(); iter.Next() { val, err := iter.ValueAndErr() if err != nil { errs <- err iter.Close() return } // Convert the tag data to a string using an unsafe operation // so that we can split it by the unit separator character // and check if it has overlap without incurring the cost // of copying the value for a string conversion. // // If we find that there's overlap, we'll copy the data // later, before returning it. ptags := strings.Split(unsafeString(val), "\x1F") overlapTags, conf := overlap(tags, ptags) if conf == 0 { // If the confidence is zero, there's no overlap, // so we can continue to the next value continue } out = append(out, TagResult{ Confidence: conf, Overlap: overlapTags, Package: Package{ Name: string(iter.Key()), // We need to do a deep copy here because we previously // used an unsafe operation to convert the tag data to // a string, and the values created by that will be // invalidated when the iterator is closed. Tags: cloneStringSlice(ptags), }, }) } if err := iter.Error(); err != nil { errs <- err iter.Close() return } iter.Close() resultsMtx.Lock() results = append(results, out...) resultsMtx.Unlock() } }() } done := make(chan struct{}) go func() { wg.Wait() close(done) }() select { case err := <-errs: if err != nil { return nil, 0, err } case <-done: SortResults(results) return results, time.Since(start), nil } SortResults(results) return results, time.Since(start), nil } // SortResults sorts tag results by confidence func SortResults(results []TagResult) { slices.SortFunc(results, func(a, b TagResult) int { if a.Confidence < b.Confidence { return 1 } else if a.Confidence > b.Confidence { return -1 } else { return strings.Compare(a.Package.Name, b.Package.Name) } }) } // cloneStringSlice creates a deep copy of a slice of strings func cloneStringSlice(s []string) []string { out := make([]string, len(s)) for i := 0; i < len(s); i++ { out[i] = strings.Clone(s[i]) } return out }