Initial Commit

This commit is contained in:
2021-12-08 09:24:05 -08:00
commit c8bec472be
22 changed files with 3305 additions and 0 deletions

97
search/web/bing.go Normal file
View File

@@ -0,0 +1,97 @@
package web
import (
"net/http"
"strconv"
"github.com/PuerkitoBio/goquery"
)
// bingURL is the base Bing search endpoint, with the count=10 query parameter
// already set. Init copies it before adding the per-request parameters.
var bingURL = urlMustParse("https://www.bing.com/search?count=10")
// Bing is a search backend that fetches a Bing results page over HTTP and
// reads titles, links and descriptions out of the returned HTML.
type Bing struct {
// keyword is the search query, sent as the "q" parameter.
keyword string
// userAgent is the User-Agent header value; Init fills in a default when empty.
userAgent string
// first is the result offset sent as the "first" parameter.
first int
// doc is the parsed results page, populated by Init.
doc *goquery.Document
// initDone is set to true once Init has completed successfully.
initDone bool
// baseSel holds one element per search result (`#b_results > li`).
baseSel *goquery.Selection
}
// SetKeyword stores the query string that Init will send as the "q" parameter.
func (b *Bing) SetKeyword(keyword string) {
	b.keyword = keyword
}
// SetPage selects which page of results Init will request. Page 0 is the
// first page.
//
// Init sends first=1 for the opening page, so the "first" parameter is a
// 1-based result offset. With 10 results per page, page n therefore begins at
// result n*10+1. The previous page*10 made page 1 start at result 10, which
// repeated the last hit of page 0.
func (b *Bing) SetPage(page int) {
	const resultsPerPage = 10
	b.first = page*resultsPerPage + 1
}
// SetUserAgent stores the User-Agent header value to send. When it is left
// empty, Init substitutes a built-in browser User-Agent.
func (b *Bing) SetUserAgent(ua string) {
	b.userAgent = ua
}
// Init performs the search request against Bing and parses the response.
//
// It builds the request URL from a copy of bingURL, sends a GET through
// http.DefaultClient, parses the body as HTML and stores both the document and
// the per-result selection on the receiver. Any error from building the
// request, performing it, or parsing the body is returned unchanged.
//
// NOTE(review): the HTTP status code is not inspected, so an error page is
// parsed like any other response.
func (b *Bing) Init() error {
	// Non-positive offsets fall back to the first result.
	offset := "1"
	if b.first > 0 {
		offset = strconv.Itoa(b.first)
	}

	target := copyURL(bingURL)
	params := target.Query()
	params.Set("q", b.keyword)
	params.Set("first", offset)
	target.RawQuery = params.Encode()

	request, err := http.NewRequest(http.MethodGet, target.String(), nil)
	if err != nil {
		return err
	}

	// The default is written back to the receiver, so later calls reuse it.
	if b.userAgent == "" {
		b.userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
	}
	request.Header.Set("User-Agent", b.userAgent)

	response, err := http.DefaultClient.Do(request)
	if err != nil {
		return err
	}
	defer response.Body.Close()

	page, err := goquery.NewDocumentFromReader(response.Body)
	if err != nil {
		return err
	}

	b.doc = page
	b.baseSel = page.Find(`#b_results > li`)
	b.initDone = true
	return nil
}
// Each calls eachCb once per search result, passing the result's index in
// ascending order. Iteration stops at the first non-nil error, which is
// returned to the caller. Init must have populated the result selection first.
func (b *Bing) Each(eachCb func(int) error) error {
	idx := 0
	for idx < b.baseSel.Length() {
		if err := eachCb(idx); err != nil {
			return err
		}
		idx++
	}
	return nil
}
// Title returns the text of the first element inside the <h2> of result i.
// The error is always nil.
func (b *Bing) Title(i int) (string, error) {
	heading := get(b.baseSel, i).ChildrenFiltered("h2")
	anchor := heading.Children().First()
	return anchor.Text(), nil
}
// Link returns the href of the first element inside the <h2> of result i, or
// the empty string when that element has no href. The error is always nil.
func (b *Bing) Link(i int) (string, error) {
	heading := get(b.baseSel, i).ChildrenFiltered("h2")
	anchor := heading.Children().First()
	return anchor.AttrOr("href", ""), nil
}
// Desc returns the text of the last element inside the ".b_caption" child of
// result i. The error is always nil.
func (b *Bing) Desc(i int) (string, error) {
	caption := get(b.baseSel, i).ChildrenFiltered(".b_caption")
	snippet := caption.Children().Last()
	return snippet.Text(), nil
}
// Name returns the short lowercase identifier of this engine.
func (b *Bing) Name() string {
	const engineName = "bing"
	return engineName
}

103
search/web/ddg.go Normal file
View File

@@ -0,0 +1,103 @@
package web
import (
"net/http"
"strconv"
"strings"
"github.com/PuerkitoBio/goquery"
)
// ddgURL is the DuckDuckGo HTML search endpoint. Init copies it before adding
// the per-request parameters.
var ddgURL = urlMustParse("https://html.duckduckgo.com/html")
// uddgPrefix marks result hrefs that go through DuckDuckGo's redirect; Link
// extracts the "uddg" query parameter from such hrefs.
const uddgPrefix = "//duckduckgo.com/l/?uddg="
// DDG is a search backend that fetches a DuckDuckGo HTML results page and
// reads titles, links and descriptions out of the returned markup.
type DDG struct {
// keyword is the search query, sent as the "q" parameter.
keyword string
// userAgent is the User-Agent header value; Init fills in a default when empty.
userAgent string
// page is the result offset (not the page number) sent as the "s" parameter.
page int
// doc is the parsed results page, populated by Init.
doc *goquery.Document
// initDone is set to true once Init has completed successfully.
initDone bool
// baseSel holds one element per search result (`#links > .result`).
baseSel *goquery.Selection
}
// SetKeyword stores the query string that Init will send as the "q" parameter.
func (d *DDG) SetKeyword(keyword string) {
	d.keyword = keyword
}
// SetPage converts a page number into the result offset that Init sends to
// DuckDuckGo, using 30 results per page.
func (d *DDG) SetPage(page int) {
	const offsetPerPage = 30
	d.page = page * offsetPerPage
}
// SetUserAgent discards the supplied value and pins every DDG request to a
// fixed Opera User-Agent.
//
// NOTE(review): this looks like a deliberate override, presumably because the
// HTML endpoint's markup depends on the client. Confirm before honouring ua.
func (d *DDG) SetUserAgent(ua string) {
	const pinnedUserAgent = "Opera/9.80 (Windows NT 5.1; U; zh-tw) Presto/2.8.131 Version/11.10"
	d.userAgent = pinnedUserAgent
}
// Init performs the search request against DuckDuckGo's HTML endpoint and
// parses the response.
//
// It builds the request URL from a copy of ddgURL, sends a GET through
// http.DefaultClient, parses the body as HTML and stores both the document and
// the per-result selection on the receiver. Any error from building the
// request, performing it, or parsing the body is returned unchanged.
//
// NOTE(review): the HTTP status code is not inspected, so an error page is
// parsed like any other response.
func (d *DDG) Init() error {
	target := copyURL(ddgURL)
	params := target.Query()
	params.Set("q", d.keyword)
	// The paging parameters are sent only past the first page.
	if offset := d.page; offset > 0 {
		params.Set("s", strconv.Itoa(offset))
		params.Set("dc", strconv.Itoa(offset+1))
	}
	target.RawQuery = params.Encode()

	request, err := http.NewRequest(http.MethodGet, target.String(), nil)
	if err != nil {
		return err
	}

	// The default is written back to the receiver, so later calls reuse it.
	if d.userAgent == "" {
		d.userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
	}
	request.Header.Set("User-Agent", d.userAgent)

	response, err := http.DefaultClient.Do(request)
	if err != nil {
		return err
	}
	defer response.Body.Close()

	page, err := goquery.NewDocumentFromReader(response.Body)
	if err != nil {
		return err
	}

	d.doc = page
	d.baseSel = page.Find(`#links > .result`)
	d.initDone = true
	return nil
}
// Each calls eachCb once per search result, passing the result's index in
// ascending order. Iteration stops at the first non-nil error, which is
// returned to the caller. Init must have populated the result selection first.
func (d *DDG) Each(eachCb func(int) error) error {
	idx := 0
	for idx < d.baseSel.Length() {
		if err := eachCb(idx); err != nil {
			return err
		}
		idx++
	}
	return nil
}
// Title returns the whitespace-trimmed text of the <h2> found in the first
// child of result i. The error is always nil.
func (d *DDG) Title(i int) (string, error) {
	body := get(d.baseSel, i).Children().First()
	heading := body.ChildrenFiltered("h2").Text()
	return strings.TrimSpace(heading), nil
}
// Link returns the target URL of result i.
//
// The href is read from the <a> found in the first child of the result. When
// it points at DuckDuckGo's redirect (it starts with uddgPrefix), the real
// destination is taken from the "uddg" query parameter. If the redirect href
// cannot be parsed, the empty string is returned instead of panicking. The
// error is always nil.
func (d *DDG) Link(i int) (string, error) {
	link := get(d.baseSel, i).Children().First().ChildrenFiltered("a").AttrOr("href", "")
	if !strings.HasPrefix(link, uddgPrefix) {
		return link, nil
	}

	// urlMustParse drops the parse error and yields nil on failure. The href
	// comes from scraped HTML, so guard against a nil dereference here.
	redirect := urlMustParse(link)
	if redirect == nil {
		return "", nil
	}
	return redirect.Query().Get("uddg"), nil
}
// Desc returns the combined text of the <a> elements found in the first child
// of result i. The error is always nil.
func (d *DDG) Desc(i int) (string, error) {
	body := get(d.baseSel, i).Children().First()
	anchors := body.ChildrenFiltered("a")
	return anchors.Text(), nil
}
// Name returns the short lowercase identifier of this engine.
func (d *DDG) Name() string {
	const engineName = "ddg"
	return engineName
}

108
search/web/google.go Normal file
View File

@@ -0,0 +1,108 @@
package web
import (
"net/http"
"net/url"
"strconv"
"github.com/PuerkitoBio/goquery"
)
// googleURL is the base Google search endpoint. Init copies it before adding
// the per-request parameters.
var googleURL = urlMustParse("https://www.google.com/search")
// Google is a search backend that fetches a Google results page over HTTP and
// reads titles, links and descriptions out of the returned HTML.
type Google struct {
// keyword is the search query, sent as the "q" parameter.
keyword string
// userAgent is the User-Agent header value; Init fills in a default when empty.
userAgent string
// page is the result offset (not the page number) sent as the "start" parameter.
page int
// doc is the parsed results page, populated by Init.
doc *goquery.Document
// initDone is set to true once Init has completed successfully.
initDone bool
// baseSel holds the <h3> title element of each search result (`a > h3`).
baseSel *goquery.Selection
}
// SetKeyword stores the query string that Init will send as the "q" parameter.
func (g *Google) SetKeyword(keyword string) {
	g.keyword = keyword
}
// SetPage converts a page number into the result offset that Init sends as the
// "start" parameter, using 10 results per page.
func (g *Google) SetPage(page int) {
	const resultsPerPage = 10
	g.page = page * resultsPerPage
}
// SetUserAgent stores the User-Agent header value to send. When it is left
// empty, Init substitutes a built-in browser User-Agent.
func (g *Google) SetUserAgent(ua string) {
	g.userAgent = ua
}
// Init performs the search request against Google and parses the response.
//
// It builds the request URL from a copy of googleURL, sends a GET through
// http.DefaultClient, parses the body as HTML and stores both the document and
// the per-result selection on the receiver. Any error from building the
// request, performing it, or parsing the body is returned unchanged.
//
// NOTE(review): the HTTP status code is not inspected, so an error page is
// parsed like any other response.
func (g *Google) Init() error {
	target := copyURL(googleURL)
	params := target.Query()
	params.Set("q", g.keyword)
	params.Set("start", strconv.Itoa(g.page))
	target.RawQuery = params.Encode()

	request, err := http.NewRequest(http.MethodGet, target.String(), nil)
	if err != nil {
		return err
	}

	// The default is written back to the receiver, so later calls reuse it.
	if g.userAgent == "" {
		g.userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
	}
	request.Header.Set("User-Agent", g.userAgent)

	response, err := http.DefaultClient.Do(request)
	if err != nil {
		return err
	}
	defer response.Body.Close()

	page, err := goquery.NewDocumentFromReader(response.Body)
	if err != nil {
		return err
	}

	g.doc = page
	g.baseSel = page.Find(`a > h3`)
	g.initDone = true
	return nil
}
// Each calls eachCb once per search result, passing the result's index in
// ascending order. Iteration stops at the first non-nil error, which is
// returned to the caller. Init must have populated the result selection first.
func (g *Google) Each(eachCb func(int) error) error {
	idx := 0
	for idx < g.baseSel.Length() {
		if err := eachCb(idx); err != nil {
			return err
		}
		idx++
	}
	return nil
}
// Title returns the text of the <h3> element for result i. The error is
// always nil.
func (g *Google) Title(i int) (string, error) {
	heading := get(g.baseSel, i)
	return heading.Text(), nil
}
// Link returns the href of the element that wraps the <h3> of result i, or
// the empty string when it has no href. The error is always nil.
func (g *Google) Link(i int) (string, error) {
	anchor := get(g.baseSel, i).Parent()
	return anchor.AttrOr("href", ""), nil
}
// Desc returns the text of the element that follows the grandparent of the
// <h3> of result i. The error is always nil.
func (g *Google) Desc(i int) (string, error) {
	container := get(g.baseSel, i).Parent().Parent()
	snippet := container.Next()
	return snippet.Text(), nil
}
// Name returns the short lowercase identifier of this engine.
func (g *Google) Name() string {
	const engineName = "google"
	return engineName
}
// get narrows sel down to the single element at index i, returned as a new
// one-element selection.
func get(sel *goquery.Selection, i int) *goquery.Selection {
	start, end := i, i+1
	return sel.Slice(start, end)
}
// urlMustParse parses urlStr and returns the resulting URL.
//
// Despite the "Must" in its name it does not panic: a parse failure is
// swallowed and nil is returned, so callers passing untrusted input have to
// check the result for nil before using it.
func urlMustParse(urlStr string) *url.URL {
	parsed, err := url.Parse(urlStr)
	if err != nil {
		return nil
	}
	return parsed
}
func copyURL(orig *url.URL) *url.URL {
newURL := new(url.URL)
*newURL = *orig
return newURL
}

151
search/web/search.go Normal file
View File

@@ -0,0 +1,151 @@
package web
import (
"net/http"
"sort"
"sync"
"time"
"golang.org/x/sync/errgroup"
)
func init() {
http.DefaultClient.Timeout = 5 * time.Second
}
// Result is a single merged search hit returned by Search.
type Result struct {
// Title is the result's title as reported by the engine that first produced it.
Title string
// Link is the result's URL; Search treats results with equal links as the same hit.
Link string
// Desc is the result's description, truncated by Search when it is long.
Desc string
// Engines lists the names of the engines that returned this link.
Engines []string
// Rank orders the results, lower first. Search computes it as
// (engine position * 100) + result index and keeps the lowest value seen.
Rank int
}
// Engine is the contract a search backend must satisfy to be driven by Search.
// Search calls the setters first, then Init, then Each, and reads the
// individual fields from inside the Each callback.
type Engine interface {
// SetKeyword sets the search keyword for the engine.
SetKeyword(string)
// SetUserAgent sets the User-Agent to send. If the string is empty,
// an acceptable default should be used.
SetUserAgent(string)
// SetPage sets the page number to search.
SetPage(int)
// Init initializes the engine (makes requests, sets variables, etc.).
Init() error
// Each runs the given function once for each search result,
// passing it the result's index.
Each(func(int) error) error
// Title returns the title for an index given by Each().
Title(int) (string, error)
// Link returns the link for an index given by Each().
Link(int) (string, error)
// Desc returns the description for an index given by Each().
Desc(int) (string, error)
// Name returns the shortened name of the search engine.
// It should be lowercase (e.g. google, ddg, bing).
Name() string
}
// Options carries the per-search settings that Search applies to every engine.
type Options struct {
// Keyword is the search query, passed to Engine.SetKeyword.
Keyword string
// UserAgent is passed to Engine.SetUserAgent; empty asks the engine for its default.
UserAgent string
// Page is passed to Engine.SetPage.
// NOTE(review): appears to be zero-based (0 is the first page) — confirm with callers.
Page int
}
// Search runs the query described by opts against every engine concurrently
// and merges what they return into one list sorted by ascending Rank.
//
// Each engine is configured from opts, initialized, and then walked result by
// result. Results are keyed by link: when a link has already been recorded,
// the engine's name is added to that result's Engines and its Rank is lowered
// if the new sighting ranks better. Results lacking a title, link or
// description are dropped, and long descriptions are truncated.
//
// The first error reported by any engine is returned once all engines have
// finished. The results collected up to that point are still returned
// alongside it.
//
// All access to the shared result slice happens under a mutex, and the
// "already seen?" check is repeated inside the same critical section as the
// append, so two engines cannot insert the same link twice. Sorting happens
// once, after every goroutine has finished.
func Search(opts Options, engines ...Engine) ([]*Result, error) {
	const maxDescLen = 500

	var (
		outMtx sync.Mutex
		out    []*Result
	)

	wg := errgroup.Group{}
	for index, engine := range engines {
		// Per-iteration copies so each goroutine captures its own values.
		curIndex, curEngine := index, engine
		wg.Go(func() error {
			curEngine.SetKeyword(opts.Keyword)
			curEngine.SetUserAgent(opts.UserAgent)
			curEngine.SetPage(opts.Page)
			if err := curEngine.Init(); err != nil {
				return err
			}
			name := curEngine.Name()

			return curEngine.Each(func(i int) error {
				link, err := curEngine.Link(i)
				if err != nil {
					return err
				}
				rank := (curIndex * 100) + i

				// Fast path: a known link needs no title or description.
				outMtx.Lock()
				merged := mergeDuplicate(out, link, name, rank)
				outMtx.Unlock()
				if merged {
					return nil
				}

				// Engine lookups touch only the engine's own state, so they
				// run outside the lock.
				title, err := curEngine.Title(i)
				if err != nil {
					return err
				}
				desc, err := curEngine.Desc(i)
				if err != nil {
					return err
				}
				if title == "" || link == "" || desc == "" {
					return nil
				}
				desc = truncateDesc(desc, maxDescLen)

				outMtx.Lock()
				defer outMtx.Unlock()
				// Another engine may have added the link while the lock was
				// released, so check again before appending.
				if mergeDuplicate(out, link, name, rank) {
					return nil
				}
				out = append(out, &Result{
					Title:   title,
					Link:    link,
					Desc:    desc,
					Engines: []string{name},
					Rank:    rank,
				})
				return nil
			})
		})
	}

	err := wg.Wait()
	// Every goroutine has finished, so out can be sorted without the lock.
	sort.Slice(out, func(i, j int) bool {
		return out[i].Rank < out[j].Rank
	})
	return out, err
}

// mergeDuplicate folds a repeated sighting of link into the matching entry of
// results and reports whether such an entry existed. The caller must hold the
// lock that guards results.
func mergeDuplicate(results []*Result, link, engine string, rank int) bool {
	index, exists := linkExists(results, link)
	if !exists {
		return false
	}
	existing := results[index]

	// List each engine once, even if it returns the same link twice.
	listed := false
	for _, known := range existing.Engines {
		if known == engine {
			listed = true
			break
		}
	}
	if !listed {
		existing.Engines = append(existing.Engines, engine)
	}
	if rank < existing.Rank {
		existing.Rank = rank
	}
	return true
}

// truncateDesc shortens desc to at most limit bytes and appends "..." when it
// had to cut. The cut is placed on a rune boundary so a multi-byte character
// is never split.
func truncateDesc(desc string, limit int) string {
	if len(desc) <= limit {
		return desc
	}
	cut := 0
	// Ranging over a string yields the byte offset at which each rune starts.
	for offset := range desc {
		if offset > limit {
			break
		}
		cut = offset
	}
	return desc[:cut] + "..."
}
// linkExists scans results for an entry whose Link equals link. It returns
// the index of the first match and true, or -1 and false when there is none.
func linkExists(results []*Result, link string) (int, bool) {
	for pos := 0; pos < len(results); pos++ {
		if results[pos].Link != link {
			continue
		}
		return pos, true
	}
	return -1, false
}