From fd58c9244e417e6263026a6ddb4966d35b258fe5 Mon Sep 17 00:00:00 2001 From: Thomas Stromberg Date: Fri, 1 May 2020 07:53:14 -0700 Subject: [PATCH] Similarity rewrite to improve latency and hit rate --- pkg/hubbub/hubbub.go | 11 +- pkg/hubbub/issue.go | 6 +- pkg/hubbub/pull_requests.go | 3 +- pkg/hubbub/search.go | 12 +-- pkg/hubbub/similar.go | 196 +++++++++++++++--------------------- 5 files changed, 88 insertions(+), 140 deletions(-) diff --git a/pkg/hubbub/hubbub.go b/pkg/hubbub/hubbub.go index dc297bc..5569657 100644 --- a/pkg/hubbub/hubbub.go +++ b/pkg/hubbub/hubbub.go @@ -16,6 +16,7 @@ package hubbub import ( + "sync" "time" "github.com/google/go-github/v31/github" @@ -51,14 +52,11 @@ type Engine struct { debugNumber int + titleToURLs sync.Map + similarTitles sync.Map + // indexes used for similarity matching seen map[string]*Conversation - // stored in-memory only - similarCache map[string][]string - similarCacheUpdated time.Time - - // when did we last see a new item? - lastItemUpdate time.Time // are stale results acceptable? acceptStaleResults bool @@ -72,7 +70,6 @@ func New(cfg Config) *Engine { memberRefresh: cfg.MemberRefresh, seen: map[string]*Conversation{}, - similarCache: map[string][]string{}, MinSimilarity: cfg.MinSimilarity, debugNumber: cfg.DebugNumber, } diff --git a/pkg/hubbub/issue.go b/pkg/hubbub/issue.go index 89b77c2..541874b 100644 --- a/pkg/hubbub/issue.go +++ b/pkg/hubbub/issue.go @@ -68,11 +68,7 @@ func (h *Engine) updateIssues(ctx context.Context, org string, project string, s continue } - if i.GetState() != state { - klog.Errorf("#%d: I asked for state %q, but got issue in %q - open a go-github bug!", i.GetNumber(), state, i.GetState()) - continue - } - + h.updateSimilarityTables(i.GetTitle(), i.GetHTMLURL()) allIssues = append(allIssues, i) } diff --git a/pkg/hubbub/pull_requests.go b/pkg/hubbub/pull_requests.go index 1ab9f17..610d779 100644 --- a/pkg/hubbub/pull_requests.go +++ b/pkg/hubbub/pull_requests.go @@ -67,6 +67,8 @@ func (h *Engine) updatePRs(ctx context.Context, org string, project string, stat break } } + + h.updateSimilarityTables(pr.GetTitle(), pr.GetHTMLURL()) allPRs = append(allPRs, pr) } @@ -80,7 +82,6 @@ func (h *Engine) updatePRs(ctx context.Context, org string, project string, stat klog.Errorf("set %q failed: %v", key, err) } - h.lastItemUpdate = time.Now() klog.V(1).Infof("updatePRs %s returning %d PRs", key, len(allPRs)) return allPRs, nil diff --git a/pkg/hubbub/search.go b/pkg/hubbub/search.go index b05a68e..82932a2 100644 --- a/pkg/hubbub/search.go +++ b/pkg/hubbub/search.go @@ -133,12 +133,9 @@ func (h *Engine) SearchIssues(ctx context.Context, org string, project string, f continue } - filtered = append(filtered, co) - } + co.Similar = h.FindSimilar(co) - // TODO: Make this only happen when caches are missed - if err := h.updateSimilarConversations(filtered); err != nil { - klog.Errorf("update similar: %v", err) + filtered = append(filtered, co) } klog.V(1).Infof("%d of %d issues within %s/%s matched filters %s", len(filtered), len(is), org, project, toYAML(fs)) @@ -226,11 +223,6 @@ func (h *Engine) SearchPullRequests(ctx context.Context, org string, project str filtered = append(filtered, co) } - // TODO: Make this only happen when caches are missed - if err := h.updateSimilarConversations(filtered); err != nil { - klog.Errorf("update similar: %v", err) - } - klog.V(1).Infof("%d of %d PR's within %s/%s matched filters:\n%s", len(filtered), len(prs), org, project, toYAML(fs)) return filtered, nil } diff --git a/pkg/hubbub/similar.go b/pkg/hubbub/similar.go index aac7d15..341814e 100644 --- a/pkg/hubbub/similar.go +++ b/pkg/hubbub/similar.go @@ -1,6 +1,7 @@ package hubbub import ( + "regexp" "time" "github.com/google/go-github/v31/github" @@ -8,6 +9,8 @@ import ( "k8s.io/klog/v2" ) +var nonLetter = regexp.MustCompile(`[^a-zA-Z]`) + // A subset of Conversation for related items (requires less memory than a Conversation) type RelatedConversation struct { Organization string `json:"org"` @@ -21,148 +24,107 @@ type RelatedConversation struct { Created time.Time `json:"created"` } -func related(c *Conversation) *RelatedConversation { - return &RelatedConversation{ - Organization: c.Organization, - Project: c.Project, - ID: c.ID, - - URL: c.URL, - Title: c.Title, - Author: c.Author, - Type: c.Type, - Created: c.Created, - } +func compressTitle(t string) string { + return nonLetter.ReplaceAllString(t, "") } -// updateSimilarConversations updates a slice of conversations with similar ones -func (h *Engine) updateSimilarConversations(cs []*Conversation) error { - klog.V(2).Infof("updating similar conversations for %d conversations ...", len(cs)) - start := time.Now() - found := 0 - defer func() { - klog.V(2).Infof("updated similar conversations for %d conversations within %s", found, time.Since(start)) - }() - - urls, err := h.cachedSimilarURLs() - if err != nil { - return err - } - - for _, c := range cs { - if len(urls[c.URL]) == 0 { - continue +func (h *Engine) updateSimilarityTables(rawTitle, url string) { + title := compressTitle(rawTitle) + + result, existing := h.titleToURLs.LoadOrStore(title, []string{url}) + if existing { + foundURL := false + otherURLs := []string{} + for _, v := range result.([]string) { + if v == url { + foundURL = true + break + } + otherURLs = append(otherURLs, v) } - c.Similar = []*RelatedConversation{} - for _, url := range urls[c.URL] { - c.Similar = append(c.Similar, related(h.seen[url])) + + if !foundURL { + klog.V(1).Infof("updating %q with %v", rawTitle, otherURLs) + h.titleToURLs.Store(title, append(otherURLs, url)) } - found++ + return } - return nil -} -// seenByTitle returns conversations by title -func (h *Engine) seenByTitle() map[string][]*Conversation { - byTitle := map[string][]*Conversation{} + klog.Infof("new title: %q", rawTitle) - for _, c := range h.seen { - _, ok := byTitle[c.Title] - if ok { - byTitle[c.Title] = append(byTitle[c.Title], c) - continue + // Update us -> them title similarity + similarTo := []string{} + + h.titleToURLs.Range(func(k, v interface{}) bool { + otherTitle := k.(string) + if otherTitle == title { + return true } - byTitle[c.Title] = []*Conversation{c} - } - return byTitle -} + if godice.CompareString(title, otherTitle) > h.MinSimilarity { + klog.Infof("%q is similar to %q", rawTitle, otherTitle) + similarTo = append(similarTo, otherTitle) + } + return true + }) -// cachedSimilarURLs returns similar URL's that may be cached -func (h *Engine) cachedSimilarURLs() (map[string][]string, error) { - if h.similarCacheUpdated.After(h.lastItemUpdate) { - return h.similarCache, nil - } + h.similarTitles.Store(title, similarTo) - sim, err := h.similarURLs() - if err != nil { - return sim, err + // Update them -> us title similarity + for _, other := range similarTo { + klog.V(1).Infof("updating %q to map to %s", other, title) + others, ok := h.similarTitles.Load(other) + if ok { + h.similarTitles.Store(other, append(others.([]string), title)) + } } - - h.similarCache = sim - h.similarCacheUpdated = time.Now() - return h.similarCache, nil } -// similarURL's returns issue URL's that are similar to one another - SLOW! -func (h *Engine) similarURLs() (map[string][]string, error) { - if h.MinSimilarity == 0 { - klog.Warningf("min similarity is 0") - return nil, nil - } - - klog.Infof("UPDATING SIMILARITY!") - start := time.Now() - defer func() { - klog.Infof("updateSimilar took %s", time.Since(start)) - }() +func (h *Engine) FindSimilar(co *Conversation) []*RelatedConversation { + simco := []*RelatedConversation{} + title := compressTitle(co.Title) + similarURLs := []string{} - byt := h.seenByTitle() - titles := []string{} - for k := range byt { - titles = append(titles, k) + tres, ok := h.similarTitles.Load(title) + if !ok { + return nil } - sim, err := similarTitles(titles, h.MinSimilarity) - if err != nil { - return nil, err + for _, ot := range tres.([]string) { + ures, ok := h.titleToURLs.Load(ot) + if ok { + similarURLs = append(similarURLs, ures.([]string)...) + } } - similar := map[string][]string{} - - for k, v := range sim { - for _, c := range byt[k] { - similar[c.URL] = []string{} + if len(similarURLs) == 0 { + return nil + } - // identical matches - for _, oc := range byt[k] { - if oc.URL != c.URL { - similar[c.URL] = append(similar[c.URL], oc.URL) - } - } + klog.V(2).Infof("#%d %q is similar to %v", co.ID, co.Title, similarURLs) - // similar matches - for _, otherTitle := range v { - for _, oc := range byt[otherTitle] { - if oc.URL != c.URL { - similar[c.URL] = append(similar[c.URL], oc.URL) - } - } - } + for _, url := range similarURLs { + oco := h.seen[url] + if oco == nil { + klog.Warningf("find similar: no conversation found for %s", url) + continue } + klog.V(2).Infof("found %s: %q", url, oco.Title) + simco = append(simco, makeRelated(h.seen[url])) } - - return similar, nil + return simco } -// similarTitles pairs together similar titles - INEFFICIENT -func similarTitles(titles []string, minSimilarity float64) (map[string][]string, error) { - st := map[string][]string{} - - for _, t := range titles { - matches, err := godice.CompareStrings(t, titles) - if err != nil { - return nil, err - } +func makeRelated(c *Conversation) *RelatedConversation { + return &RelatedConversation{ + Organization: c.Organization, + Project: c.Project, + ID: c.ID, - for _, match := range matches.Candidates { - if match.Score > minSimilarity { - if match.Text != t { - st[t] = append(st[t], match.Text) - } - } - } + URL: c.URL, + Title: c.Title, + Author: c.Author, + Type: c.Type, + Created: c.Created, } - - return st, nil }