-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmemory_bm25.go
More file actions
84 lines (78 loc) · 2.15 KB
/
memory_bm25.go
File metadata and controls
84 lines (78 loc) · 2.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
package main
import "math"
// BM25 tuning parameters, fixed at the standard default values.
const (
	// bm25B weights the document-length ratio (docLen / average docLen) inside
	// the length normalization term computed by bm25ScoreLocked.
	bm25B = 0.75
	// bm25K1 scales that normalization term in the denominator of each term's
	// contribution and sets the numerator factor (bm25K1 + 1).
	bm25K1 = 1.2
)
// bm25IdfLocked computes the BM25 inverse document frequency of tok as
// log((N - df + 0.5)/(df + 0.5) + 1), where N is s.totalDocs and df is
// s.tokenDocFreq[tok]. It returns 0 when N is not positive, and a negative
// logarithm is clamped to 0.
// Caller must hold at least a read lock.
func (s *MemoryStore) bm25IdfLocked(tok string) float64 {
	docCount := float64(s.totalDocs)
	if docCount <= 0 {
		return 0
	}
	docFreq := float64(s.tokenDocFreq[tok])
	ratio := (docCount - docFreq + 0.5) / (docFreq + 0.5)
	weight := math.Log(ratio + 1)
	if weight < 0 {
		// Tokens counted in more documents than N would otherwise score negative.
		return 0
	}
	return weight
}
// bm25MemSimilarityLocked scores how similar two stored chunks are under BM25.
// Each chunk's vector is scored as a query against the other chunk and the
// larger of the two directional scores is kept, which makes the measure
// symmetric. That raw value is then normalized as raw/(1+raw) so it can be
// compared against a threshold. A nil chunk or a non-positive raw score
// yields 0.
// Caller must hold at least a read lock.
func (s *MemoryStore) bm25MemSimilarityLocked(a, b *storedChunk) float64 {
	if a == nil || b == nil {
		return 0
	}
	best := s.bm25ScoreLocked(a.vector, b)
	if reverse := s.bm25ScoreLocked(b.vector, a); reverse > best {
		best = reverse
	}
	if best <= 0 {
		return 0
	}
	return best / (best + 1)
}
// bm25ScoreLocked scores doc against the weighted query vector qVec using BM25.
// Every query token with a positive term frequency tf in doc contributes
// qw * idf * tf*(k1+1) / (tf + k1*norm), where qw is the token's weight in
// qVec and norm = 1 - b + b*(doc.docLen/avgDocLen). Scaling by qw means
// synonym-expanded terms at reduced weight count proportionally less than
// original query terms.
// It returns 0 when the query or document vector is empty, doc is nil, the
// store has no documents or no positive total length, or doc.docLen is not
// positive.
// Caller must hold at least a read lock.
func (s *MemoryStore) bm25ScoreLocked(qVec map[string]float64, doc *storedChunk) float64 {
	switch {
	case len(qVec) == 0:
		return 0
	case doc == nil || len(doc.vector) == 0:
		return 0
	case s.totalDocs == 0 || s.totalDocLen <= 0:
		return 0
	}
	meanLen := s.totalDocLen / float64(s.totalDocs)
	if meanLen <= 0 || doc.docLen <= 0 {
		return 0
	}
	lengthFactor := 1 - bm25B + bm25B*(doc.docLen/meanLen)
	total := 0.0
	for term, weight := range qVec {
		// A term missing from the document reads as 0 and is skipped as well.
		tf := doc.vector[term]
		if tf <= 0 {
			continue
		}
		numerator := weight * s.bm25IdfLocked(term) * (tf * (bm25K1 + 1))
		total += numerator / (tf + bm25K1*lengthFactor)
	}
	return total
}
// docLenFromVector adds up every term frequency in vec, giving the total
// token count used as the document length for BM25. A nil or empty vector
// yields 0.
func docLenFromVector(vec map[string]float64) float64 {
	total := 0.0
	for _, count := range vec {
		total += count
	}
	return total
}