-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsynonyms.go
More file actions
executable file
·104 lines (95 loc) · 3 KB
/
synonyms.go
File metadata and controls
executable file
·104 lines (95 loc) · 3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
package main
import (
_ "embed"
"strings"
)
// synonymExpansionWeight is the TF weight given to synonym-expanded tokens.
// Original query tokens keep their full weight; synonyms get this fraction,
// so expansion broadens recall without letting synonyms outrank the user's
// actual terms.
const synonymExpansionWeight = 0.3
// wordnetSynonyms is the pre-processed WordNet 3.1 synonym data.
// Generated by: go run ./cmd/gensynonyms /path/to/wordnet/dict > wordnet_synonyms.txt
// Format: one synonym group per line, tab-separated Porter-stemmed words.
//
//go:embed wordnet_synonyms.txt
var wordnetSynonyms string
// synonymIndex maps each stemmed term to all its synonyms (from all synsets
// containing that term). Built once at init(), never mutated — safe for
// concurrent reads. A word appearing in multiple WordNet synsets will have
// the union of all synonym peers across those synsets.
var synonymIndex map[string][]string
// init builds synonymIndex from the embedded WordNet data. Each non-empty
// line is a tab-separated synonym group; every member of a group becomes a
// synonym of every other member, with peers unioned across all groups a
// term appears in.
func init() {
	rawLines := strings.Split(wordnetSynonyms, "\n")
	// Pass 1: accumulate, for every term, the set of its peer terms.
	// Capacity hint assumes roughly two terms per line.
	peers := make(map[string]map[string]struct{}, len(rawLines)*2)
	for _, raw := range rawLines {
		trimmed := strings.TrimSpace(raw)
		if trimmed == "" {
			continue
		}
		members := strings.Split(trimmed, "\t")
		if len(members) < 2 {
			continue // a lone term has no synonyms to record
		}
		for _, m := range members {
			set := peers[m]
			if set == nil {
				set = make(map[string]struct{})
				peers[m] = set
			}
			for _, p := range members {
				if p != m {
					set[p] = struct{}{}
				}
			}
		}
	}
	// Pass 2: flatten each peer set into a slice for cheap iteration
	// at query time.
	synonymIndex = make(map[string][]string, len(peers))
	for term, set := range peers {
		list := make([]string, 0, len(set))
		for p := range set {
			list = append(list, p)
		}
		synonymIndex[term] = list
	}
}
// expandQueryVectorCorpus injects synonym tokens into a query vector, but only
// for synonyms that actually appear in the corpus (corpusTokens). This avoids
// polluting the query with broad WordNet synonyms that can't match anything.
// The input vector is never mutated; a new map is returned.
func expandQueryVectorCorpus(qVec map[string]float64, corpusTokens map[string]map[string]struct{}) map[string]float64 {
	// Start from a copy of the original vector so the caller's map is
	// untouched and original tokens keep their full weight.
	result := make(map[string]float64, len(qVec))
	for term, w := range qVec {
		result[term] = w
	}
	// Second pass: add each in-corpus synonym at a reduced weight, keeping
	// the maximum when multiple sources (an original token, or synonyms of
	// several tokens) contribute the same term.
	for term, w := range qVec {
		syns, ok := synonymIndex[term]
		if !ok {
			continue
		}
		for _, s := range syns {
			if _, present := corpusTokens[s]; !present {
				continue
			}
			if sw := w * synonymExpansionWeight; sw > result[s] {
				result[s] = sw
			}
		}
	}
	return result
}
// expandTokenSetCorpus expands a token set with synonym group members that
// actually appear in the corpus. The input set is not mutated; a new set
// containing the originals plus their in-corpus synonyms is returned.
func expandTokenSetCorpus(tokens map[string]struct{}, corpusTokens map[string]map[string]struct{}) map[string]struct{} {
	out := make(map[string]struct{}, len(tokens))
	for t := range tokens {
		// Original tokens are always kept.
		out[t] = struct{}{}
		// Pull in corpus-backed synonyms; set semantics make duplicate
		// insertions harmless, and a missing index entry ranges zero times.
		for _, s := range synonymIndex[t] {
			if _, present := corpusTokens[s]; present {
				out[s] = struct{}{}
			}
		}
	}
	return out
}