Skip to content

Commit 394f12c

Browse files
committed
elastic additional char_filters
1 parent 761e1ef commit 394f12c

1 file changed

Lines changed: 43 additions & 3 deletions

File tree

scripts/elastic/db-js.json

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,20 @@
11
{
2+
"//": "to preserve special chars: http://www.fullscale.co/blog/2013/03/04/preserving_specific_characters_during_tokenizing_in_elasticsearch.html",
23
"settings": {
34
"number_of_shards": 1,
45
"number_of_replicas": 0,
56
"analysis": {
67
"analyzer": {
78
"default": {
89
"type": "custom",
9-
"char_filter": ["html_strip"],
10+
"char_filter": [
11+
"delimit-dot-inside-word",
12+
"make-all-headers-sentences",
13+
"remove-pre-code",
14+
"remove-style",
15+
"remove-script",
16+
"html_strip"
17+
],
1018
"tokenizer": "standard",
1119
"filter": [
1220
"lowercase",
@@ -15,8 +23,40 @@
1523
]
1624
}
1725
},
26+
"//": "char_filter is applied before tokenization",
27+
"//": "see the result: curl 'localhost:9200/js/_analyze?analyzer=default&pretty=true' -d 'win.resizeBy'",
28+
"char_filter": {
29+
"delimit-dot-inside-word" : {
30+
"//": "win.resizeBy is usually treated as a single word; add a space after the dot to make it 2 words",
31+
"type" : "pattern_replace",
32+
"pattern" : "(\\S)\\.(\\S)",
33+
"replacement" : "$1. $2"
34+
},
35+
"make-all-headers-sentences": {
36+
"//": "<h1>text</h1> will get <h1> & </h1> removed, so a dot is added at the end of the header text to make it look like a sentence",
37+
"type": "pattern_replace",
38+
"pattern": "(?i)([^.])(</h\\d>)",
39+
"replacement": "$1.$2"
40+
},
41+
"remove-pre-code": {
42+
"type": "pattern_replace",
43+
"pattern": "(?si)<pre class=\"(line-numbers|language-).*?</pre>",
44+
"replacement": ""
45+
},
46+
"remove-style": {
47+
"type": "pattern_replace",
48+
"pattern": "(?si)<style(\\s|>).*?</style>",
49+
"replacement": ""
50+
},
51+
"remove-script": {
52+
"type": "pattern_replace",
53+
"pattern": "(?si)<script(\\s|>).*?</script>",
54+
"replacement": ""
55+
}
56+
},
57+
"//": "stopwords are not used: they help performance, but make it impossible to search for phrases that contain them",
1858
"filter": {
19-
"ru_en_stopwords": {
59+
"ru_en_stopwords_unused": {
2060
"type": "stop",
2161
"stopwords": "а,без,более,бы,был,была,были,было,быть,в,вам,вас,весь,во,вот,все,всего,всех,вы,где,да,даже,для,до,его,ее,если,есть,еще,же,за,здесь,и,из,или,им,их,к,как,ко,когда,кто,ли,либо,мне,может,мы,на,надо,наш,не,него,нее,нет,ни,них,но,ну,о,об,однако,он,она,они,оно,от,очень,по,под,при,с,со,так,также,такой,там,те,тем,то,того,тоже,той,только,том,ты,у,уже,хотя,чего,чей,чем,что,чтобы,чье,чья,эта,эти,это,я,a,an,and,are,as,at,be,but,by,for,if,in,into,is,it,no,not,of,on,or,such,that,the,their,then,there,these,they,this,to,was,will,with"
2262
}
@@ -70,4 +110,4 @@
70110
}
71111
}
72112
}
73-
}
113+
}

0 commit comments

Comments
 (0)