Commit f915ef753b5caabd29923cf03ffba4276110ae95

Authored by Andrew Kane
1 parent dbe04a0a

Added generate settings method

Showing 1 changed file with 274 additions and 268 deletions   Show diff stats
lib/searchkick/index_options.rb
@@ -22,278 +22,13 @@ module Searchkick @@ -22,278 +22,13 @@ module Searchkick
22 settings = options[:settings] || {} 22 settings = options[:settings] || {}
23 mappings = custom_mapping 23 mappings = custom_mapping
24 else 24 else
25 - language = options[:language]  
26 - language = language.call if language.respond_to?(:call) 25 + settings = generate_settings
  26 +
  27 + mapping = {}
27 28
28 keyword_mapping = {type: "keyword"} 29 keyword_mapping = {type: "keyword"}
29 keyword_mapping[:ignore_above] = options[:ignore_above] || 30000 30 keyword_mapping[:ignore_above] = options[:ignore_above] || 30000
30 31
31 - settings = {  
32 - analysis: {  
33 - analyzer: {  
34 - searchkick_keyword: {  
35 - type: "custom",  
36 - tokenizer: "keyword",  
37 - filter: ["lowercase"] + (options[:stem_conversions] ? ["searchkick_stemmer"] : [])  
38 - },  
39 - default_analyzer => {  
40 - type: "custom",  
41 - # character filters -> tokenizer -> token filters  
42 - # https://www.elastic.co/guide/en/elasticsearch/guide/current/analysis-intro.html  
43 - char_filter: ["ampersand"],  
44 - tokenizer: "standard",  
45 - # synonym should come last, after stemming and shingle  
46 - # shingle must come before searchkick_stemmer  
47 - filter: ["lowercase", "asciifolding", "searchkick_index_shingle", "searchkick_stemmer"]  
48 - },  
49 - searchkick_search: {  
50 - type: "custom",  
51 - char_filter: ["ampersand"],  
52 - tokenizer: "standard",  
53 - filter: ["lowercase", "asciifolding", "searchkick_search_shingle", "searchkick_stemmer"]  
54 - },  
55 - searchkick_search2: {  
56 - type: "custom",  
57 - char_filter: ["ampersand"],  
58 - tokenizer: "standard",  
59 - filter: ["lowercase", "asciifolding", "searchkick_stemmer"]  
60 - },  
61 - # https://github.com/leschenko/elasticsearch_autocomplete/blob/master/lib/elasticsearch_autocomplete/analyzers.rb  
62 - searchkick_autocomplete_search: {  
63 - type: "custom",  
64 - tokenizer: "keyword",  
65 - filter: ["lowercase", "asciifolding"]  
66 - },  
67 - searchkick_word_search: {  
68 - type: "custom",  
69 - tokenizer: "standard",  
70 - filter: ["lowercase", "asciifolding"]  
71 - },  
72 - searchkick_suggest_index: {  
73 - type: "custom",  
74 - tokenizer: "standard",  
75 - filter: ["lowercase", "asciifolding", "searchkick_suggest_shingle"]  
76 - },  
77 - searchkick_text_start_index: {  
78 - type: "custom",  
79 - tokenizer: "keyword",  
80 - filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"]  
81 - },  
82 - searchkick_text_middle_index: {  
83 - type: "custom",  
84 - tokenizer: "keyword",  
85 - filter: ["lowercase", "asciifolding", "searchkick_ngram"]  
86 - },  
87 - searchkick_text_end_index: {  
88 - type: "custom",  
89 - tokenizer: "keyword",  
90 - filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"]  
91 - },  
92 - searchkick_word_start_index: {  
93 - type: "custom",  
94 - tokenizer: "standard",  
95 - filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"]  
96 - },  
97 - searchkick_word_middle_index: {  
98 - type: "custom",  
99 - tokenizer: "standard",  
100 - filter: ["lowercase", "asciifolding", "searchkick_ngram"]  
101 - },  
102 - searchkick_word_end_index: {  
103 - type: "custom",  
104 - tokenizer: "standard",  
105 - filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"]  
106 - }  
107 - },  
108 - filter: {  
109 - searchkick_index_shingle: {  
110 - type: "shingle",  
111 - token_separator: ""  
112 - },  
113 - # lucky find https://web.archiveorange.com/archive/v/AAfXfQ17f57FcRINsof7  
114 - searchkick_search_shingle: {  
115 - type: "shingle",  
116 - token_separator: "",  
117 - output_unigrams: false,  
118 - output_unigrams_if_no_shingles: true  
119 - },  
120 - searchkick_suggest_shingle: {  
121 - type: "shingle",  
122 - max_shingle_size: 5  
123 - },  
124 - searchkick_edge_ngram: {  
125 - type: "edge_ngram",  
126 - min_gram: 1,  
127 - max_gram: 50  
128 - },  
129 - searchkick_ngram: {  
130 - type: "ngram",  
131 - min_gram: 1,  
132 - max_gram: 50  
133 - },  
134 - searchkick_stemmer: {  
135 - # use stemmer if language is lowercase, snowball otherwise  
136 - type: language == language.to_s.downcase ? "stemmer" : "snowball",  
137 - language: language || "English"  
138 - }  
139 - },  
140 - char_filter: {  
141 - # https://www.elastic.co/guide/en/elasticsearch/guide/current/custom-analyzers.html  
142 - # &_to_and  
143 - ampersand: {  
144 - type: "mapping",  
145 - mappings: ["&=> and "]  
146 - }  
147 - }  
148 - }  
149 - }  
150 -  
151 - stem = options[:stem]  
152 -  
153 - case language  
154 - when "chinese"  
155 - settings[:analysis][:analyzer].merge!(  
156 - default_analyzer => {  
157 - type: "ik_smart"  
158 - },  
159 - searchkick_search: {  
160 - type: "ik_smart"  
161 - },  
162 - searchkick_search2: {  
163 - type: "ik_max_word"  
164 - }  
165 - )  
166 -  
167 - stem = false  
168 - when "chinese2", "smartcn"  
169 - settings[:analysis][:analyzer].merge!(  
170 - default_analyzer => {  
171 - type: "smartcn"  
172 - },  
173 - searchkick_search: {  
174 - type: "smartcn"  
175 - },  
176 - searchkick_search2: {  
177 - type: "smartcn"  
178 - }  
179 - )  
180 -  
181 - stem = false  
182 - when "japanese"  
183 - settings[:analysis][:analyzer].merge!(  
184 - default_analyzer => {  
185 - type: "kuromoji"  
186 - },  
187 - searchkick_search: {  
188 - type: "kuromoji"  
189 - },  
190 - searchkick_search2: {  
191 - type: "kuromoji"  
192 - }  
193 - )  
194 -  
195 - stem = false  
196 - when "korean"  
197 - settings[:analysis][:analyzer].merge!(  
198 - default_analyzer => {  
199 - type: "openkoreantext-analyzer"  
200 - },  
201 - searchkick_search: {  
202 - type: "openkoreantext-analyzer"  
203 - },  
204 - searchkick_search2: {  
205 - type: "openkoreantext-analyzer"  
206 - }  
207 - )  
208 -  
209 - stem = false  
210 - when "korean2"  
211 - settings[:analysis][:analyzer].merge!(  
212 - default_analyzer => {  
213 - type: "nori"  
214 - },  
215 - searchkick_search: {  
216 - type: "nori"  
217 - },  
218 - searchkick_search2: {  
219 - type: "nori"  
220 - }  
221 - )  
222 -  
223 - stem = false  
224 - when "vietnamese"  
225 - settings[:analysis][:analyzer].merge!(  
226 - default_analyzer => {  
227 - type: "vi_analyzer"  
228 - },  
229 - searchkick_search: {  
230 - type: "vi_analyzer"  
231 - },  
232 - searchkick_search2: {  
233 - type: "vi_analyzer"  
234 - }  
235 - )  
236 -  
237 - stem = false  
238 - when "polish", "ukrainian"  
239 - settings[:analysis][:analyzer].merge!(  
240 - default_analyzer => {  
241 - type: language  
242 - },  
243 - searchkick_search: {  
244 - type: language  
245 - },  
246 - searchkick_search2: {  
247 - type: language  
248 - }  
249 - )  
250 -  
251 - stem = false  
252 - end  
253 -  
254 - if Searchkick.env == "test"  
255 - settings[:number_of_shards] = 1  
256 - settings[:number_of_replicas] = 0  
257 - end  
258 -  
259 - if options[:similarity]  
260 - settings[:similarity] = {default: {type: options[:similarity]}}  
261 - end  
262 -  
263 - unless below62  
264 - settings[:index] = {  
265 - max_ngram_diff: 49,  
266 - max_shingle_diff: 4  
267 - }  
268 - end  
269 -  
270 - if options[:case_sensitive]  
271 - settings[:analysis][:analyzer].each do |_, analyzer|  
272 - analyzer[:filter].delete("lowercase")  
273 - end  
274 - end  
275 -  
276 - if stem == false  
277 - settings[:analysis][:filter].delete(:searchkick_stemmer)  
278 - settings[:analysis][:analyzer].each do |_, analyzer|  
279 - analyzer[:filter].delete("searchkick_stemmer") if analyzer[:filter]  
280 - end  
281 - end  
282 -  
283 - settings = settings.symbolize_keys.deep_merge((options[:settings] || {}).symbolize_keys)  
284 -  
285 - add_synonyms(settings)  
286 - add_search_synonyms(settings)  
287 - add_wordnet(settings) if options[:wordnet]  
288 -  
289 - if options[:special_characters] == false  
290 - settings[:analysis][:analyzer].each_value do |analyzer_settings|  
291 - analyzer_settings[:filter].reject! { |f| f == "asciifolding" }  
292 - end  
293 - end  
294 -  
295 - mapping = {}  
296 -  
297 # conversions 32 # conversions
298 Array(options[:conversions]).each do |conversions_field| 33 Array(options[:conversions]).each do |conversions_field|
299 mapping[conversions_field] = { 34 mapping[conversions_field] = {
@@ -427,6 +162,277 @@ module Searchkick @@ -427,6 +162,277 @@ module Searchkick
427 } 162 }
428 end 163 end
429 164
  165 + def generate_settings
  166 + language = options[:language]
  167 + language = language.call if language.respond_to?(:call)
  168 +
  169 + settings = {
  170 + analysis: {
  171 + analyzer: {
  172 + searchkick_keyword: {
  173 + type: "custom",
  174 + tokenizer: "keyword",
  175 + filter: ["lowercase"] + (options[:stem_conversions] ? ["searchkick_stemmer"] : [])
  176 + },
  177 + default_analyzer => {
  178 + type: "custom",
  179 + # character filters -> tokenizer -> token filters
  180 + # https://www.elastic.co/guide/en/elasticsearch/guide/current/analysis-intro.html
  181 + char_filter: ["ampersand"],
  182 + tokenizer: "standard",
  183 + # synonym should come last, after stemming and shingle
  184 + # shingle must come before searchkick_stemmer
  185 + filter: ["lowercase", "asciifolding", "searchkick_index_shingle", "searchkick_stemmer"]
  186 + },
  187 + searchkick_search: {
  188 + type: "custom",
  189 + char_filter: ["ampersand"],
  190 + tokenizer: "standard",
  191 + filter: ["lowercase", "asciifolding", "searchkick_search_shingle", "searchkick_stemmer"]
  192 + },
  193 + searchkick_search2: {
  194 + type: "custom",
  195 + char_filter: ["ampersand"],
  196 + tokenizer: "standard",
  197 + filter: ["lowercase", "asciifolding", "searchkick_stemmer"]
  198 + },
  199 + # https://github.com/leschenko/elasticsearch_autocomplete/blob/master/lib/elasticsearch_autocomplete/analyzers.rb
  200 + searchkick_autocomplete_search: {
  201 + type: "custom",
  202 + tokenizer: "keyword",
  203 + filter: ["lowercase", "asciifolding"]
  204 + },
  205 + searchkick_word_search: {
  206 + type: "custom",
  207 + tokenizer: "standard",
  208 + filter: ["lowercase", "asciifolding"]
  209 + },
  210 + searchkick_suggest_index: {
  211 + type: "custom",
  212 + tokenizer: "standard",
  213 + filter: ["lowercase", "asciifolding", "searchkick_suggest_shingle"]
  214 + },
  215 + searchkick_text_start_index: {
  216 + type: "custom",
  217 + tokenizer: "keyword",
  218 + filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"]
  219 + },
  220 + searchkick_text_middle_index: {
  221 + type: "custom",
  222 + tokenizer: "keyword",
  223 + filter: ["lowercase", "asciifolding", "searchkick_ngram"]
  224 + },
  225 + searchkick_text_end_index: {
  226 + type: "custom",
  227 + tokenizer: "keyword",
  228 + filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"]
  229 + },
  230 + searchkick_word_start_index: {
  231 + type: "custom",
  232 + tokenizer: "standard",
  233 + filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"]
  234 + },
  235 + searchkick_word_middle_index: {
  236 + type: "custom",
  237 + tokenizer: "standard",
  238 + filter: ["lowercase", "asciifolding", "searchkick_ngram"]
  239 + },
  240 + searchkick_word_end_index: {
  241 + type: "custom",
  242 + tokenizer: "standard",
  243 + filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"]
  244 + }
  245 + },
  246 + filter: {
  247 + searchkick_index_shingle: {
  248 + type: "shingle",
  249 + token_separator: ""
  250 + },
  251 + # lucky find https://web.archiveorange.com/archive/v/AAfXfQ17f57FcRINsof7
  252 + searchkick_search_shingle: {
  253 + type: "shingle",
  254 + token_separator: "",
  255 + output_unigrams: false,
  256 + output_unigrams_if_no_shingles: true
  257 + },
  258 + searchkick_suggest_shingle: {
  259 + type: "shingle",
  260 + max_shingle_size: 5
  261 + },
  262 + searchkick_edge_ngram: {
  263 + type: "edge_ngram",
  264 + min_gram: 1,
  265 + max_gram: 50
  266 + },
  267 + searchkick_ngram: {
  268 + type: "ngram",
  269 + min_gram: 1,
  270 + max_gram: 50
  271 + },
  272 + searchkick_stemmer: {
  273 + # use stemmer if language is lowercase, snowball otherwise
  274 + type: language == language.to_s.downcase ? "stemmer" : "snowball",
  275 + language: language || "English"
  276 + }
  277 + },
  278 + char_filter: {
  279 + # https://www.elastic.co/guide/en/elasticsearch/guide/current/custom-analyzers.html
  280 + # &_to_and
  281 + ampersand: {
  282 + type: "mapping",
  283 + mappings: ["&=> and "]
  284 + }
  285 + }
  286 + }
  287 + }
  288 +
  289 + stem = options[:stem]
  290 +
  291 + case language
  292 + when "chinese"
  293 + settings[:analysis][:analyzer].merge!(
  294 + default_analyzer => {
  295 + type: "ik_smart"
  296 + },
  297 + searchkick_search: {
  298 + type: "ik_smart"
  299 + },
  300 + searchkick_search2: {
  301 + type: "ik_max_word"
  302 + }
  303 + )
  304 +
  305 + stem = false
  306 + when "chinese2", "smartcn"
  307 + settings[:analysis][:analyzer].merge!(
  308 + default_analyzer => {
  309 + type: "smartcn"
  310 + },
  311 + searchkick_search: {
  312 + type: "smartcn"
  313 + },
  314 + searchkick_search2: {
  315 + type: "smartcn"
  316 + }
  317 + )
  318 +
  319 + stem = false
  320 + when "japanese"
  321 + settings[:analysis][:analyzer].merge!(
  322 + default_analyzer => {
  323 + type: "kuromoji"
  324 + },
  325 + searchkick_search: {
  326 + type: "kuromoji"
  327 + },
  328 + searchkick_search2: {
  329 + type: "kuromoji"
  330 + }
  331 + )
  332 +
  333 + stem = false
  334 + when "korean"
  335 + settings[:analysis][:analyzer].merge!(
  336 + default_analyzer => {
  337 + type: "openkoreantext-analyzer"
  338 + },
  339 + searchkick_search: {
  340 + type: "openkoreantext-analyzer"
  341 + },
  342 + searchkick_search2: {
  343 + type: "openkoreantext-analyzer"
  344 + }
  345 + )
  346 +
  347 + stem = false
  348 + when "korean2"
  349 + settings[:analysis][:analyzer].merge!(
  350 + default_analyzer => {
  351 + type: "nori"
  352 + },
  353 + searchkick_search: {
  354 + type: "nori"
  355 + },
  356 + searchkick_search2: {
  357 + type: "nori"
  358 + }
  359 + )
  360 +
  361 + stem = false
  362 + when "vietnamese"
  363 + settings[:analysis][:analyzer].merge!(
  364 + default_analyzer => {
  365 + type: "vi_analyzer"
  366 + },
  367 + searchkick_search: {
  368 + type: "vi_analyzer"
  369 + },
  370 + searchkick_search2: {
  371 + type: "vi_analyzer"
  372 + }
  373 + )
  374 +
  375 + stem = false
  376 + when "polish", "ukrainian"
  377 + settings[:analysis][:analyzer].merge!(
  378 + default_analyzer => {
  379 + type: language
  380 + },
  381 + searchkick_search: {
  382 + type: language
  383 + },
  384 + searchkick_search2: {
  385 + type: language
  386 + }
  387 + )
  388 +
  389 + stem = false
  390 + end
  391 +
  392 + if Searchkick.env == "test"
  393 + settings[:number_of_shards] = 1
  394 + settings[:number_of_replicas] = 0
  395 + end
  396 +
  397 + if options[:similarity]
  398 + settings[:similarity] = {default: {type: options[:similarity]}}
  399 + end
  400 +
  401 + unless below62
  402 + settings[:index] = {
  403 + max_ngram_diff: 49,
  404 + max_shingle_diff: 4
  405 + }
  406 + end
  407 +
  408 + if options[:case_sensitive]
  409 + settings[:analysis][:analyzer].each do |_, analyzer|
  410 + analyzer[:filter].delete("lowercase")
  411 + end
  412 + end
  413 +
  414 + if stem == false
  415 + settings[:analysis][:filter].delete(:searchkick_stemmer)
  416 + settings[:analysis][:analyzer].each do |_, analyzer|
  417 + analyzer[:filter].delete("searchkick_stemmer") if analyzer[:filter]
  418 + end
  419 + end
  420 +
  421 + settings = settings.symbolize_keys.deep_merge((options[:settings] || {}).symbolize_keys)
  422 +
  423 + add_synonyms(settings)
  424 + add_search_synonyms(settings)
  425 + add_wordnet(settings) if options[:wordnet]
  426 +
  427 + if options[:special_characters] == false
  428 + settings[:analysis][:analyzer].each_value do |analyzer_settings|
  429 + analyzer_settings[:filter].reject! { |f| f == "asciifolding" }
  430 + end
  431 + end
  432 +
  433 + settings
  434 + end
  435 +
430 def add_synonyms(settings) 436 def add_synonyms(settings)
431 synonyms = options[:synonyms] || [] 437 synonyms = options[:synonyms] || []
432 synonyms = synonyms.call if synonyms.respond_to?(:call) 438 synonyms = synonyms.call if synonyms.respond_to?(:call)