Commit f915ef753b5caabd29923cf03ffba4276110ae95
1 parent: dbe04a0a
Exists in master and in 8 other branches
Added generate settings method
Showing 1 changed file with 274 additions and 268 deletions
Show diff stats
lib/searchkick/index_options.rb
... | ... | @@ -22,278 +22,13 @@ module Searchkick |
22 | 22 | settings = options[:settings] || {} |
23 | 23 | mappings = custom_mapping |
24 | 24 | else |
25 | - language = options[:language] | |
26 | - language = language.call if language.respond_to?(:call) | |
25 | + settings = generate_settings | |
26 | + | |
27 | + mapping = {} | |
27 | 28 | |
28 | 29 | keyword_mapping = {type: "keyword"} |
29 | 30 | keyword_mapping[:ignore_above] = options[:ignore_above] || 30000 |
30 | 31 | |
31 | - settings = { | |
32 | - analysis: { | |
33 | - analyzer: { | |
34 | - searchkick_keyword: { | |
35 | - type: "custom", | |
36 | - tokenizer: "keyword", | |
37 | - filter: ["lowercase"] + (options[:stem_conversions] ? ["searchkick_stemmer"] : []) | |
38 | - }, | |
39 | - default_analyzer => { | |
40 | - type: "custom", | |
41 | - # character filters -> tokenizer -> token filters | |
42 | - # https://www.elastic.co/guide/en/elasticsearch/guide/current/analysis-intro.html | |
43 | - char_filter: ["ampersand"], | |
44 | - tokenizer: "standard", | |
45 | - # synonym should come last, after stemming and shingle | |
46 | - # shingle must come before searchkick_stemmer | |
47 | - filter: ["lowercase", "asciifolding", "searchkick_index_shingle", "searchkick_stemmer"] | |
48 | - }, | |
49 | - searchkick_search: { | |
50 | - type: "custom", | |
51 | - char_filter: ["ampersand"], | |
52 | - tokenizer: "standard", | |
53 | - filter: ["lowercase", "asciifolding", "searchkick_search_shingle", "searchkick_stemmer"] | |
54 | - }, | |
55 | - searchkick_search2: { | |
56 | - type: "custom", | |
57 | - char_filter: ["ampersand"], | |
58 | - tokenizer: "standard", | |
59 | - filter: ["lowercase", "asciifolding", "searchkick_stemmer"] | |
60 | - }, | |
61 | - # https://github.com/leschenko/elasticsearch_autocomplete/blob/master/lib/elasticsearch_autocomplete/analyzers.rb | |
62 | - searchkick_autocomplete_search: { | |
63 | - type: "custom", | |
64 | - tokenizer: "keyword", | |
65 | - filter: ["lowercase", "asciifolding"] | |
66 | - }, | |
67 | - searchkick_word_search: { | |
68 | - type: "custom", | |
69 | - tokenizer: "standard", | |
70 | - filter: ["lowercase", "asciifolding"] | |
71 | - }, | |
72 | - searchkick_suggest_index: { | |
73 | - type: "custom", | |
74 | - tokenizer: "standard", | |
75 | - filter: ["lowercase", "asciifolding", "searchkick_suggest_shingle"] | |
76 | - }, | |
77 | - searchkick_text_start_index: { | |
78 | - type: "custom", | |
79 | - tokenizer: "keyword", | |
80 | - filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"] | |
81 | - }, | |
82 | - searchkick_text_middle_index: { | |
83 | - type: "custom", | |
84 | - tokenizer: "keyword", | |
85 | - filter: ["lowercase", "asciifolding", "searchkick_ngram"] | |
86 | - }, | |
87 | - searchkick_text_end_index: { | |
88 | - type: "custom", | |
89 | - tokenizer: "keyword", | |
90 | - filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"] | |
91 | - }, | |
92 | - searchkick_word_start_index: { | |
93 | - type: "custom", | |
94 | - tokenizer: "standard", | |
95 | - filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"] | |
96 | - }, | |
97 | - searchkick_word_middle_index: { | |
98 | - type: "custom", | |
99 | - tokenizer: "standard", | |
100 | - filter: ["lowercase", "asciifolding", "searchkick_ngram"] | |
101 | - }, | |
102 | - searchkick_word_end_index: { | |
103 | - type: "custom", | |
104 | - tokenizer: "standard", | |
105 | - filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"] | |
106 | - } | |
107 | - }, | |
108 | - filter: { | |
109 | - searchkick_index_shingle: { | |
110 | - type: "shingle", | |
111 | - token_separator: "" | |
112 | - }, | |
113 | - # lucky find https://web.archiveorange.com/archive/v/AAfXfQ17f57FcRINsof7 | |
114 | - searchkick_search_shingle: { | |
115 | - type: "shingle", | |
116 | - token_separator: "", | |
117 | - output_unigrams: false, | |
118 | - output_unigrams_if_no_shingles: true | |
119 | - }, | |
120 | - searchkick_suggest_shingle: { | |
121 | - type: "shingle", | |
122 | - max_shingle_size: 5 | |
123 | - }, | |
124 | - searchkick_edge_ngram: { | |
125 | - type: "edge_ngram", | |
126 | - min_gram: 1, | |
127 | - max_gram: 50 | |
128 | - }, | |
129 | - searchkick_ngram: { | |
130 | - type: "ngram", | |
131 | - min_gram: 1, | |
132 | - max_gram: 50 | |
133 | - }, | |
134 | - searchkick_stemmer: { | |
135 | - # use stemmer if language is lowercase, snowball otherwise | |
136 | - type: language == language.to_s.downcase ? "stemmer" : "snowball", | |
137 | - language: language || "English" | |
138 | - } | |
139 | - }, | |
140 | - char_filter: { | |
141 | - # https://www.elastic.co/guide/en/elasticsearch/guide/current/custom-analyzers.html | |
142 | - # &_to_and | |
143 | - ampersand: { | |
144 | - type: "mapping", | |
145 | - mappings: ["&=> and "] | |
146 | - } | |
147 | - } | |
148 | - } | |
149 | - } | |
150 | - | |
151 | - stem = options[:stem] | |
152 | - | |
153 | - case language | |
154 | - when "chinese" | |
155 | - settings[:analysis][:analyzer].merge!( | |
156 | - default_analyzer => { | |
157 | - type: "ik_smart" | |
158 | - }, | |
159 | - searchkick_search: { | |
160 | - type: "ik_smart" | |
161 | - }, | |
162 | - searchkick_search2: { | |
163 | - type: "ik_max_word" | |
164 | - } | |
165 | - ) | |
166 | - | |
167 | - stem = false | |
168 | - when "chinese2", "smartcn" | |
169 | - settings[:analysis][:analyzer].merge!( | |
170 | - default_analyzer => { | |
171 | - type: "smartcn" | |
172 | - }, | |
173 | - searchkick_search: { | |
174 | - type: "smartcn" | |
175 | - }, | |
176 | - searchkick_search2: { | |
177 | - type: "smartcn" | |
178 | - } | |
179 | - ) | |
180 | - | |
181 | - stem = false | |
182 | - when "japanese" | |
183 | - settings[:analysis][:analyzer].merge!( | |
184 | - default_analyzer => { | |
185 | - type: "kuromoji" | |
186 | - }, | |
187 | - searchkick_search: { | |
188 | - type: "kuromoji" | |
189 | - }, | |
190 | - searchkick_search2: { | |
191 | - type: "kuromoji" | |
192 | - } | |
193 | - ) | |
194 | - | |
195 | - stem = false | |
196 | - when "korean" | |
197 | - settings[:analysis][:analyzer].merge!( | |
198 | - default_analyzer => { | |
199 | - type: "openkoreantext-analyzer" | |
200 | - }, | |
201 | - searchkick_search: { | |
202 | - type: "openkoreantext-analyzer" | |
203 | - }, | |
204 | - searchkick_search2: { | |
205 | - type: "openkoreantext-analyzer" | |
206 | - } | |
207 | - ) | |
208 | - | |
209 | - stem = false | |
210 | - when "korean2" | |
211 | - settings[:analysis][:analyzer].merge!( | |
212 | - default_analyzer => { | |
213 | - type: "nori" | |
214 | - }, | |
215 | - searchkick_search: { | |
216 | - type: "nori" | |
217 | - }, | |
218 | - searchkick_search2: { | |
219 | - type: "nori" | |
220 | - } | |
221 | - ) | |
222 | - | |
223 | - stem = false | |
224 | - when "vietnamese" | |
225 | - settings[:analysis][:analyzer].merge!( | |
226 | - default_analyzer => { | |
227 | - type: "vi_analyzer" | |
228 | - }, | |
229 | - searchkick_search: { | |
230 | - type: "vi_analyzer" | |
231 | - }, | |
232 | - searchkick_search2: { | |
233 | - type: "vi_analyzer" | |
234 | - } | |
235 | - ) | |
236 | - | |
237 | - stem = false | |
238 | - when "polish", "ukrainian" | |
239 | - settings[:analysis][:analyzer].merge!( | |
240 | - default_analyzer => { | |
241 | - type: language | |
242 | - }, | |
243 | - searchkick_search: { | |
244 | - type: language | |
245 | - }, | |
246 | - searchkick_search2: { | |
247 | - type: language | |
248 | - } | |
249 | - ) | |
250 | - | |
251 | - stem = false | |
252 | - end | |
253 | - | |
254 | - if Searchkick.env == "test" | |
255 | - settings[:number_of_shards] = 1 | |
256 | - settings[:number_of_replicas] = 0 | |
257 | - end | |
258 | - | |
259 | - if options[:similarity] | |
260 | - settings[:similarity] = {default: {type: options[:similarity]}} | |
261 | - end | |
262 | - | |
263 | - unless below62 | |
264 | - settings[:index] = { | |
265 | - max_ngram_diff: 49, | |
266 | - max_shingle_diff: 4 | |
267 | - } | |
268 | - end | |
269 | - | |
270 | - if options[:case_sensitive] | |
271 | - settings[:analysis][:analyzer].each do |_, analyzer| | |
272 | - analyzer[:filter].delete("lowercase") | |
273 | - end | |
274 | - end | |
275 | - | |
276 | - if stem == false | |
277 | - settings[:analysis][:filter].delete(:searchkick_stemmer) | |
278 | - settings[:analysis][:analyzer].each do |_, analyzer| | |
279 | - analyzer[:filter].delete("searchkick_stemmer") if analyzer[:filter] | |
280 | - end | |
281 | - end | |
282 | - | |
283 | - settings = settings.symbolize_keys.deep_merge((options[:settings] || {}).symbolize_keys) | |
284 | - | |
285 | - add_synonyms(settings) | |
286 | - add_search_synonyms(settings) | |
287 | - add_wordnet(settings) if options[:wordnet] | |
288 | - | |
289 | - if options[:special_characters] == false | |
290 | - settings[:analysis][:analyzer].each_value do |analyzer_settings| | |
291 | - analyzer_settings[:filter].reject! { |f| f == "asciifolding" } | |
292 | - end | |
293 | - end | |
294 | - | |
295 | - mapping = {} | |
296 | - | |
297 | 32 | # conversions |
298 | 33 | Array(options[:conversions]).each do |conversions_field| |
299 | 34 | mapping[conversions_field] = { |
... | ... | @@ -427,6 +162,277 @@ module Searchkick |
427 | 162 | } |
428 | 163 | end |
429 | 164 | |
165 | + def generate_settings | |
166 | + language = options[:language] | |
167 | + language = language.call if language.respond_to?(:call) | |
168 | + | |
169 | + settings = { | |
170 | + analysis: { | |
171 | + analyzer: { | |
172 | + searchkick_keyword: { | |
173 | + type: "custom", | |
174 | + tokenizer: "keyword", | |
175 | + filter: ["lowercase"] + (options[:stem_conversions] ? ["searchkick_stemmer"] : []) | |
176 | + }, | |
177 | + default_analyzer => { | |
178 | + type: "custom", | |
179 | + # character filters -> tokenizer -> token filters | |
180 | + # https://www.elastic.co/guide/en/elasticsearch/guide/current/analysis-intro.html | |
181 | + char_filter: ["ampersand"], | |
182 | + tokenizer: "standard", | |
183 | + # synonym should come last, after stemming and shingle | |
184 | + # shingle must come before searchkick_stemmer | |
185 | + filter: ["lowercase", "asciifolding", "searchkick_index_shingle", "searchkick_stemmer"] | |
186 | + }, | |
187 | + searchkick_search: { | |
188 | + type: "custom", | |
189 | + char_filter: ["ampersand"], | |
190 | + tokenizer: "standard", | |
191 | + filter: ["lowercase", "asciifolding", "searchkick_search_shingle", "searchkick_stemmer"] | |
192 | + }, | |
193 | + searchkick_search2: { | |
194 | + type: "custom", | |
195 | + char_filter: ["ampersand"], | |
196 | + tokenizer: "standard", | |
197 | + filter: ["lowercase", "asciifolding", "searchkick_stemmer"] | |
198 | + }, | |
199 | + # https://github.com/leschenko/elasticsearch_autocomplete/blob/master/lib/elasticsearch_autocomplete/analyzers.rb | |
200 | + searchkick_autocomplete_search: { | |
201 | + type: "custom", | |
202 | + tokenizer: "keyword", | |
203 | + filter: ["lowercase", "asciifolding"] | |
204 | + }, | |
205 | + searchkick_word_search: { | |
206 | + type: "custom", | |
207 | + tokenizer: "standard", | |
208 | + filter: ["lowercase", "asciifolding"] | |
209 | + }, | |
210 | + searchkick_suggest_index: { | |
211 | + type: "custom", | |
212 | + tokenizer: "standard", | |
213 | + filter: ["lowercase", "asciifolding", "searchkick_suggest_shingle"] | |
214 | + }, | |
215 | + searchkick_text_start_index: { | |
216 | + type: "custom", | |
217 | + tokenizer: "keyword", | |
218 | + filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"] | |
219 | + }, | |
220 | + searchkick_text_middle_index: { | |
221 | + type: "custom", | |
222 | + tokenizer: "keyword", | |
223 | + filter: ["lowercase", "asciifolding", "searchkick_ngram"] | |
224 | + }, | |
225 | + searchkick_text_end_index: { | |
226 | + type: "custom", | |
227 | + tokenizer: "keyword", | |
228 | + filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"] | |
229 | + }, | |
230 | + searchkick_word_start_index: { | |
231 | + type: "custom", | |
232 | + tokenizer: "standard", | |
233 | + filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"] | |
234 | + }, | |
235 | + searchkick_word_middle_index: { | |
236 | + type: "custom", | |
237 | + tokenizer: "standard", | |
238 | + filter: ["lowercase", "asciifolding", "searchkick_ngram"] | |
239 | + }, | |
240 | + searchkick_word_end_index: { | |
241 | + type: "custom", | |
242 | + tokenizer: "standard", | |
243 | + filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"] | |
244 | + } | |
245 | + }, | |
246 | + filter: { | |
247 | + searchkick_index_shingle: { | |
248 | + type: "shingle", | |
249 | + token_separator: "" | |
250 | + }, | |
251 | + # lucky find https://web.archiveorange.com/archive/v/AAfXfQ17f57FcRINsof7 | |
252 | + searchkick_search_shingle: { | |
253 | + type: "shingle", | |
254 | + token_separator: "", | |
255 | + output_unigrams: false, | |
256 | + output_unigrams_if_no_shingles: true | |
257 | + }, | |
258 | + searchkick_suggest_shingle: { | |
259 | + type: "shingle", | |
260 | + max_shingle_size: 5 | |
261 | + }, | |
262 | + searchkick_edge_ngram: { | |
263 | + type: "edge_ngram", | |
264 | + min_gram: 1, | |
265 | + max_gram: 50 | |
266 | + }, | |
267 | + searchkick_ngram: { | |
268 | + type: "ngram", | |
269 | + min_gram: 1, | |
270 | + max_gram: 50 | |
271 | + }, | |
272 | + searchkick_stemmer: { | |
273 | + # use stemmer if language is lowercase, snowball otherwise | |
274 | + type: language == language.to_s.downcase ? "stemmer" : "snowball", | |
275 | + language: language || "English" | |
276 | + } | |
277 | + }, | |
278 | + char_filter: { | |
279 | + # https://www.elastic.co/guide/en/elasticsearch/guide/current/custom-analyzers.html | |
280 | + # &_to_and | |
281 | + ampersand: { | |
282 | + type: "mapping", | |
283 | + mappings: ["&=> and "] | |
284 | + } | |
285 | + } | |
286 | + } | |
287 | + } | |
288 | + | |
289 | + stem = options[:stem] | |
290 | + | |
291 | + case language | |
292 | + when "chinese" | |
293 | + settings[:analysis][:analyzer].merge!( | |
294 | + default_analyzer => { | |
295 | + type: "ik_smart" | |
296 | + }, | |
297 | + searchkick_search: { | |
298 | + type: "ik_smart" | |
299 | + }, | |
300 | + searchkick_search2: { | |
301 | + type: "ik_max_word" | |
302 | + } | |
303 | + ) | |
304 | + | |
305 | + stem = false | |
306 | + when "chinese2", "smartcn" | |
307 | + settings[:analysis][:analyzer].merge!( | |
308 | + default_analyzer => { | |
309 | + type: "smartcn" | |
310 | + }, | |
311 | + searchkick_search: { | |
312 | + type: "smartcn" | |
313 | + }, | |
314 | + searchkick_search2: { | |
315 | + type: "smartcn" | |
316 | + } | |
317 | + ) | |
318 | + | |
319 | + stem = false | |
320 | + when "japanese" | |
321 | + settings[:analysis][:analyzer].merge!( | |
322 | + default_analyzer => { | |
323 | + type: "kuromoji" | |
324 | + }, | |
325 | + searchkick_search: { | |
326 | + type: "kuromoji" | |
327 | + }, | |
328 | + searchkick_search2: { | |
329 | + type: "kuromoji" | |
330 | + } | |
331 | + ) | |
332 | + | |
333 | + stem = false | |
334 | + when "korean" | |
335 | + settings[:analysis][:analyzer].merge!( | |
336 | + default_analyzer => { | |
337 | + type: "openkoreantext-analyzer" | |
338 | + }, | |
339 | + searchkick_search: { | |
340 | + type: "openkoreantext-analyzer" | |
341 | + }, | |
342 | + searchkick_search2: { | |
343 | + type: "openkoreantext-analyzer" | |
344 | + } | |
345 | + ) | |
346 | + | |
347 | + stem = false | |
348 | + when "korean2" | |
349 | + settings[:analysis][:analyzer].merge!( | |
350 | + default_analyzer => { | |
351 | + type: "nori" | |
352 | + }, | |
353 | + searchkick_search: { | |
354 | + type: "nori" | |
355 | + }, | |
356 | + searchkick_search2: { | |
357 | + type: "nori" | |
358 | + } | |
359 | + ) | |
360 | + | |
361 | + stem = false | |
362 | + when "vietnamese" | |
363 | + settings[:analysis][:analyzer].merge!( | |
364 | + default_analyzer => { | |
365 | + type: "vi_analyzer" | |
366 | + }, | |
367 | + searchkick_search: { | |
368 | + type: "vi_analyzer" | |
369 | + }, | |
370 | + searchkick_search2: { | |
371 | + type: "vi_analyzer" | |
372 | + } | |
373 | + ) | |
374 | + | |
375 | + stem = false | |
376 | + when "polish", "ukrainian" | |
377 | + settings[:analysis][:analyzer].merge!( | |
378 | + default_analyzer => { | |
379 | + type: language | |
380 | + }, | |
381 | + searchkick_search: { | |
382 | + type: language | |
383 | + }, | |
384 | + searchkick_search2: { | |
385 | + type: language | |
386 | + } | |
387 | + ) | |
388 | + | |
389 | + stem = false | |
390 | + end | |
391 | + | |
392 | + if Searchkick.env == "test" | |
393 | + settings[:number_of_shards] = 1 | |
394 | + settings[:number_of_replicas] = 0 | |
395 | + end | |
396 | + | |
397 | + if options[:similarity] | |
398 | + settings[:similarity] = {default: {type: options[:similarity]}} | |
399 | + end | |
400 | + | |
401 | + unless below62 | |
402 | + settings[:index] = { | |
403 | + max_ngram_diff: 49, | |
404 | + max_shingle_diff: 4 | |
405 | + } | |
406 | + end | |
407 | + | |
408 | + if options[:case_sensitive] | |
409 | + settings[:analysis][:analyzer].each do |_, analyzer| | |
410 | + analyzer[:filter].delete("lowercase") | |
411 | + end | |
412 | + end | |
413 | + | |
414 | + if stem == false | |
415 | + settings[:analysis][:filter].delete(:searchkick_stemmer) | |
416 | + settings[:analysis][:analyzer].each do |_, analyzer| | |
417 | + analyzer[:filter].delete("searchkick_stemmer") if analyzer[:filter] | |
418 | + end | |
419 | + end | |
420 | + | |
421 | + settings = settings.symbolize_keys.deep_merge((options[:settings] || {}).symbolize_keys) | |
422 | + | |
423 | + add_synonyms(settings) | |
424 | + add_search_synonyms(settings) | |
425 | + add_wordnet(settings) if options[:wordnet] | |
426 | + | |
427 | + if options[:special_characters] == false | |
428 | + settings[:analysis][:analyzer].each_value do |analyzer_settings| | |
429 | + analyzer_settings[:filter].reject! { |f| f == "asciifolding" } | |
430 | + end | |
431 | + end | |
432 | + | |
433 | + settings | |
434 | + end | |
435 | + | |
430 | 436 | def add_synonyms(settings) |
431 | 437 | synonyms = options[:synonyms] || [] |
432 | 438 | synonyms = synonyms.call if synonyms.respond_to?(:call) | ... | ... |