Commit f915ef753b5caabd29923cf03ffba4276110ae95
1 parent: dbe04a0a
Exists in master and in 8 other branches
Added generate_settings method
Showing 1 changed file with 274 additions and 268 deletions
Show diff stats
lib/searchkick/index_options.rb
@@ -22,278 +22,13 @@ module Searchkick | @@ -22,278 +22,13 @@ module Searchkick | ||
22 | settings = options[:settings] || {} | 22 | settings = options[:settings] || {} |
23 | mappings = custom_mapping | 23 | mappings = custom_mapping |
24 | else | 24 | else |
25 | - language = options[:language] | ||
26 | - language = language.call if language.respond_to?(:call) | 25 | + settings = generate_settings |
26 | + | ||
27 | + mapping = {} | ||
27 | 28 | ||
28 | keyword_mapping = {type: "keyword"} | 29 | keyword_mapping = {type: "keyword"} |
29 | keyword_mapping[:ignore_above] = options[:ignore_above] || 30000 | 30 | keyword_mapping[:ignore_above] = options[:ignore_above] || 30000 |
30 | 31 | ||
31 | - settings = { | ||
32 | - analysis: { | ||
33 | - analyzer: { | ||
34 | - searchkick_keyword: { | ||
35 | - type: "custom", | ||
36 | - tokenizer: "keyword", | ||
37 | - filter: ["lowercase"] + (options[:stem_conversions] ? ["searchkick_stemmer"] : []) | ||
38 | - }, | ||
39 | - default_analyzer => { | ||
40 | - type: "custom", | ||
41 | - # character filters -> tokenizer -> token filters | ||
42 | - # https://www.elastic.co/guide/en/elasticsearch/guide/current/analysis-intro.html | ||
43 | - char_filter: ["ampersand"], | ||
44 | - tokenizer: "standard", | ||
45 | - # synonym should come last, after stemming and shingle | ||
46 | - # shingle must come before searchkick_stemmer | ||
47 | - filter: ["lowercase", "asciifolding", "searchkick_index_shingle", "searchkick_stemmer"] | ||
48 | - }, | ||
49 | - searchkick_search: { | ||
50 | - type: "custom", | ||
51 | - char_filter: ["ampersand"], | ||
52 | - tokenizer: "standard", | ||
53 | - filter: ["lowercase", "asciifolding", "searchkick_search_shingle", "searchkick_stemmer"] | ||
54 | - }, | ||
55 | - searchkick_search2: { | ||
56 | - type: "custom", | ||
57 | - char_filter: ["ampersand"], | ||
58 | - tokenizer: "standard", | ||
59 | - filter: ["lowercase", "asciifolding", "searchkick_stemmer"] | ||
60 | - }, | ||
61 | - # https://github.com/leschenko/elasticsearch_autocomplete/blob/master/lib/elasticsearch_autocomplete/analyzers.rb | ||
62 | - searchkick_autocomplete_search: { | ||
63 | - type: "custom", | ||
64 | - tokenizer: "keyword", | ||
65 | - filter: ["lowercase", "asciifolding"] | ||
66 | - }, | ||
67 | - searchkick_word_search: { | ||
68 | - type: "custom", | ||
69 | - tokenizer: "standard", | ||
70 | - filter: ["lowercase", "asciifolding"] | ||
71 | - }, | ||
72 | - searchkick_suggest_index: { | ||
73 | - type: "custom", | ||
74 | - tokenizer: "standard", | ||
75 | - filter: ["lowercase", "asciifolding", "searchkick_suggest_shingle"] | ||
76 | - }, | ||
77 | - searchkick_text_start_index: { | ||
78 | - type: "custom", | ||
79 | - tokenizer: "keyword", | ||
80 | - filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"] | ||
81 | - }, | ||
82 | - searchkick_text_middle_index: { | ||
83 | - type: "custom", | ||
84 | - tokenizer: "keyword", | ||
85 | - filter: ["lowercase", "asciifolding", "searchkick_ngram"] | ||
86 | - }, | ||
87 | - searchkick_text_end_index: { | ||
88 | - type: "custom", | ||
89 | - tokenizer: "keyword", | ||
90 | - filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"] | ||
91 | - }, | ||
92 | - searchkick_word_start_index: { | ||
93 | - type: "custom", | ||
94 | - tokenizer: "standard", | ||
95 | - filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"] | ||
96 | - }, | ||
97 | - searchkick_word_middle_index: { | ||
98 | - type: "custom", | ||
99 | - tokenizer: "standard", | ||
100 | - filter: ["lowercase", "asciifolding", "searchkick_ngram"] | ||
101 | - }, | ||
102 | - searchkick_word_end_index: { | ||
103 | - type: "custom", | ||
104 | - tokenizer: "standard", | ||
105 | - filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"] | ||
106 | - } | ||
107 | - }, | ||
108 | - filter: { | ||
109 | - searchkick_index_shingle: { | ||
110 | - type: "shingle", | ||
111 | - token_separator: "" | ||
112 | - }, | ||
113 | - # lucky find https://web.archiveorange.com/archive/v/AAfXfQ17f57FcRINsof7 | ||
114 | - searchkick_search_shingle: { | ||
115 | - type: "shingle", | ||
116 | - token_separator: "", | ||
117 | - output_unigrams: false, | ||
118 | - output_unigrams_if_no_shingles: true | ||
119 | - }, | ||
120 | - searchkick_suggest_shingle: { | ||
121 | - type: "shingle", | ||
122 | - max_shingle_size: 5 | ||
123 | - }, | ||
124 | - searchkick_edge_ngram: { | ||
125 | - type: "edge_ngram", | ||
126 | - min_gram: 1, | ||
127 | - max_gram: 50 | ||
128 | - }, | ||
129 | - searchkick_ngram: { | ||
130 | - type: "ngram", | ||
131 | - min_gram: 1, | ||
132 | - max_gram: 50 | ||
133 | - }, | ||
134 | - searchkick_stemmer: { | ||
135 | - # use stemmer if language is lowercase, snowball otherwise | ||
136 | - type: language == language.to_s.downcase ? "stemmer" : "snowball", | ||
137 | - language: language || "English" | ||
138 | - } | ||
139 | - }, | ||
140 | - char_filter: { | ||
141 | - # https://www.elastic.co/guide/en/elasticsearch/guide/current/custom-analyzers.html | ||
142 | - # &_to_and | ||
143 | - ampersand: { | ||
144 | - type: "mapping", | ||
145 | - mappings: ["&=> and "] | ||
146 | - } | ||
147 | - } | ||
148 | - } | ||
149 | - } | ||
150 | - | ||
151 | - stem = options[:stem] | ||
152 | - | ||
153 | - case language | ||
154 | - when "chinese" | ||
155 | - settings[:analysis][:analyzer].merge!( | ||
156 | - default_analyzer => { | ||
157 | - type: "ik_smart" | ||
158 | - }, | ||
159 | - searchkick_search: { | ||
160 | - type: "ik_smart" | ||
161 | - }, | ||
162 | - searchkick_search2: { | ||
163 | - type: "ik_max_word" | ||
164 | - } | ||
165 | - ) | ||
166 | - | ||
167 | - stem = false | ||
168 | - when "chinese2", "smartcn" | ||
169 | - settings[:analysis][:analyzer].merge!( | ||
170 | - default_analyzer => { | ||
171 | - type: "smartcn" | ||
172 | - }, | ||
173 | - searchkick_search: { | ||
174 | - type: "smartcn" | ||
175 | - }, | ||
176 | - searchkick_search2: { | ||
177 | - type: "smartcn" | ||
178 | - } | ||
179 | - ) | ||
180 | - | ||
181 | - stem = false | ||
182 | - when "japanese" | ||
183 | - settings[:analysis][:analyzer].merge!( | ||
184 | - default_analyzer => { | ||
185 | - type: "kuromoji" | ||
186 | - }, | ||
187 | - searchkick_search: { | ||
188 | - type: "kuromoji" | ||
189 | - }, | ||
190 | - searchkick_search2: { | ||
191 | - type: "kuromoji" | ||
192 | - } | ||
193 | - ) | ||
194 | - | ||
195 | - stem = false | ||
196 | - when "korean" | ||
197 | - settings[:analysis][:analyzer].merge!( | ||
198 | - default_analyzer => { | ||
199 | - type: "openkoreantext-analyzer" | ||
200 | - }, | ||
201 | - searchkick_search: { | ||
202 | - type: "openkoreantext-analyzer" | ||
203 | - }, | ||
204 | - searchkick_search2: { | ||
205 | - type: "openkoreantext-analyzer" | ||
206 | - } | ||
207 | - ) | ||
208 | - | ||
209 | - stem = false | ||
210 | - when "korean2" | ||
211 | - settings[:analysis][:analyzer].merge!( | ||
212 | - default_analyzer => { | ||
213 | - type: "nori" | ||
214 | - }, | ||
215 | - searchkick_search: { | ||
216 | - type: "nori" | ||
217 | - }, | ||
218 | - searchkick_search2: { | ||
219 | - type: "nori" | ||
220 | - } | ||
221 | - ) | ||
222 | - | ||
223 | - stem = false | ||
224 | - when "vietnamese" | ||
225 | - settings[:analysis][:analyzer].merge!( | ||
226 | - default_analyzer => { | ||
227 | - type: "vi_analyzer" | ||
228 | - }, | ||
229 | - searchkick_search: { | ||
230 | - type: "vi_analyzer" | ||
231 | - }, | ||
232 | - searchkick_search2: { | ||
233 | - type: "vi_analyzer" | ||
234 | - } | ||
235 | - ) | ||
236 | - | ||
237 | - stem = false | ||
238 | - when "polish", "ukrainian" | ||
239 | - settings[:analysis][:analyzer].merge!( | ||
240 | - default_analyzer => { | ||
241 | - type: language | ||
242 | - }, | ||
243 | - searchkick_search: { | ||
244 | - type: language | ||
245 | - }, | ||
246 | - searchkick_search2: { | ||
247 | - type: language | ||
248 | - } | ||
249 | - ) | ||
250 | - | ||
251 | - stem = false | ||
252 | - end | ||
253 | - | ||
254 | - if Searchkick.env == "test" | ||
255 | - settings[:number_of_shards] = 1 | ||
256 | - settings[:number_of_replicas] = 0 | ||
257 | - end | ||
258 | - | ||
259 | - if options[:similarity] | ||
260 | - settings[:similarity] = {default: {type: options[:similarity]}} | ||
261 | - end | ||
262 | - | ||
263 | - unless below62 | ||
264 | - settings[:index] = { | ||
265 | - max_ngram_diff: 49, | ||
266 | - max_shingle_diff: 4 | ||
267 | - } | ||
268 | - end | ||
269 | - | ||
270 | - if options[:case_sensitive] | ||
271 | - settings[:analysis][:analyzer].each do |_, analyzer| | ||
272 | - analyzer[:filter].delete("lowercase") | ||
273 | - end | ||
274 | - end | ||
275 | - | ||
276 | - if stem == false | ||
277 | - settings[:analysis][:filter].delete(:searchkick_stemmer) | ||
278 | - settings[:analysis][:analyzer].each do |_, analyzer| | ||
279 | - analyzer[:filter].delete("searchkick_stemmer") if analyzer[:filter] | ||
280 | - end | ||
281 | - end | ||
282 | - | ||
283 | - settings = settings.symbolize_keys.deep_merge((options[:settings] || {}).symbolize_keys) | ||
284 | - | ||
285 | - add_synonyms(settings) | ||
286 | - add_search_synonyms(settings) | ||
287 | - add_wordnet(settings) if options[:wordnet] | ||
288 | - | ||
289 | - if options[:special_characters] == false | ||
290 | - settings[:analysis][:analyzer].each_value do |analyzer_settings| | ||
291 | - analyzer_settings[:filter].reject! { |f| f == "asciifolding" } | ||
292 | - end | ||
293 | - end | ||
294 | - | ||
295 | - mapping = {} | ||
296 | - | ||
297 | # conversions | 32 | # conversions |
298 | Array(options[:conversions]).each do |conversions_field| | 33 | Array(options[:conversions]).each do |conversions_field| |
299 | mapping[conversions_field] = { | 34 | mapping[conversions_field] = { |
@@ -427,6 +162,277 @@ module Searchkick | @@ -427,6 +162,277 @@ module Searchkick | ||
427 | } | 162 | } |
428 | end | 163 | end |
429 | 164 | ||
165 | + def generate_settings | ||
166 | + language = options[:language] | ||
167 | + language = language.call if language.respond_to?(:call) | ||
168 | + | ||
169 | + settings = { | ||
170 | + analysis: { | ||
171 | + analyzer: { | ||
172 | + searchkick_keyword: { | ||
173 | + type: "custom", | ||
174 | + tokenizer: "keyword", | ||
175 | + filter: ["lowercase"] + (options[:stem_conversions] ? ["searchkick_stemmer"] : []) | ||
176 | + }, | ||
177 | + default_analyzer => { | ||
178 | + type: "custom", | ||
179 | + # character filters -> tokenizer -> token filters | ||
180 | + # https://www.elastic.co/guide/en/elasticsearch/guide/current/analysis-intro.html | ||
181 | + char_filter: ["ampersand"], | ||
182 | + tokenizer: "standard", | ||
183 | + # synonym should come last, after stemming and shingle | ||
184 | + # shingle must come before searchkick_stemmer | ||
185 | + filter: ["lowercase", "asciifolding", "searchkick_index_shingle", "searchkick_stemmer"] | ||
186 | + }, | ||
187 | + searchkick_search: { | ||
188 | + type: "custom", | ||
189 | + char_filter: ["ampersand"], | ||
190 | + tokenizer: "standard", | ||
191 | + filter: ["lowercase", "asciifolding", "searchkick_search_shingle", "searchkick_stemmer"] | ||
192 | + }, | ||
193 | + searchkick_search2: { | ||
194 | + type: "custom", | ||
195 | + char_filter: ["ampersand"], | ||
196 | + tokenizer: "standard", | ||
197 | + filter: ["lowercase", "asciifolding", "searchkick_stemmer"] | ||
198 | + }, | ||
199 | + # https://github.com/leschenko/elasticsearch_autocomplete/blob/master/lib/elasticsearch_autocomplete/analyzers.rb | ||
200 | + searchkick_autocomplete_search: { | ||
201 | + type: "custom", | ||
202 | + tokenizer: "keyword", | ||
203 | + filter: ["lowercase", "asciifolding"] | ||
204 | + }, | ||
205 | + searchkick_word_search: { | ||
206 | + type: "custom", | ||
207 | + tokenizer: "standard", | ||
208 | + filter: ["lowercase", "asciifolding"] | ||
209 | + }, | ||
210 | + searchkick_suggest_index: { | ||
211 | + type: "custom", | ||
212 | + tokenizer: "standard", | ||
213 | + filter: ["lowercase", "asciifolding", "searchkick_suggest_shingle"] | ||
214 | + }, | ||
215 | + searchkick_text_start_index: { | ||
216 | + type: "custom", | ||
217 | + tokenizer: "keyword", | ||
218 | + filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"] | ||
219 | + }, | ||
220 | + searchkick_text_middle_index: { | ||
221 | + type: "custom", | ||
222 | + tokenizer: "keyword", | ||
223 | + filter: ["lowercase", "asciifolding", "searchkick_ngram"] | ||
224 | + }, | ||
225 | + searchkick_text_end_index: { | ||
226 | + type: "custom", | ||
227 | + tokenizer: "keyword", | ||
228 | + filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"] | ||
229 | + }, | ||
230 | + searchkick_word_start_index: { | ||
231 | + type: "custom", | ||
232 | + tokenizer: "standard", | ||
233 | + filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"] | ||
234 | + }, | ||
235 | + searchkick_word_middle_index: { | ||
236 | + type: "custom", | ||
237 | + tokenizer: "standard", | ||
238 | + filter: ["lowercase", "asciifolding", "searchkick_ngram"] | ||
239 | + }, | ||
240 | + searchkick_word_end_index: { | ||
241 | + type: "custom", | ||
242 | + tokenizer: "standard", | ||
243 | + filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"] | ||
244 | + } | ||
245 | + }, | ||
246 | + filter: { | ||
247 | + searchkick_index_shingle: { | ||
248 | + type: "shingle", | ||
249 | + token_separator: "" | ||
250 | + }, | ||
251 | + # lucky find https://web.archiveorange.com/archive/v/AAfXfQ17f57FcRINsof7 | ||
252 | + searchkick_search_shingle: { | ||
253 | + type: "shingle", | ||
254 | + token_separator: "", | ||
255 | + output_unigrams: false, | ||
256 | + output_unigrams_if_no_shingles: true | ||
257 | + }, | ||
258 | + searchkick_suggest_shingle: { | ||
259 | + type: "shingle", | ||
260 | + max_shingle_size: 5 | ||
261 | + }, | ||
262 | + searchkick_edge_ngram: { | ||
263 | + type: "edge_ngram", | ||
264 | + min_gram: 1, | ||
265 | + max_gram: 50 | ||
266 | + }, | ||
267 | + searchkick_ngram: { | ||
268 | + type: "ngram", | ||
269 | + min_gram: 1, | ||
270 | + max_gram: 50 | ||
271 | + }, | ||
272 | + searchkick_stemmer: { | ||
273 | + # use stemmer if language is lowercase, snowball otherwise | ||
274 | + type: language == language.to_s.downcase ? "stemmer" : "snowball", | ||
275 | + language: language || "English" | ||
276 | + } | ||
277 | + }, | ||
278 | + char_filter: { | ||
279 | + # https://www.elastic.co/guide/en/elasticsearch/guide/current/custom-analyzers.html | ||
280 | + # &_to_and | ||
281 | + ampersand: { | ||
282 | + type: "mapping", | ||
283 | + mappings: ["&=> and "] | ||
284 | + } | ||
285 | + } | ||
286 | + } | ||
287 | + } | ||
288 | + | ||
289 | + stem = options[:stem] | ||
290 | + | ||
291 | + case language | ||
292 | + when "chinese" | ||
293 | + settings[:analysis][:analyzer].merge!( | ||
294 | + default_analyzer => { | ||
295 | + type: "ik_smart" | ||
296 | + }, | ||
297 | + searchkick_search: { | ||
298 | + type: "ik_smart" | ||
299 | + }, | ||
300 | + searchkick_search2: { | ||
301 | + type: "ik_max_word" | ||
302 | + } | ||
303 | + ) | ||
304 | + | ||
305 | + stem = false | ||
306 | + when "chinese2", "smartcn" | ||
307 | + settings[:analysis][:analyzer].merge!( | ||
308 | + default_analyzer => { | ||
309 | + type: "smartcn" | ||
310 | + }, | ||
311 | + searchkick_search: { | ||
312 | + type: "smartcn" | ||
313 | + }, | ||
314 | + searchkick_search2: { | ||
315 | + type: "smartcn" | ||
316 | + } | ||
317 | + ) | ||
318 | + | ||
319 | + stem = false | ||
320 | + when "japanese" | ||
321 | + settings[:analysis][:analyzer].merge!( | ||
322 | + default_analyzer => { | ||
323 | + type: "kuromoji" | ||
324 | + }, | ||
325 | + searchkick_search: { | ||
326 | + type: "kuromoji" | ||
327 | + }, | ||
328 | + searchkick_search2: { | ||
329 | + type: "kuromoji" | ||
330 | + } | ||
331 | + ) | ||
332 | + | ||
333 | + stem = false | ||
334 | + when "korean" | ||
335 | + settings[:analysis][:analyzer].merge!( | ||
336 | + default_analyzer => { | ||
337 | + type: "openkoreantext-analyzer" | ||
338 | + }, | ||
339 | + searchkick_search: { | ||
340 | + type: "openkoreantext-analyzer" | ||
341 | + }, | ||
342 | + searchkick_search2: { | ||
343 | + type: "openkoreantext-analyzer" | ||
344 | + } | ||
345 | + ) | ||
346 | + | ||
347 | + stem = false | ||
348 | + when "korean2" | ||
349 | + settings[:analysis][:analyzer].merge!( | ||
350 | + default_analyzer => { | ||
351 | + type: "nori" | ||
352 | + }, | ||
353 | + searchkick_search: { | ||
354 | + type: "nori" | ||
355 | + }, | ||
356 | + searchkick_search2: { | ||
357 | + type: "nori" | ||
358 | + } | ||
359 | + ) | ||
360 | + | ||
361 | + stem = false | ||
362 | + when "vietnamese" | ||
363 | + settings[:analysis][:analyzer].merge!( | ||
364 | + default_analyzer => { | ||
365 | + type: "vi_analyzer" | ||
366 | + }, | ||
367 | + searchkick_search: { | ||
368 | + type: "vi_analyzer" | ||
369 | + }, | ||
370 | + searchkick_search2: { | ||
371 | + type: "vi_analyzer" | ||
372 | + } | ||
373 | + ) | ||
374 | + | ||
375 | + stem = false | ||
376 | + when "polish", "ukrainian" | ||
377 | + settings[:analysis][:analyzer].merge!( | ||
378 | + default_analyzer => { | ||
379 | + type: language | ||
380 | + }, | ||
381 | + searchkick_search: { | ||
382 | + type: language | ||
383 | + }, | ||
384 | + searchkick_search2: { | ||
385 | + type: language | ||
386 | + } | ||
387 | + ) | ||
388 | + | ||
389 | + stem = false | ||
390 | + end | ||
391 | + | ||
392 | + if Searchkick.env == "test" | ||
393 | + settings[:number_of_shards] = 1 | ||
394 | + settings[:number_of_replicas] = 0 | ||
395 | + end | ||
396 | + | ||
397 | + if options[:similarity] | ||
398 | + settings[:similarity] = {default: {type: options[:similarity]}} | ||
399 | + end | ||
400 | + | ||
401 | + unless below62 | ||
402 | + settings[:index] = { | ||
403 | + max_ngram_diff: 49, | ||
404 | + max_shingle_diff: 4 | ||
405 | + } | ||
406 | + end | ||
407 | + | ||
408 | + if options[:case_sensitive] | ||
409 | + settings[:analysis][:analyzer].each do |_, analyzer| | ||
410 | + analyzer[:filter].delete("lowercase") | ||
411 | + end | ||
412 | + end | ||
413 | + | ||
414 | + if stem == false | ||
415 | + settings[:analysis][:filter].delete(:searchkick_stemmer) | ||
416 | + settings[:analysis][:analyzer].each do |_, analyzer| | ||
417 | + analyzer[:filter].delete("searchkick_stemmer") if analyzer[:filter] | ||
418 | + end | ||
419 | + end | ||
420 | + | ||
421 | + settings = settings.symbolize_keys.deep_merge((options[:settings] || {}).symbolize_keys) | ||
422 | + | ||
423 | + add_synonyms(settings) | ||
424 | + add_search_synonyms(settings) | ||
425 | + add_wordnet(settings) if options[:wordnet] | ||
426 | + | ||
427 | + if options[:special_characters] == false | ||
428 | + settings[:analysis][:analyzer].each_value do |analyzer_settings| | ||
429 | + analyzer_settings[:filter].reject! { |f| f == "asciifolding" } | ||
430 | + end | ||
431 | + end | ||
432 | + | ||
433 | + settings | ||
434 | + end | ||
435 | + | ||
430 | def add_synonyms(settings) | 436 | def add_synonyms(settings) |
431 | synonyms = options[:synonyms] || [] | 437 | synonyms = options[:synonyms] || [] |
432 | synonyms = synonyms.call if synonyms.respond_to?(:call) | 438 | synonyms = synonyms.call if synonyms.respond_to?(:call) |