Commit f915ef753b5caabd29923cf03ffba4276110ae95

Authored by Andrew Kane
1 parent dbe04a0a

Added generate settings method

Showing 1 changed file with 274 additions and 268 deletions   Show diff stats
lib/searchkick/index_options.rb
... ... @@ -22,278 +22,13 @@ module Searchkick
22 22 settings = options[:settings] || {}
23 23 mappings = custom_mapping
24 24 else
25   - language = options[:language]
26   - language = language.call if language.respond_to?(:call)
  25 + settings = generate_settings
  26 +
  27 + mapping = {}
27 28  
28 29 keyword_mapping = {type: "keyword"}
29 30 keyword_mapping[:ignore_above] = options[:ignore_above] || 30000
30 31  
31   - settings = {
32   - analysis: {
33   - analyzer: {
34   - searchkick_keyword: {
35   - type: "custom",
36   - tokenizer: "keyword",
37   - filter: ["lowercase"] + (options[:stem_conversions] ? ["searchkick_stemmer"] : [])
38   - },
39   - default_analyzer => {
40   - type: "custom",
41   - # character filters -> tokenizer -> token filters
42   - # https://www.elastic.co/guide/en/elasticsearch/guide/current/analysis-intro.html
43   - char_filter: ["ampersand"],
44   - tokenizer: "standard",
45   - # synonym should come last, after stemming and shingle
46   - # shingle must come before searchkick_stemmer
47   - filter: ["lowercase", "asciifolding", "searchkick_index_shingle", "searchkick_stemmer"]
48   - },
49   - searchkick_search: {
50   - type: "custom",
51   - char_filter: ["ampersand"],
52   - tokenizer: "standard",
53   - filter: ["lowercase", "asciifolding", "searchkick_search_shingle", "searchkick_stemmer"]
54   - },
55   - searchkick_search2: {
56   - type: "custom",
57   - char_filter: ["ampersand"],
58   - tokenizer: "standard",
59   - filter: ["lowercase", "asciifolding", "searchkick_stemmer"]
60   - },
61   - # https://github.com/leschenko/elasticsearch_autocomplete/blob/master/lib/elasticsearch_autocomplete/analyzers.rb
62   - searchkick_autocomplete_search: {
63   - type: "custom",
64   - tokenizer: "keyword",
65   - filter: ["lowercase", "asciifolding"]
66   - },
67   - searchkick_word_search: {
68   - type: "custom",
69   - tokenizer: "standard",
70   - filter: ["lowercase", "asciifolding"]
71   - },
72   - searchkick_suggest_index: {
73   - type: "custom",
74   - tokenizer: "standard",
75   - filter: ["lowercase", "asciifolding", "searchkick_suggest_shingle"]
76   - },
77   - searchkick_text_start_index: {
78   - type: "custom",
79   - tokenizer: "keyword",
80   - filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"]
81   - },
82   - searchkick_text_middle_index: {
83   - type: "custom",
84   - tokenizer: "keyword",
85   - filter: ["lowercase", "asciifolding", "searchkick_ngram"]
86   - },
87   - searchkick_text_end_index: {
88   - type: "custom",
89   - tokenizer: "keyword",
90   - filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"]
91   - },
92   - searchkick_word_start_index: {
93   - type: "custom",
94   - tokenizer: "standard",
95   - filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"]
96   - },
97   - searchkick_word_middle_index: {
98   - type: "custom",
99   - tokenizer: "standard",
100   - filter: ["lowercase", "asciifolding", "searchkick_ngram"]
101   - },
102   - searchkick_word_end_index: {
103   - type: "custom",
104   - tokenizer: "standard",
105   - filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"]
106   - }
107   - },
108   - filter: {
109   - searchkick_index_shingle: {
110   - type: "shingle",
111   - token_separator: ""
112   - },
113   - # lucky find https://web.archiveorange.com/archive/v/AAfXfQ17f57FcRINsof7
114   - searchkick_search_shingle: {
115   - type: "shingle",
116   - token_separator: "",
117   - output_unigrams: false,
118   - output_unigrams_if_no_shingles: true
119   - },
120   - searchkick_suggest_shingle: {
121   - type: "shingle",
122   - max_shingle_size: 5
123   - },
124   - searchkick_edge_ngram: {
125   - type: "edge_ngram",
126   - min_gram: 1,
127   - max_gram: 50
128   - },
129   - searchkick_ngram: {
130   - type: "ngram",
131   - min_gram: 1,
132   - max_gram: 50
133   - },
134   - searchkick_stemmer: {
135   - # use stemmer if language is lowercase, snowball otherwise
136   - type: language == language.to_s.downcase ? "stemmer" : "snowball",
137   - language: language || "English"
138   - }
139   - },
140   - char_filter: {
141   - # https://www.elastic.co/guide/en/elasticsearch/guide/current/custom-analyzers.html
142   - # &_to_and
143   - ampersand: {
144   - type: "mapping",
145   - mappings: ["&=> and "]
146   - }
147   - }
148   - }
149   - }
150   -
151   - stem = options[:stem]
152   -
153   - case language
154   - when "chinese"
155   - settings[:analysis][:analyzer].merge!(
156   - default_analyzer => {
157   - type: "ik_smart"
158   - },
159   - searchkick_search: {
160   - type: "ik_smart"
161   - },
162   - searchkick_search2: {
163   - type: "ik_max_word"
164   - }
165   - )
166   -
167   - stem = false
168   - when "chinese2", "smartcn"
169   - settings[:analysis][:analyzer].merge!(
170   - default_analyzer => {
171   - type: "smartcn"
172   - },
173   - searchkick_search: {
174   - type: "smartcn"
175   - },
176   - searchkick_search2: {
177   - type: "smartcn"
178   - }
179   - )
180   -
181   - stem = false
182   - when "japanese"
183   - settings[:analysis][:analyzer].merge!(
184   - default_analyzer => {
185   - type: "kuromoji"
186   - },
187   - searchkick_search: {
188   - type: "kuromoji"
189   - },
190   - searchkick_search2: {
191   - type: "kuromoji"
192   - }
193   - )
194   -
195   - stem = false
196   - when "korean"
197   - settings[:analysis][:analyzer].merge!(
198   - default_analyzer => {
199   - type: "openkoreantext-analyzer"
200   - },
201   - searchkick_search: {
202   - type: "openkoreantext-analyzer"
203   - },
204   - searchkick_search2: {
205   - type: "openkoreantext-analyzer"
206   - }
207   - )
208   -
209   - stem = false
210   - when "korean2"
211   - settings[:analysis][:analyzer].merge!(
212   - default_analyzer => {
213   - type: "nori"
214   - },
215   - searchkick_search: {
216   - type: "nori"
217   - },
218   - searchkick_search2: {
219   - type: "nori"
220   - }
221   - )
222   -
223   - stem = false
224   - when "vietnamese"
225   - settings[:analysis][:analyzer].merge!(
226   - default_analyzer => {
227   - type: "vi_analyzer"
228   - },
229   - searchkick_search: {
230   - type: "vi_analyzer"
231   - },
232   - searchkick_search2: {
233   - type: "vi_analyzer"
234   - }
235   - )
236   -
237   - stem = false
238   - when "polish", "ukrainian"
239   - settings[:analysis][:analyzer].merge!(
240   - default_analyzer => {
241   - type: language
242   - },
243   - searchkick_search: {
244   - type: language
245   - },
246   - searchkick_search2: {
247   - type: language
248   - }
249   - )
250   -
251   - stem = false
252   - end
253   -
254   - if Searchkick.env == "test"
255   - settings[:number_of_shards] = 1
256   - settings[:number_of_replicas] = 0
257   - end
258   -
259   - if options[:similarity]
260   - settings[:similarity] = {default: {type: options[:similarity]}}
261   - end
262   -
263   - unless below62
264   - settings[:index] = {
265   - max_ngram_diff: 49,
266   - max_shingle_diff: 4
267   - }
268   - end
269   -
270   - if options[:case_sensitive]
271   - settings[:analysis][:analyzer].each do |_, analyzer|
272   - analyzer[:filter].delete("lowercase")
273   - end
274   - end
275   -
276   - if stem == false
277   - settings[:analysis][:filter].delete(:searchkick_stemmer)
278   - settings[:analysis][:analyzer].each do |_, analyzer|
279   - analyzer[:filter].delete("searchkick_stemmer") if analyzer[:filter]
280   - end
281   - end
282   -
283   - settings = settings.symbolize_keys.deep_merge((options[:settings] || {}).symbolize_keys)
284   -
285   - add_synonyms(settings)
286   - add_search_synonyms(settings)
287   - add_wordnet(settings) if options[:wordnet]
288   -
289   - if options[:special_characters] == false
290   - settings[:analysis][:analyzer].each_value do |analyzer_settings|
291   - analyzer_settings[:filter].reject! { |f| f == "asciifolding" }
292   - end
293   - end
294   -
295   - mapping = {}
296   -
297 32 # conversions
298 33 Array(options[:conversions]).each do |conversions_field|
299 34 mapping[conversions_field] = {
... ... @@ -427,6 +162,277 @@ module Searchkick
427 162 }
428 163 end
429 164  
  165 + def generate_settings
  166 + language = options[:language]
  167 + language = language.call if language.respond_to?(:call)
  168 +
  169 + settings = {
  170 + analysis: {
  171 + analyzer: {
  172 + searchkick_keyword: {
  173 + type: "custom",
  174 + tokenizer: "keyword",
  175 + filter: ["lowercase"] + (options[:stem_conversions] ? ["searchkick_stemmer"] : [])
  176 + },
  177 + default_analyzer => {
  178 + type: "custom",
  179 + # character filters -> tokenizer -> token filters
  180 + # https://www.elastic.co/guide/en/elasticsearch/guide/current/analysis-intro.html
  181 + char_filter: ["ampersand"],
  182 + tokenizer: "standard",
  183 + # synonym should come last, after stemming and shingle
  184 + # shingle must come before searchkick_stemmer
  185 + filter: ["lowercase", "asciifolding", "searchkick_index_shingle", "searchkick_stemmer"]
  186 + },
  187 + searchkick_search: {
  188 + type: "custom",
  189 + char_filter: ["ampersand"],
  190 + tokenizer: "standard",
  191 + filter: ["lowercase", "asciifolding", "searchkick_search_shingle", "searchkick_stemmer"]
  192 + },
  193 + searchkick_search2: {
  194 + type: "custom",
  195 + char_filter: ["ampersand"],
  196 + tokenizer: "standard",
  197 + filter: ["lowercase", "asciifolding", "searchkick_stemmer"]
  198 + },
  199 + # https://github.com/leschenko/elasticsearch_autocomplete/blob/master/lib/elasticsearch_autocomplete/analyzers.rb
  200 + searchkick_autocomplete_search: {
  201 + type: "custom",
  202 + tokenizer: "keyword",
  203 + filter: ["lowercase", "asciifolding"]
  204 + },
  205 + searchkick_word_search: {
  206 + type: "custom",
  207 + tokenizer: "standard",
  208 + filter: ["lowercase", "asciifolding"]
  209 + },
  210 + searchkick_suggest_index: {
  211 + type: "custom",
  212 + tokenizer: "standard",
  213 + filter: ["lowercase", "asciifolding", "searchkick_suggest_shingle"]
  214 + },
  215 + searchkick_text_start_index: {
  216 + type: "custom",
  217 + tokenizer: "keyword",
  218 + filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"]
  219 + },
  220 + searchkick_text_middle_index: {
  221 + type: "custom",
  222 + tokenizer: "keyword",
  223 + filter: ["lowercase", "asciifolding", "searchkick_ngram"]
  224 + },
  225 + searchkick_text_end_index: {
  226 + type: "custom",
  227 + tokenizer: "keyword",
  228 + filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"]
  229 + },
  230 + searchkick_word_start_index: {
  231 + type: "custom",
  232 + tokenizer: "standard",
  233 + filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"]
  234 + },
  235 + searchkick_word_middle_index: {
  236 + type: "custom",
  237 + tokenizer: "standard",
  238 + filter: ["lowercase", "asciifolding", "searchkick_ngram"]
  239 + },
  240 + searchkick_word_end_index: {
  241 + type: "custom",
  242 + tokenizer: "standard",
  243 + filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"]
  244 + }
  245 + },
  246 + filter: {
  247 + searchkick_index_shingle: {
  248 + type: "shingle",
  249 + token_separator: ""
  250 + },
  251 + # lucky find https://web.archiveorange.com/archive/v/AAfXfQ17f57FcRINsof7
  252 + searchkick_search_shingle: {
  253 + type: "shingle",
  254 + token_separator: "",
  255 + output_unigrams: false,
  256 + output_unigrams_if_no_shingles: true
  257 + },
  258 + searchkick_suggest_shingle: {
  259 + type: "shingle",
  260 + max_shingle_size: 5
  261 + },
  262 + searchkick_edge_ngram: {
  263 + type: "edge_ngram",
  264 + min_gram: 1,
  265 + max_gram: 50
  266 + },
  267 + searchkick_ngram: {
  268 + type: "ngram",
  269 + min_gram: 1,
  270 + max_gram: 50
  271 + },
  272 + searchkick_stemmer: {
  273 + # use stemmer if language is lowercase, snowball otherwise
  274 + type: language == language.to_s.downcase ? "stemmer" : "snowball",
  275 + language: language || "English"
  276 + }
  277 + },
  278 + char_filter: {
  279 + # https://www.elastic.co/guide/en/elasticsearch/guide/current/custom-analyzers.html
  280 + # &_to_and
  281 + ampersand: {
  282 + type: "mapping",
  283 + mappings: ["&=> and "]
  284 + }
  285 + }
  286 + }
  287 + }
  288 +
  289 + stem = options[:stem]
  290 +
  291 + case language
  292 + when "chinese"
  293 + settings[:analysis][:analyzer].merge!(
  294 + default_analyzer => {
  295 + type: "ik_smart"
  296 + },
  297 + searchkick_search: {
  298 + type: "ik_smart"
  299 + },
  300 + searchkick_search2: {
  301 + type: "ik_max_word"
  302 + }
  303 + )
  304 +
  305 + stem = false
  306 + when "chinese2", "smartcn"
  307 + settings[:analysis][:analyzer].merge!(
  308 + default_analyzer => {
  309 + type: "smartcn"
  310 + },
  311 + searchkick_search: {
  312 + type: "smartcn"
  313 + },
  314 + searchkick_search2: {
  315 + type: "smartcn"
  316 + }
  317 + )
  318 +
  319 + stem = false
  320 + when "japanese"
  321 + settings[:analysis][:analyzer].merge!(
  322 + default_analyzer => {
  323 + type: "kuromoji"
  324 + },
  325 + searchkick_search: {
  326 + type: "kuromoji"
  327 + },
  328 + searchkick_search2: {
  329 + type: "kuromoji"
  330 + }
  331 + )
  332 +
  333 + stem = false
  334 + when "korean"
  335 + settings[:analysis][:analyzer].merge!(
  336 + default_analyzer => {
  337 + type: "openkoreantext-analyzer"
  338 + },
  339 + searchkick_search: {
  340 + type: "openkoreantext-analyzer"
  341 + },
  342 + searchkick_search2: {
  343 + type: "openkoreantext-analyzer"
  344 + }
  345 + )
  346 +
  347 + stem = false
  348 + when "korean2"
  349 + settings[:analysis][:analyzer].merge!(
  350 + default_analyzer => {
  351 + type: "nori"
  352 + },
  353 + searchkick_search: {
  354 + type: "nori"
  355 + },
  356 + searchkick_search2: {
  357 + type: "nori"
  358 + }
  359 + )
  360 +
  361 + stem = false
  362 + when "vietnamese"
  363 + settings[:analysis][:analyzer].merge!(
  364 + default_analyzer => {
  365 + type: "vi_analyzer"
  366 + },
  367 + searchkick_search: {
  368 + type: "vi_analyzer"
  369 + },
  370 + searchkick_search2: {
  371 + type: "vi_analyzer"
  372 + }
  373 + )
  374 +
  375 + stem = false
  376 + when "polish", "ukrainian"
  377 + settings[:analysis][:analyzer].merge!(
  378 + default_analyzer => {
  379 + type: language
  380 + },
  381 + searchkick_search: {
  382 + type: language
  383 + },
  384 + searchkick_search2: {
  385 + type: language
  386 + }
  387 + )
  388 +
  389 + stem = false
  390 + end
  391 +
  392 + if Searchkick.env == "test"
  393 + settings[:number_of_shards] = 1
  394 + settings[:number_of_replicas] = 0
  395 + end
  396 +
  397 + if options[:similarity]
  398 + settings[:similarity] = {default: {type: options[:similarity]}}
  399 + end
  400 +
  401 + unless below62
  402 + settings[:index] = {
  403 + max_ngram_diff: 49,
  404 + max_shingle_diff: 4
  405 + }
  406 + end
  407 +
  408 + if options[:case_sensitive]
  409 + settings[:analysis][:analyzer].each do |_, analyzer|
  410 + analyzer[:filter].delete("lowercase")
  411 + end
  412 + end
  413 +
  414 + if stem == false
  415 + settings[:analysis][:filter].delete(:searchkick_stemmer)
  416 + settings[:analysis][:analyzer].each do |_, analyzer|
  417 + analyzer[:filter].delete("searchkick_stemmer") if analyzer[:filter]
  418 + end
  419 + end
  420 +
  421 + settings = settings.symbolize_keys.deep_merge((options[:settings] || {}).symbolize_keys)
  422 +
  423 + add_synonyms(settings)
  424 + add_search_synonyms(settings)
  425 + add_wordnet(settings) if options[:wordnet]
  426 +
  427 + if options[:special_characters] == false
  428 + settings[:analysis][:analyzer].each_value do |analyzer_settings|
  429 + analyzer_settings[:filter].reject! { |f| f == "asciifolding" }
  430 + end
  431 + end
  432 +
  433 + settings
  434 + end
  435 +
430 436 def add_synonyms(settings)
431 437 synonyms = options[:synonyms] || []
432 438 synonyms = synonyms.call if synonyms.respond_to?(:call)
... ...