Commit 67f3e9b5f11f1608feca39c28dc0ef7eac6a2020

Authored by Andrew Kane
1 parent ae5a7e94

Moved logic into Searchkick::Index

lib/searchkick/index.rb
1 1 module Searchkick
2 2 class Index
3   - attr_reader :name
  3 + attr_reader :name, :options
4 4  
5   - def initialize(name)
  5 + def initialize(name, options = {})
6 6 @name = name
  7 + @options = options
7 8 end
8 9  
9 10 def create(options = {})
... ... @@ -84,6 +85,279 @@ module Searchkick
84 85 client.indices.analyze({text: text, index: name}.merge(options))["tokens"].map{|t| t["token"] }
85 86 end
86 87  
  88 + # remove old indices that start w/ index_name
  89 + def clean_indices
  90 + all_indices = Searchkick.client.indices.get_aliases
  91 + indices = all_indices.select{|k, v| (v.empty? || v["aliases"].empty?) && k =~ /\A#{Regexp.escape(name)}_\d{14,17}\z/ }.keys
  92 + indices.each do |index|
  93 + Searchkick::Index.new(index).delete
  94 + end
  95 + indices
  96 + end
  97 +
  98 + def create_index
  99 + index = Searchkick::Index.new("#{name}_#{Time.now.strftime('%Y%m%d%H%M%S%L')}")
  100 + index.create(index_options)
  101 + index
  102 + end
  103 +
  104 + def index_options
  105 + options = @options
  106 +
  107 + if options[:mappings] and !options[:merge_mappings]
  108 + settings = options[:settings] || {}
  109 + mappings = options[:mappings]
  110 + else
  111 + settings = {
  112 + analysis: {
  113 + analyzer: {
  114 + searchkick_keyword: {
  115 + type: "custom",
  116 + tokenizer: "keyword",
  117 + filter: ["lowercase"] + (options[:stem_conversions] == false ? [] : ["searchkick_stemmer"])
  118 + },
  119 + default_index: {
  120 + type: "custom",
  121 + tokenizer: "standard",
  122 + # synonym should come last, after stemming and shingle
  123 + # shingle must come before searchkick_stemmer
  124 + filter: ["standard", "lowercase", "asciifolding", "searchkick_index_shingle", "searchkick_stemmer"]
  125 + },
  126 + searchkick_search: {
  127 + type: "custom",
  128 + tokenizer: "standard",
  129 + filter: ["standard", "lowercase", "asciifolding", "searchkick_search_shingle", "searchkick_stemmer"]
  130 + },
  131 + searchkick_search2: {
  132 + type: "custom",
  133 + tokenizer: "standard",
  134 + filter: ["standard", "lowercase", "asciifolding", "searchkick_stemmer"]
  135 + },
  136 + # https://github.com/leschenko/elasticsearch_autocomplete/blob/master/lib/elasticsearch_autocomplete/analyzers.rb
  137 + searchkick_autocomplete_index: {
  138 + type: "custom",
  139 + tokenizer: "searchkick_autocomplete_ngram",
  140 + filter: ["lowercase", "asciifolding"]
  141 + },
  142 + searchkick_autocomplete_search: {
  143 + type: "custom",
  144 + tokenizer: "keyword",
  145 + filter: ["lowercase", "asciifolding"]
  146 + },
  147 + searchkick_word_search: {
  148 + type: "custom",
  149 + tokenizer: "standard",
  150 + filter: ["lowercase", "asciifolding"]
  151 + },
  152 + searchkick_suggest_index: {
  153 + type: "custom",
  154 + tokenizer: "standard",
  155 + filter: ["lowercase", "asciifolding", "searchkick_suggest_shingle"]
  156 + },
  157 + searchkick_text_start_index: {
  158 + type: "custom",
  159 + tokenizer: "keyword",
  160 + filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"]
  161 + },
  162 + searchkick_text_middle_index: {
  163 + type: "custom",
  164 + tokenizer: "keyword",
  165 + filter: ["lowercase", "asciifolding", "searchkick_ngram"]
  166 + },
  167 + searchkick_text_end_index: {
  168 + type: "custom",
  169 + tokenizer: "keyword",
  170 + filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"]
  171 + },
  172 + searchkick_word_start_index: {
  173 + type: "custom",
  174 + tokenizer: "standard",
  175 + filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"]
  176 + },
  177 + searchkick_word_middle_index: {
  178 + type: "custom",
  179 + tokenizer: "standard",
  180 + filter: ["lowercase", "asciifolding", "searchkick_ngram"]
  181 + },
  182 + searchkick_word_end_index: {
  183 + type: "custom",
  184 + tokenizer: "standard",
  185 + filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"]
  186 + }
  187 + },
  188 + filter: {
  189 + searchkick_index_shingle: {
  190 + type: "shingle",
  191 + token_separator: ""
  192 + },
  193 + # lucky find http://web.archiveorange.com/archive/v/AAfXfQ17f57FcRINsof7
  194 + searchkick_search_shingle: {
  195 + type: "shingle",
  196 + token_separator: "",
  197 + output_unigrams: false,
  198 + output_unigrams_if_no_shingles: true
  199 + },
  200 + searchkick_suggest_shingle: {
  201 + type: "shingle",
  202 + max_shingle_size: 5
  203 + },
  204 + searchkick_edge_ngram: {
  205 + type: "edgeNGram",
  206 + min_gram: 1,
  207 + max_gram: 50
  208 + },
  209 + searchkick_ngram: {
  210 + type: "nGram",
  211 + min_gram: 1,
  212 + max_gram: 50
  213 + },
  214 + searchkick_stemmer: {
  215 + type: "snowball",
  216 + language: options[:language] || "English"
  217 + }
  218 + },
  219 + tokenizer: {
  220 + searchkick_autocomplete_ngram: {
  221 + type: "edgeNGram",
  222 + min_gram: 1,
  223 + max_gram: 50
  224 + }
  225 + }
  226 + }
  227 + }
  228 +
  229 + if Searchkick.env == "test"
  230 + settings.merge!(number_of_shards: 1, number_of_replicas: 0)
  231 + end
  232 +
  233 + settings.deep_merge!(options[:settings] || {})
  234 +
  235 + # synonyms
  236 + synonyms = options[:synonyms] || []
  237 + if synonyms.any?
  238 + settings[:analysis][:filter][:searchkick_synonym] = {
  239 + type: "synonym",
  240 + synonyms: synonyms.select{|s| s.size > 1 }.map{|s| s.join(",") }
  241 + }
  242 + # choosing a place for the synonym filter when stemming is not easy
  243 + # https://groups.google.com/forum/#!topic/elasticsearch/p7qcQlgHdB8
  244 + # TODO use a snowball stemmer on synonyms when creating the token filter
  245 +
  246 + # http://elasticsearch-users.115913.n3.nabble.com/synonym-multi-words-search-td4030811.html
  247 + # I find the following approach effective if you are doing multi-word synonyms (synonym phrases):
  248 + # - Only apply the synonym expansion at index time
  249 + # - Don't have the synonym filter applied search
  250 + # - Use directional synonyms where appropriate. You want to make sure that you're not injecting terms that are too general.
  251 + settings[:analysis][:analyzer][:default_index][:filter].insert(4, "searchkick_synonym")
  252 + settings[:analysis][:analyzer][:default_index][:filter] << "searchkick_synonym"
  253 + end
  254 +
  255 + if options[:wordnet]
  256 + settings[:analysis][:filter][:searchkick_wordnet] = {
  257 + type: "synonym",
  258 + format: "wordnet",
  259 + synonyms_path: Searchkick.wordnet_path
  260 + }
  261 +
  262 + settings[:analysis][:analyzer][:default_index][:filter].insert(4, "searchkick_wordnet")
  263 + settings[:analysis][:analyzer][:default_index][:filter] << "searchkick_wordnet"
  264 + end
  265 +
  266 + if options[:special_characters] == false
  267 + settings[:analysis][:analyzer].each do |analyzer, analyzer_settings|
  268 + analyzer_settings[:filter].reject!{|f| f == "asciifolding" }
  269 + end
  270 + end
  271 +
  272 + mapping = {}
  273 +
  274 + # conversions
  275 + if options[:conversions]
  276 + mapping[:conversions] = {
  277 + type: "nested",
  278 + properties: {
  279 + query: {type: "string", analyzer: "searchkick_keyword"},
  280 + count: {type: "integer"}
  281 + }
  282 + }
  283 + end
  284 +
  285 + mapping_options = Hash[
  286 + [:autocomplete, :suggest, :text_start, :text_middle, :text_end, :word_start, :word_middle, :word_end, :highlight]
  287 + .map{|type| [type, (options[type] || []).map(&:to_s)] }
  288 + ]
  289 +
  290 + mapping_options.values.flatten.uniq.each do |field|
  291 + field_mapping = {
  292 + type: "multi_field",
  293 + fields: {
  294 + field => {type: "string", index: "not_analyzed"},
  295 + "analyzed" => {type: "string", index: "analyzed"}
  296 + # term_vector: "with_positions_offsets" for fast / correct highlighting
  297 + # http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/search-request-highlighting.html#_fast_vector_highlighter
  298 + }
  299 + }
  300 +
  301 + mapping_options.except(:highlight).each do |type, fields|
  302 + if fields.include?(field)
  303 + field_mapping[:fields][type] = {type: "string", index: "analyzed", analyzer: "searchkick_#{type}_index"}
  304 + end
  305 + end
  306 +
  307 + if mapping_options[:highlight].include?(field)
  308 + field_mapping[:fields]["analyzed"][:term_vector] = "with_positions_offsets"
  309 + end
  310 +
  311 + mapping[field] = field_mapping
  312 + end
  313 +
  314 + (options[:locations] || []).map(&:to_s).each do |field|
  315 + mapping[field] = {
  316 + type: "geo_point"
  317 + }
  318 + end
  319 +
  320 + (options[:unsearchable] || []).map(&:to_s).each do |field|
  321 + mapping[field] = {
  322 + type: "string",
  323 + index: "no"
  324 + }
  325 + end
  326 +
  327 + mappings = {
  328 + _default_: {
  329 + properties: mapping,
  330 + # https://gist.github.com/kimchy/2898285
  331 + dynamic_templates: [
  332 + {
  333 + string_template: {
  334 + match: "*",
  335 + match_mapping_type: "string",
  336 + mapping: {
  337 + # http://www.elasticsearch.org/guide/reference/mapping/multi-field-type/
  338 + type: "multi_field",
  339 + fields: {
  340 + # analyzed field must be the default field for include_in_all
  341 + # http://www.elasticsearch.org/guide/reference/mapping/multi-field-type/
  342 + # however, we can include the not_analyzed field in _all
  343 + # and the _all index analyzer will take care of it
  344 + "{name}" => {type: "string", index: "not_analyzed"},
  345 + "analyzed" => {type: "string", index: "analyzed"}
  346 + }
  347 + }
  348 + }
  349 + }
  350 + ]
  351 + }
  352 + }.deep_merge(options[:mappings] || {})
  353 + end
  354 +
  355 + {
  356 + settings: settings,
  357 + mappings: mappings
  358 + }
  359 + end
  360 +
87 361 protected
88 362  
89 363 def client
... ...
lib/searchkick/model.rb
... ... @@ -17,7 +17,7 @@ module Searchkick
17 17 def self.searchkick_index
18 18 index = class_variable_get :@@searchkick_index
19 19 index = index.call if index.respond_to? :call
20   - Searchkick::Index.new(index)
  20 + Searchkick::Index.new(index, class_variable_get(:@@searchkick_options))
21 21 end
22 22  
23 23 define_singleton_method(Searchkick.search_method_name) do |term = nil, options={}, &block|
... ...
lib/searchkick/reindex.rb
... ... @@ -31,14 +31,8 @@ module Searchkick
31 31 true
32 32 end
33 33  
34   - # remove old indices that start w/ index_name
35 34 def clean_indices
36   - all_indices = Searchkick.client.indices.get_aliases
37   - indices = all_indices.select{|k, v| (v.empty? || v["aliases"].empty?) && k =~ /\A#{Regexp.escape(searchkick_index.name)}_\d{14,17}\z/ }.keys
38   - indices.each do |index|
39   - Searchkick::Index.new(index).delete
40   - end
41   - indices
  35 + searchkick_index.clean_indices
42 36 end
43 37  
44 38 def self.extended(klass)
... ... @@ -73,266 +67,11 @@ module Searchkick
73 67 end
74 68  
75 69 def searchkick_create_index
76   - index = Searchkick::Index.new("#{searchkick_index.name}_#{Time.now.strftime('%Y%m%d%H%M%S%L')}")
77   - index.create searchkick_index_options
78   - index
  70 + searchkick_index.create_index
79 71 end
80 72  
81 73 def searchkick_index_options
82   - options = searchkick_options
83   -
84   - if options[:mappings] and !options[:merge_mappings]
85   - settings = options[:settings] || {}
86   - mappings = options[:mappings]
87   - else
88   - settings = {
89   - analysis: {
90   - analyzer: {
91   - searchkick_keyword: {
92   - type: "custom",
93   - tokenizer: "keyword",
94   - filter: ["lowercase"] + (options[:stem_conversions] == false ? [] : ["searchkick_stemmer"])
95   - },
96   - default_index: {
97   - type: "custom",
98   - tokenizer: "standard",
99   - # synonym should come last, after stemming and shingle
100   - # shingle must come before searchkick_stemmer
101   - filter: ["standard", "lowercase", "asciifolding", "searchkick_index_shingle", "searchkick_stemmer"]
102   - },
103   - searchkick_search: {
104   - type: "custom",
105   - tokenizer: "standard",
106   - filter: ["standard", "lowercase", "asciifolding", "searchkick_search_shingle", "searchkick_stemmer"]
107   - },
108   - searchkick_search2: {
109   - type: "custom",
110   - tokenizer: "standard",
111   - filter: ["standard", "lowercase", "asciifolding", "searchkick_stemmer"]
112   - },
113   - # https://github.com/leschenko/elasticsearch_autocomplete/blob/master/lib/elasticsearch_autocomplete/analyzers.rb
114   - searchkick_autocomplete_index: {
115   - type: "custom",
116   - tokenizer: "searchkick_autocomplete_ngram",
117   - filter: ["lowercase", "asciifolding"]
118   - },
119   - searchkick_autocomplete_search: {
120   - type: "custom",
121   - tokenizer: "keyword",
122   - filter: ["lowercase", "asciifolding"]
123   - },
124   - searchkick_word_search: {
125   - type: "custom",
126   - tokenizer: "standard",
127   - filter: ["lowercase", "asciifolding"]
128   - },
129   - searchkick_suggest_index: {
130   - type: "custom",
131   - tokenizer: "standard",
132   - filter: ["lowercase", "asciifolding", "searchkick_suggest_shingle"]
133   - },
134   - searchkick_text_start_index: {
135   - type: "custom",
136   - tokenizer: "keyword",
137   - filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"]
138   - },
139   - searchkick_text_middle_index: {
140   - type: "custom",
141   - tokenizer: "keyword",
142   - filter: ["lowercase", "asciifolding", "searchkick_ngram"]
143   - },
144   - searchkick_text_end_index: {
145   - type: "custom",
146   - tokenizer: "keyword",
147   - filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"]
148   - },
149   - searchkick_word_start_index: {
150   - type: "custom",
151   - tokenizer: "standard",
152   - filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"]
153   - },
154   - searchkick_word_middle_index: {
155   - type: "custom",
156   - tokenizer: "standard",
157   - filter: ["lowercase", "asciifolding", "searchkick_ngram"]
158   - },
159   - searchkick_word_end_index: {
160   - type: "custom",
161   - tokenizer: "standard",
162   - filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"]
163   - }
164   - },
165   - filter: {
166   - searchkick_index_shingle: {
167   - type: "shingle",
168   - token_separator: ""
169   - },
170   - # lucky find http://web.archiveorange.com/archive/v/AAfXfQ17f57FcRINsof7
171   - searchkick_search_shingle: {
172   - type: "shingle",
173   - token_separator: "",
174   - output_unigrams: false,
175   - output_unigrams_if_no_shingles: true
176   - },
177   - searchkick_suggest_shingle: {
178   - type: "shingle",
179   - max_shingle_size: 5
180   - },
181   - searchkick_edge_ngram: {
182   - type: "edgeNGram",
183   - min_gram: 1,
184   - max_gram: 50
185   - },
186   - searchkick_ngram: {
187   - type: "nGram",
188   - min_gram: 1,
189   - max_gram: 50
190   - },
191   - searchkick_stemmer: {
192   - type: "snowball",
193   - language: options[:language] || "English"
194   - }
195   - },
196   - tokenizer: {
197   - searchkick_autocomplete_ngram: {
198   - type: "edgeNGram",
199   - min_gram: 1,
200   - max_gram: 50
201   - }
202   - }
203   - }
204   - }
205   -
206   - if Searchkick.env == "test"
207   - settings.merge!(number_of_shards: 1, number_of_replicas: 0)
208   - end
209   -
210   - settings.deep_merge!(options[:settings] || {})
211   -
212   - # synonyms
213   - synonyms = options[:synonyms] || []
214   - if synonyms.any?
215   - settings[:analysis][:filter][:searchkick_synonym] = {
216   - type: "synonym",
217   - synonyms: synonyms.select{|s| s.size > 1 }.map{|s| s.join(",") }
218   - }
219   - # choosing a place for the synonym filter when stemming is not easy
220   - # https://groups.google.com/forum/#!topic/elasticsearch/p7qcQlgHdB8
221   - # TODO use a snowball stemmer on synonyms when creating the token filter
222   -
223   - # http://elasticsearch-users.115913.n3.nabble.com/synonym-multi-words-search-td4030811.html
224   - # I find the following approach effective if you are doing multi-word synonyms (synonym phrases):
225   - # - Only apply the synonym expansion at index time
226   - # - Don't have the synonym filter applied search
227   - # - Use directional synonyms where appropriate. You want to make sure that you're not injecting terms that are too general.
228   - settings[:analysis][:analyzer][:default_index][:filter].insert(4, "searchkick_synonym")
229   - settings[:analysis][:analyzer][:default_index][:filter] << "searchkick_synonym"
230   - end
231   -
232   - if options[:wordnet]
233   - settings[:analysis][:filter][:searchkick_wordnet] = {
234   - type: "synonym",
235   - format: "wordnet",
236   - synonyms_path: Searchkick.wordnet_path
237   - }
238   -
239   - settings[:analysis][:analyzer][:default_index][:filter].insert(4, "searchkick_wordnet")
240   - settings[:analysis][:analyzer][:default_index][:filter] << "searchkick_wordnet"
241   - end
242   -
243   - if options[:special_characters] == false
244   - settings[:analysis][:analyzer].each do |analyzer, analyzer_settings|
245   - analyzer_settings[:filter].reject!{|f| f == "asciifolding" }
246   - end
247   - end
248   -
249   - mapping = {}
250   -
251   - # conversions
252   - if options[:conversions]
253   - mapping[:conversions] = {
254   - type: "nested",
255   - properties: {
256   - query: {type: "string", analyzer: "searchkick_keyword"},
257   - count: {type: "integer"}
258   - }
259   - }
260   - end
261   -
262   - mapping_options = Hash[
263   - [:autocomplete, :suggest, :text_start, :text_middle, :text_end, :word_start, :word_middle, :word_end, :highlight]
264   - .map{|type| [type, (options[type] || []).map(&:to_s)] }
265   - ]
266   -
267   - mapping_options.values.flatten.uniq.each do |field|
268   - field_mapping = {
269   - type: "multi_field",
270   - fields: {
271   - field => {type: "string", index: "not_analyzed"},
272   - "analyzed" => {type: "string", index: "analyzed"}
273   - # term_vector: "with_positions_offsets" for fast / correct highlighting
274   - # http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/search-request-highlighting.html#_fast_vector_highlighter
275   - }
276   - }
277   -
278   - mapping_options.except(:highlight).each do |type, fields|
279   - if fields.include?(field)
280   - field_mapping[:fields][type] = {type: "string", index: "analyzed", analyzer: "searchkick_#{type}_index"}
281   - end
282   - end
283   -
284   - if mapping_options[:highlight].include?(field)
285   - field_mapping[:fields]["analyzed"][:term_vector] = "with_positions_offsets"
286   - end
287   -
288   - mapping[field] = field_mapping
289   - end
290   -
291   - (options[:locations] || []).map(&:to_s).each do |field|
292   - mapping[field] = {
293   - type: "geo_point"
294   - }
295   - end
296   -
297   - (options[:unsearchable] || []).map(&:to_s).each do |field|
298   - mapping[field] = {
299   - type: "string",
300   - index: "no"
301   - }
302   - end
303   -
304   - mappings = {
305   - _default_: {
306   - properties: mapping,
307   - # https://gist.github.com/kimchy/2898285
308   - dynamic_templates: [
309   - {
310   - string_template: {
311   - match: "*",
312   - match_mapping_type: "string",
313   - mapping: {
314   - # http://www.elasticsearch.org/guide/reference/mapping/multi-field-type/
315   - type: "multi_field",
316   - fields: {
317   - # analyzed field must be the default field for include_in_all
318   - # http://www.elasticsearch.org/guide/reference/mapping/multi-field-type/
319   - # however, we can include the not_analyzed field in _all
320   - # and the _all index analyzer will take care of it
321   - "{name}" => {type: "string", index: "not_analyzed"},
322   - "analyzed" => {type: "string", index: "analyzed"}
323   - }
324   - }
325   - }
326   - }
327   - ]
328   - }
329   - }.deep_merge(options[:mappings] || {})
330   - end
331   -
332   - {
333   - settings: settings,
334   - mappings: mappings
335   - }
  74 + searchkick_index.index_options
336 75 end
337 76  
338 77 end
... ...