Repository: couchdb-mango Updated Branches: refs/heads/master a297e2e9e -> 090dc6750
Provide an ability to disable the indexing of array lengths. Depending on the data shape, Cloudant Query would end up creating many thousands of unique fields, and this leads to JVM heap exhaustion as Lucene tries to cache information about fields; Lucene is not designed to handle many thousands of fields. This change allows the user to disable the indexing of the array length field, so that they don't need to take the performance hit if they don't plan to use that field in their queries ($size operator). The array length field is a single extra field per unique path to an array. The case where we found this was a client whose data used arbitrary data as keys, which exploded the number of fields in Lucene. The obvious fix was to switch to only indexing what they wanted to query on. Unfortunately, that didn't prevent the automatically created array length fields from being created. This patch is a big hammer to remove the auto-generated array length fields, which may be generally useful, though we're also planning another patch that removes array length fields for anything that's not specified in the index's field list. Add index_array_lengths to the list of valid fields in the index document so that an index document with this field will pass validation and its boolean value is enforced. 
Project: http://git-wip-us.apache.org/repos/asf/couchdb-mango/repo Commit: http://git-wip-us.apache.org/repos/asf/couchdb-mango/commit/bf44d0fe Tree: http://git-wip-us.apache.org/repos/asf/couchdb-mango/tree/bf44d0fe Diff: http://git-wip-us.apache.org/repos/asf/couchdb-mango/diff/bf44d0fe Branch: refs/heads/master Commit: bf44d0fe3869386e5fdcf99f6c690dc8498f213b Parents: a297e2e Author: brkolla <[email protected]> Authored: Thu Oct 22 20:16:42 2015 -0400 Committer: brkolla <[email protected]> Committed: Tue Oct 27 14:18:14 2015 -0400 ---------------------------------------------------------------------- src/mango_idx_text.erl | 6 ++++++ src/mango_native_proc.erl | 27 +++++++++++++++++++++------ 2 files changed, 27 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/couchdb-mango/blob/bf44d0fe/src/mango_idx_text.erl ---------------------------------------------------------------------- diff --git a/src/mango_idx_text.erl b/src/mango_idx_text.erl index fcd2939..9ade6e2 100644 --- a/src/mango_idx_text.erl +++ b/src/mango_idx_text.erl @@ -219,6 +219,12 @@ opts() -> {optional, true}, {default, []}, {validator, fun ?MODULE:validate_fields/1} + ]}, + {<<"index_array_lengths">>, [ + {tag, index_array_lengths}, + {optional, true}, + {default, true}, + {validator, fun mango_opts:is_boolean/1} ]} ]. http://git-wip-us.apache.org/repos/asf/couchdb-mango/blob/bf44d0fe/src/mango_native_proc.erl ---------------------------------------------------------------------- diff --git a/src/mango_native_proc.erl b/src/mango_native_proc.erl index 822d173..6d0fb24 100644 --- a/src/mango_native_proc.erl +++ b/src/mango_native_proc.erl @@ -40,6 +40,7 @@ -record(tacc, { + index_array_lengths = true, fields = all_fields, path = [] }). 
@@ -164,8 +165,12 @@ get_text_entries({IdxProps}, Doc) -> get_text_entries0(IdxProps, Doc) -> DefaultEnabled = get_default_enabled(IdxProps), + IndexArrayLengths = get_index_array_lengths(IdxProps), FieldsList = get_text_field_list(IdxProps), - TAcc = #tacc{fields = FieldsList}, + TAcc = #tacc{ + index_array_lengths = IndexArrayLengths, + fields = FieldsList + }, Fields0 = get_text_field_values(Doc, TAcc), Fields = if not DefaultEnabled -> Fields0; true -> add_default_text_field(Fields0) @@ -179,13 +184,19 @@ get_text_field_values({Props}, TAcc) when is_list(Props) -> get_text_field_values_obj(Props, TAcc, []); get_text_field_values(Values, TAcc) when is_list(Values) -> + IndexArrayLengths = TAcc#tacc.index_array_lengths, NewPath = ["[]" | TAcc#tacc.path], NewTAcc = TAcc#tacc{path = NewPath}, - % We bypass make_text_field and directly call make_text_field_name - % because the length field name is not part of the path. - LengthFieldName = make_text_field_name(NewTAcc#tacc.path, <<"length">>), - LengthField = [{LengthFieldName, <<"length">>, length(Values)}], - get_text_field_values_arr(Values, NewTAcc, LengthField); + case IndexArrayLengths of + true -> + % We bypass make_text_field and directly call make_text_field_name + % because the length field name is not part of the path. + LengthFieldName = make_text_field_name(NewTAcc#tacc.path, <<"length">>), + LengthField = [{LengthFieldName, <<"length">>, length(Values)}], + get_text_field_values_arr(Values, NewTAcc, LengthField); + _ -> + get_text_field_values_arr(Values, NewTAcc, []) + end; get_text_field_values(Bin, TAcc) when is_binary(Bin) -> make_text_field(TAcc, <<"string">>, Bin); @@ -227,6 +238,10 @@ get_default_enabled(Props) -> end. +get_index_array_lengths(Props) -> + couch_util:get_value(<<"index_array_lengths">>, Props, true). + + add_default_text_field(Fields) -> DefaultFields = add_default_text_field(Fields, []), DefaultFields ++ Fields.
