Repository: couchdb-mango
Updated Branches:
  refs/heads/master a297e2e9e -> 090dc6750


Provide an ability to disable the indexing of array lengths.

Depending on the data shape, Cloudant Query can end up creating many
thousands of unique fields, which leads to JVM heap exhaustion: Lucene
tries to cache information about every field and is not designed to
handle many thousands of fields.
This change allows the user to disable the indexing of array length
fields, so that they don’t need to take the performance hit if they
don’t plan to use those fields in their queries (the $size operator).

An array length field is a single extra field per unique path to an array.
The case where we found this was a client whose documents used arbitrary
data as keys, which exploded the number of fields in Lucene. The obvious fix
was to switch to indexing only what they wanted to query on. Unfortunately,
that didn't prevent the automatically generated array length fields from
being created. This patch is a big hammer that removes the auto-generated
array length fields, which may be generally useful, though we're also
planning another patch that removes array length fields for anything that's
not specified in the index's field list.

Add index_array_lengths to the list of valid fields in the index
document, so that an index document containing this field passes
validation, and enforce that its value is a boolean.


Project: http://git-wip-us.apache.org/repos/asf/couchdb-mango/repo
Commit: http://git-wip-us.apache.org/repos/asf/couchdb-mango/commit/bf44d0fe
Tree: http://git-wip-us.apache.org/repos/asf/couchdb-mango/tree/bf44d0fe
Diff: http://git-wip-us.apache.org/repos/asf/couchdb-mango/diff/bf44d0fe

Branch: refs/heads/master
Commit: bf44d0fe3869386e5fdcf99f6c690dc8498f213b
Parents: a297e2e
Author: brkolla <[email protected]>
Authored: Thu Oct 22 20:16:42 2015 -0400
Committer: brkolla <[email protected]>
Committed: Tue Oct 27 14:18:14 2015 -0400

----------------------------------------------------------------------
 src/mango_idx_text.erl    |  6 ++++++
 src/mango_native_proc.erl | 27 +++++++++++++++++++++------
 2 files changed, 27 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/couchdb-mango/blob/bf44d0fe/src/mango_idx_text.erl
----------------------------------------------------------------------
diff --git a/src/mango_idx_text.erl b/src/mango_idx_text.erl
index fcd2939..9ade6e2 100644
--- a/src/mango_idx_text.erl
+++ b/src/mango_idx_text.erl
@@ -219,6 +219,12 @@ opts() ->
             {optional, true},
             {default, []},
             {validator, fun ?MODULE:validate_fields/1}
+        ]},
+        {<<"index_array_lengths">>, [
+            {tag, index_array_lengths},
+            {optional, true},
+            {default, true},
+            {validator, fun mango_opts:is_boolean/1}
         ]}
     ].
 

http://git-wip-us.apache.org/repos/asf/couchdb-mango/blob/bf44d0fe/src/mango_native_proc.erl
----------------------------------------------------------------------
diff --git a/src/mango_native_proc.erl b/src/mango_native_proc.erl
index 822d173..6d0fb24 100644
--- a/src/mango_native_proc.erl
+++ b/src/mango_native_proc.erl
@@ -40,6 +40,7 @@
 
 
 -record(tacc, {
+    index_array_lengths = true,
     fields = all_fields,
     path = []
 }).
@@ -164,8 +165,12 @@ get_text_entries({IdxProps}, Doc) ->
 
 get_text_entries0(IdxProps, Doc) ->
     DefaultEnabled = get_default_enabled(IdxProps),
+    IndexArrayLengths = get_index_array_lengths(IdxProps),
     FieldsList = get_text_field_list(IdxProps),
-    TAcc = #tacc{fields = FieldsList},
+    TAcc = #tacc{
+        index_array_lengths = IndexArrayLengths,
+        fields = FieldsList
+    },
     Fields0 = get_text_field_values(Doc, TAcc),
     Fields = if not DefaultEnabled -> Fields0; true ->
         add_default_text_field(Fields0)
@@ -179,13 +184,19 @@ get_text_field_values({Props}, TAcc) when is_list(Props) 
->
     get_text_field_values_obj(Props, TAcc, []);
 
 get_text_field_values(Values, TAcc) when is_list(Values) ->
+    IndexArrayLengths = TAcc#tacc.index_array_lengths,
     NewPath = ["[]" | TAcc#tacc.path],
     NewTAcc = TAcc#tacc{path = NewPath},
-    % We bypass make_text_field and directly call make_text_field_name
-    % because the length field name is not part of the path.
-    LengthFieldName = make_text_field_name(NewTAcc#tacc.path, <<"length">>),
-    LengthField = [{LengthFieldName, <<"length">>, length(Values)}],
-    get_text_field_values_arr(Values, NewTAcc, LengthField);
+    case IndexArrayLengths of 
+        true ->
+            % We bypass make_text_field and directly call make_text_field_name
+            % because the length field name is not part of the path.
+            LengthFieldName = make_text_field_name(NewTAcc#tacc.path, 
<<"length">>),
+            LengthField = [{LengthFieldName, <<"length">>, length(Values)}],
+            get_text_field_values_arr(Values, NewTAcc, LengthField);
+        _ ->
+            get_text_field_values_arr(Values, NewTAcc, [])
+    end;
 
 get_text_field_values(Bin, TAcc) when is_binary(Bin) ->
     make_text_field(TAcc, <<"string">>, Bin);
@@ -227,6 +238,10 @@ get_default_enabled(Props) ->
     end.
 
 
+get_index_array_lengths(Props) ->
+    couch_util:get_value(<<"index_array_lengths">>, Props, true).
+
+
 add_default_text_field(Fields) ->
     DefaultFields = add_default_text_field(Fields, []),
     DefaultFields ++ Fields.

Reply via email to