[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r300798144 ## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/VarLengthBytesValueReaderWriter.java ## @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import com.google.common.base.Preconditions; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import org.apache.pinot.common.utils.StringUtil; +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + +/** + * Implementation of {@link ValueReader} that will allow each value to be of variable + * length and there by avoiding the unnecessary padding. + * + * The layout of the file is as follows: + * Header Section: + * + *Magic header byte sequence. Review comment: Fixed this and pushed my changes again. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r300774780 ## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/VarLengthBytesValueReaderWriter.java ## @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import com.google.common.base.Preconditions; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import org.apache.pinot.common.utils.StringUtil; +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + +/** + * Implementation of {@link ValueReader} that will allow each value to be of variable + * length and there by avoiding the unnecessary padding. + * + * The layout of the file is as follows: + * Header Section: + * + *Magic header byte sequence. + * + * + * Values: + * + *Integer offsets to start position of byte arrays. + *All byte arrays. + * + * + * Only sequential writes are supported. + * + * @see FixedByteValueReaderWriter + */ +public class VarLengthBytesValueReaderWriter implements Closeable, ValueReader { + + /** + * Header used to identify the dictionary files written in variable length bytes + * format. Keeping this a mix of alphanumeric with special character to avoid + * collisions with the regular int/string dictionary values written in fixed size + * format. + */ + private static final byte[] MAGIC_HEADER = StringUtil.encodeUtf8("vl1;"); + private static final int MAGIC_HEADER_LENGTH = MAGIC_HEADER.length; + private static final int INDEX_ARRAY_START_OFFSET = MAGIC_HEADER.length + Integer.BYTES; + + private final PinotDataBuffer _dataBuffer; + private transient int _numElements; + + public VarLengthBytesValueReaderWriter(PinotDataBuffer dataBuffer) { +this._dataBuffer = dataBuffer; + } + + public static long getRequiredSize(byte[][] byteArrays) { +// First include the magic header, length field and offsets array. +long length = INDEX_ARRAY_START_OFFSET + Integer.BYTES * (byteArrays.length); + +for (byte[] array: byteArrays) { + length += array.length; +} +return length; + } + + public static boolean isVarLengthBytesDictBuffer(PinotDataBuffer buffer) { +// If the buffer is smaller than header + numElements size, it's not var length dictionary. +if (buffer.size() > INDEX_ARRAY_START_OFFSET) { + byte[] header = new byte[MAGIC_HEADER_LENGTH]; + buffer.copyTo(0, header, 0, MAGIC_HEADER_LENGTH); + + if (Arrays.equals(MAGIC_HEADER, header)) { +// Also verify that there is a valid numElements value. +int numElements = buffer.getInt(MAGIC_HEADER_LENGTH); +return numElements > 0; + } +} + +return false; + } + + private void writeHeader() { +for (int offset = 0; offset < MAGIC_HEADER_LENGTH; offset++) { + _dataBuffer.putByte(offset, MAGIC_HEADER[offset]); +} + } + + public void init(byte[][] byteArrays) { +this._numElements = byteArrays.length; + +writeHeader(); + +// Add the number of elements as the first field in the buffer. +_dataBuffer.putInt(MAGIC_HEADER_LENGTH, byteArrays.length); + +// Then write the offset of each of the byte array in the data buffer. +int nextOffset = INDEX_ARRAY_START_OFFSET; +int nextArrayStartOffset = INDEX_ARRAY_START_OFFSET + Integer.BYTES * byteArrays.length; +for (byte[] array: byteArrays) { + _dataBuffer.putInt(nextOffset, nextArrayStartOffset); + nextOffset += Integer.BYTES; + nextArrayStartOffset += array.length; +} + +// Finally write the byte arrays. +nextArrayStartOffset = INDEX_ARRAY_START_OFFSET + Integer.BYTES * byteArrays.length; +for (byte[] array: byteArrays) { + _dataBuffer.readFrom(nextArrayStartOffset, array); + nextArrayStartOffset += array.length; +} + } + + int getNumElements() { +// Lazily initialize the numElements. +if (_numElements == 0) { + _numElements = _dataBuffer.getInt(MAGIC_HEADER_LENGTH); +
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r300765246 ## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/VarLengthBytesValueReaderWriter.java ## @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import com.google.common.base.Preconditions; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import org.apache.pinot.common.utils.StringUtil; +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + +/** + * Implementation of {@link ValueReader} that will allow each value to be of variable + * length and there by avoiding the unnecessary padding. + * + * The layout of the file is as follows: + * Header Section: + * + *Magic header byte sequence. + * + * + * Values: + * + *Integer offsets to start position of byte arrays. + *All byte arrays. + * + * + * Only sequential writes are supported. + * + * @see FixedByteValueReaderWriter + */ +public class VarLengthBytesValueReaderWriter implements Closeable, ValueReader { + + /** + * Header used to identify the dictionary files written in variable length bytes + * format. Keeping this a mix of alphanumeric with special character to avoid + * collisions with the regular int/string dictionary values written in fixed size + * format. + */ + private static final byte[] MAGIC_HEADER = StringUtil.encodeUtf8("vl1;"); + private static final int MAGIC_HEADER_LENGTH = MAGIC_HEADER.length; + private static final int INDEX_ARRAY_START_OFFSET = MAGIC_HEADER.length + Integer.BYTES; + + private final PinotDataBuffer _dataBuffer; + private transient int _numElements; + + public VarLengthBytesValueReaderWriter(PinotDataBuffer dataBuffer) { +this._dataBuffer = dataBuffer; + } + + public static long getRequiredSize(byte[][] byteArrays) { Review comment: Filed https://github.com/apache/incubator-pinot/issues/4393 to handle this. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r299708993 ## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/VarLengthBytesValueReaderWriter.java ## @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import com.google.common.base.Preconditions; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import org.apache.pinot.common.utils.StringUtil; +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + +/** + * Implementation of {@link ValueReader} that will allow each value to be of variable + * length and there by avoiding the unnecessary padding. + * + * The layout of the file is as follows: + * Header Section: + * + *Magic header byte sequence. + * + * + * Values: + * + *Integer offsets to start position of byte arrays. + *All byte arrays. + * + * + * Only sequential writes are supported. + * + * @see FixedByteValueReaderWriter + */ +public class VarLengthBytesValueReaderWriter implements Closeable, ValueReader { + + /** + * Header used to identify the dictionary files written in variable length bytes + * format. Keeping this a mix of alphanumeric with special character to avoid + * collisions with the regular int/string dictionary values written in fixed size + * format. + */ + private static final byte[] MAGIC_HEADER = StringUtil.encodeUtf8("vl1;"); + private static final int MAGIC_HEADER_LENGTH = MAGIC_HEADER.length; + private static final int INDEX_ARRAY_START_OFFSET = MAGIC_HEADER.length + Integer.BYTES; + + private final PinotDataBuffer _dataBuffer; + private transient int _numElements; + + public VarLengthBytesValueReaderWriter(PinotDataBuffer dataBuffer) { +this._dataBuffer = dataBuffer; + } + + public static long getRequiredSize(byte[][] byteArrays) { +// First include the magic header, length field and offsets array. +long length = INDEX_ARRAY_START_OFFSET + Integer.BYTES * (byteArrays.length); + +for (byte[] array: byteArrays) { + length += array.length; +} +return length; + } + + public static boolean isVarLengthBytesDictBuffer(PinotDataBuffer buffer) { +// If the buffer is smaller than header + numElements size, it's not var length dictionary. +if (buffer.size() > INDEX_ARRAY_START_OFFSET) { + byte[] header = new byte[MAGIC_HEADER_LENGTH]; + buffer.copyTo(0, header, 0, MAGIC_HEADER_LENGTH); + + if (Arrays.equals(MAGIC_HEADER, header)) { +// Also verify that there is a valid numElements value. +int numElements = buffer.getInt(MAGIC_HEADER_LENGTH); +return numElements > 0; + } +} + +return false; + } + + private void writeHeader() { +for (int offset = 0; offset < MAGIC_HEADER_LENGTH; offset++) { + _dataBuffer.putByte(offset, MAGIC_HEADER[offset]); +} + } + + public void init(byte[][] byteArrays) { +this._numElements = byteArrays.length; + +writeHeader(); + +// Add the number of elements as the first field in the buffer. +_dataBuffer.putInt(MAGIC_HEADER_LENGTH, byteArrays.length); + +// Then write the offset of each of the byte array in the data buffer. +int nextOffset = INDEX_ARRAY_START_OFFSET; +int nextArrayStartOffset = INDEX_ARRAY_START_OFFSET + Integer.BYTES * byteArrays.length; +for (byte[] array: byteArrays) { + _dataBuffer.putInt(nextOffset, nextArrayStartOffset); + nextOffset += Integer.BYTES; + nextArrayStartOffset += array.length; +} + +// Finally write the byte arrays. +nextArrayStartOffset = INDEX_ARRAY_START_OFFSET + Integer.BYTES * byteArrays.length; +for (byte[] array: byteArrays) { + _dataBuffer.readFrom(nextArrayStartOffset, array); + nextArrayStartOffset += array.length; +} + } + + int getNumElements() { +// Lazily initialize the numElements. +if (_numElements == 0) { Review comment: Initializing this to `-1` now. @mcvsubbu we can do that too but I
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r300764157 ## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/VarLengthBytesValueReaderWriter.java ## @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import com.google.common.base.Preconditions; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import org.apache.pinot.common.utils.StringUtil; +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + +/** + * Implementation of {@link ValueReader} that will allow each value to be of variable + * length and there by avoiding the unnecessary padding. + * + * The layout of the file is as follows: + * Header Section: + * + *Magic header byte sequence. + * + * + * Values: + * + *Integer offsets to start position of byte arrays. + *All byte arrays. + * + * + * Only sequential writes are supported. + * + * @see FixedByteValueReaderWriter + */ +public class VarLengthBytesValueReaderWriter implements Closeable, ValueReader { + + /** + * Header used to identify the dictionary files written in variable length bytes + * format. Keeping this a mix of alphanumeric with special character to avoid + * collisions with the regular int/string dictionary values written in fixed size + * format. + */ + private static final byte[] MAGIC_HEADER = StringUtil.encodeUtf8("vl1;"); Review comment: I've added version number in the header now which should let us evolve the store. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r300053802 ## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/VarLengthBytesValueReaderWriter.java ## @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import com.google.common.base.Preconditions; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import org.apache.pinot.common.utils.StringUtil; +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + +/** + * Implementation of {@link ValueReader} that will allow each value to be of variable + * length and there by avoiding the unnecessary padding. + * + * The layout of the file is as follows: + * Header Section: + * + *Magic header byte sequence. + * + * + * Values: + * + *Integer offsets to start position of byte arrays. + *All byte arrays. + * + * + * Only sequential writes are supported. + * + * @see FixedByteValueReaderWriter + */ +public class VarLengthBytesValueReaderWriter implements Closeable, ValueReader { + + /** + * Header used to identify the dictionary files written in variable length bytes + * format. Keeping this a mix of alphanumeric with special character to avoid + * collisions with the regular int/string dictionary values written in fixed size + * format. + */ + private static final byte[] MAGIC_HEADER = StringUtil.encodeUtf8("vl1;"); + private static final int MAGIC_HEADER_LENGTH = MAGIC_HEADER.length; + private static final int INDEX_ARRAY_START_OFFSET = MAGIC_HEADER.length + Integer.BYTES; + + private final PinotDataBuffer _dataBuffer; + private transient int _numElements; + + public VarLengthBytesValueReaderWriter(PinotDataBuffer dataBuffer) { +this._dataBuffer = dataBuffer; + } + + public static long getRequiredSize(byte[][] byteArrays) { +// First include the magic header, length field and offsets array. +long length = INDEX_ARRAY_START_OFFSET + Integer.BYTES * (byteArrays.length); + +for (byte[] array: byteArrays) { + length += array.length; +} +return length; + } + + public static boolean isVarLengthBytesDictBuffer(PinotDataBuffer buffer) { +// If the buffer is smaller than header + numElements size, it's not var length dictionary. +if (buffer.size() > INDEX_ARRAY_START_OFFSET) { + byte[] header = new byte[MAGIC_HEADER_LENGTH]; + buffer.copyTo(0, header, 0, MAGIC_HEADER_LENGTH); + + if (Arrays.equals(MAGIC_HEADER, header)) { +// Also verify that there is a valid numElements value. +int numElements = buffer.getInt(MAGIC_HEADER_LENGTH); +return numElements > 0; + } +} + +return false; + } + + private void writeHeader() { +for (int offset = 0; offset < MAGIC_HEADER_LENGTH; offset++) { + _dataBuffer.putByte(offset, MAGIC_HEADER[offset]); +} + } + + public void init(byte[][] byteArrays) { +this._numElements = byteArrays.length; + +writeHeader(); + +// Add the number of elements as the first field in the buffer. +_dataBuffer.putInt(MAGIC_HEADER_LENGTH, byteArrays.length); + +// Then write the offset of each of the byte array in the data buffer. +int nextOffset = INDEX_ARRAY_START_OFFSET; +int nextArrayStartOffset = INDEX_ARRAY_START_OFFSET + Integer.BYTES * byteArrays.length; +for (byte[] array: byteArrays) { + _dataBuffer.putInt(nextOffset, nextArrayStartOffset); + nextOffset += Integer.BYTES; + nextArrayStartOffset += array.length; +} + +// Finally write the byte arrays. +nextArrayStartOffset = INDEX_ARRAY_START_OFFSET + Integer.BYTES * byteArrays.length; +for (byte[] array: byteArrays) { + _dataBuffer.readFrom(nextArrayStartOffset, array); + nextArrayStartOffset += array.length; +} + } + + int getNumElements() { +// Lazily initialize the numElements. +if (_numElements == 0) { + _numElements = _dataBuffer.getInt(MAGIC_HEADER_LENGTH); +
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r300053298 ## File path: pinot-core/src/main/java/org/apache/pinot/core/realtime/converter/RealtimeSegmentConverter.java ## @@ -54,10 +54,12 @@ private List invertedIndexColumns; private List noDictionaryColumns; private StarTreeIndexSpec starTreeIndexSpec; + private List varLengthDictionaryColumns; public RealtimeSegmentConverter(MutableSegmentImpl realtimeSegment, String outputPath, Schema schema, String tableName, String timeColumnName, String segmentName, String sortedColumn, - List invertedIndexColumns, List noDictionaryColumns, StarTreeIndexSpec starTreeIndexSpec) { + List invertedIndexColumns, List noDictionaryColumns, + List varLengthDictionaryColumns, StarTreeIndexSpec starTreeIndexSpec) { Review comment: I've not spent enough time on this part of code to give my take on what's the best thing to do. If one of you can please log an issue with more details, I'll take it up next, check the code and can make the changes finally. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r299742746 ## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/VarLengthBytesValueReaderWriter.java ## @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import com.google.common.base.Preconditions; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import org.apache.pinot.common.utils.StringUtil; +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + +/** + * Implementation of {@link ValueReader} that will allow each value to be of variable + * length and there by avoiding the unnecessary padding. + * + * The layout of the file is as follows: + * Header Section: + * + *Magic header byte sequence. + * + * + * Values: + * + *Integer offsets to start position of byte arrays. + *All byte arrays. + * + * + * Only sequential writes are supported. + * + * @see FixedByteValueReaderWriter + */ +public class VarLengthBytesValueReaderWriter implements Closeable, ValueReader { Review comment: Filed https://github.com/apache/incubator-pinot/issues/4393 to do the cleanup. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r299741474 ## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/VarLengthBytesValueReaderWriter.java ## @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import com.google.common.base.Preconditions; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import org.apache.pinot.common.utils.StringUtil; +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + +/** + * Implementation of {@link ValueReader} that will allow each value to be of variable + * length and there by avoiding the unnecessary padding. + * + * The layout of the file is as follows: + * Header Section: + * + *Magic header byte sequence. + * + * + * Values: + * + *Integer offsets to start position of byte arrays. + *All byte arrays. + * + * + * Only sequential writes are supported. + * + * @see FixedByteValueReaderWriter + */ +public class VarLengthBytesValueReaderWriter implements Closeable, ValueReader { + + /** + * Header used to identify the dictionary files written in variable length bytes + * format. Keeping this a mix of alphanumeric with special character to avoid + * collisions with the regular int/string dictionary values written in fixed size + * format. + */ + private static final byte[] MAGIC_HEADER = StringUtil.encodeUtf8("vl1;"); + private static final int MAGIC_HEADER_LENGTH = MAGIC_HEADER.length; + private static final int INDEX_ARRAY_START_OFFSET = MAGIC_HEADER.length + Integer.BYTES; + + private final PinotDataBuffer _dataBuffer; + private transient int _numElements; + + public VarLengthBytesValueReaderWriter(PinotDataBuffer dataBuffer) { +this._dataBuffer = dataBuffer; + } + + public static long getRequiredSize(byte[][] byteArrays) { +// First include the magic header, length field and offsets array. +long length = INDEX_ARRAY_START_OFFSET + Integer.BYTES * (byteArrays.length); + +for (byte[] array: byteArrays) { + length += array.length; +} +return length; + } + + public static boolean isVarLengthBytesDictBuffer(PinotDataBuffer buffer) { +// If the buffer is smaller than header + numElements size, it's not var length dictionary. +if (buffer.size() > INDEX_ARRAY_START_OFFSET) { + byte[] header = new byte[MAGIC_HEADER_LENGTH]; + buffer.copyTo(0, header, 0, MAGIC_HEADER_LENGTH); + + if (Arrays.equals(MAGIC_HEADER, header)) { +// Also verify that there is a valid numElements value. +int numElements = buffer.getInt(MAGIC_HEADER_LENGTH); +return numElements > 0; + } +} + +return false; + } + + private void writeHeader() { +for (int offset = 0; offset < MAGIC_HEADER_LENGTH; offset++) { + _dataBuffer.putByte(offset, MAGIC_HEADER[offset]); +} + } + + public void init(byte[][] byteArrays) { +this._numElements = byteArrays.length; + +writeHeader(); + +// Add the number of elements as the first field in the buffer. +_dataBuffer.putInt(MAGIC_HEADER_LENGTH, byteArrays.length); + +// Then write the offset of each of the byte array in the data buffer. +int nextOffset = INDEX_ARRAY_START_OFFSET; +int nextArrayStartOffset = INDEX_ARRAY_START_OFFSET + Integer.BYTES * byteArrays.length; +for (byte[] array: byteArrays) { + _dataBuffer.putInt(nextOffset, nextArrayStartOffset); + nextOffset += Integer.BYTES; + nextArrayStartOffset += array.length; +} + +// Finally write the byte arrays. +nextArrayStartOffset = INDEX_ARRAY_START_OFFSET + Integer.BYTES * byteArrays.length; +for (byte[] array: byteArrays) { + _dataBuffer.readFrom(nextArrayStartOffset, array); + nextArrayStartOffset += array.length; +} + } + + int getNumElements() { +// Lazily initialize the numElements. +if (_numElements == 0) { + _numElements = _dataBuffer.getInt(MAGIC_HEADER_LENGTH); +
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r299717989 ## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/VarLengthBytesValueReaderWriter.java ## @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import com.google.common.base.Preconditions; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import org.apache.pinot.common.utils.StringUtil; +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + +/** + * Implementation of {@link ValueReader} that will allow each value to be of variable + * length and there by avoiding the unnecessary padding. + * + * The layout of the file is as follows: + * Header Section: + * + *Magic header byte sequence. + * + * + * Values: + * + *Integer offsets to start position of byte arrays. + *All byte arrays. + * + * + * Only sequential writes are supported. + * + * @see FixedByteValueReaderWriter + */ +public class VarLengthBytesValueReaderWriter implements Closeable, ValueReader { + + /** + * Header used to identify the dictionary files written in variable length bytes + * format. Keeping this a mix of alphanumeric with special character to avoid + * collisions with the regular int/string dictionary values written in fixed size + * format. + */ + private static final byte[] MAGIC_HEADER = StringUtil.encodeUtf8("vl1;"); + private static final int MAGIC_HEADER_LENGTH = MAGIC_HEADER.length; + private static final int INDEX_ARRAY_START_OFFSET = MAGIC_HEADER.length + Integer.BYTES; + + private final PinotDataBuffer _dataBuffer; + private transient int _numElements; + + public VarLengthBytesValueReaderWriter(PinotDataBuffer dataBuffer) { +this._dataBuffer = dataBuffer; + } + + public static long getRequiredSize(byte[][] byteArrays) { +// First include the magic header, length field and offsets array. +long length = INDEX_ARRAY_START_OFFSET + Integer.BYTES * (byteArrays.length); + +for (byte[] array: byteArrays) { + length += array.length; +} +return length; + } + + public static boolean isVarLengthBytesDictBuffer(PinotDataBuffer buffer) { +// If the buffer is smaller than header + numElements size, it's not var length dictionary. +if (buffer.size() > INDEX_ARRAY_START_OFFSET) { + byte[] header = new byte[MAGIC_HEADER_LENGTH]; + buffer.copyTo(0, header, 0, MAGIC_HEADER_LENGTH); + + if (Arrays.equals(MAGIC_HEADER, header)) { Review comment: I've added version in the latest commit which will be used for the evolution of this store/data structure. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r299717784 ## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/VarLengthBytesValueReaderWriter.java ## @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import com.google.common.base.Preconditions; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import org.apache.pinot.common.utils.StringUtil; +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + +/** + * Implementation of {@link ValueReader} that will allow each value to be of variable + * length and there by avoiding the unnecessary padding. + * + * The layout of the file is as follows: + * Header Section: + * + *Magic header byte sequence. Review comment: I've added version number in the most recent commit. About 2, having such offset will force us to read that value before reading any `byte[]` from the store. Currently, the read side code is simple because you know where the data starts already and can directly read. So, my take is, unless there is a strong reason, it might be better off leaving it like this. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r299708477 ## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/VarLengthBytesValueReaderWriter.java ## @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import com.google.common.base.Preconditions; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import org.apache.pinot.common.utils.StringUtil; +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + +/** + * Implementation of {@link ValueReader} that will allow each value to be of variable + * length and there by avoiding the unnecessary padding. + * + * The layout of the file is as follows: + * Header Section: + * + *Magic header byte sequence. + * + * + * Values: + * + *Integer offsets to start position of byte arrays. + *All byte arrays. + * + * + * Only sequential writes are supported. + * + * @see FixedByteValueReaderWriter + */ +public class VarLengthBytesValueReaderWriter implements Closeable, ValueReader { + + /** + * Header used to identify the dictionary files written in variable length bytes + * format. Keeping this a mix of alphanumeric with special character to avoid + * collisions with the regular int/string dictionary values written in fixed size + * format. + */ + private static final byte[] MAGIC_HEADER = StringUtil.encodeUtf8("vl1;"); + private static final int MAGIC_HEADER_LENGTH = MAGIC_HEADER.length; + private static final int INDEX_ARRAY_START_OFFSET = MAGIC_HEADER.length + Integer.BYTES; + + private final PinotDataBuffer _dataBuffer; + private transient int _numElements; + + public VarLengthBytesValueReaderWriter(PinotDataBuffer dataBuffer) { +this._dataBuffer = dataBuffer; + } + + public static long getRequiredSize(byte[][] byteArrays) { Review comment: Sure, will take up in another PR. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r299709846 ## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/VarLengthBytesValueReaderWriter.java ## @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import com.google.common.base.Preconditions; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import org.apache.pinot.common.utils.StringUtil; +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + +/** + * Implementation of {@link ValueReader} that will allow each value to be of variable + * length and there by avoiding the unnecessary padding. + * + * The layout of the file is as follows: + * Header Section: + * + *Magic header byte sequence. + * + * + * Values: + * + *Integer offsets to start position of byte arrays. + *All byte arrays. + * + * + * Only sequential writes are supported. + * + * @see FixedByteValueReaderWriter + */ +public class VarLengthBytesValueReaderWriter implements Closeable, ValueReader { + + /** + * Header used to identify the dictionary files written in variable length bytes + * format. Keeping this a mix of alphanumeric with special character to avoid + * collisions with the regular int/string dictionary values written in fixed size + * format. + */ + private static final byte[] MAGIC_HEADER = StringUtil.encodeUtf8("vl1;"); + private static final int MAGIC_HEADER_LENGTH = MAGIC_HEADER.length; + private static final int INDEX_ARRAY_START_OFFSET = MAGIC_HEADER.length + Integer.BYTES; + + private final PinotDataBuffer _dataBuffer; + private transient int _numElements; + + public VarLengthBytesValueReaderWriter(PinotDataBuffer dataBuffer) { +this._dataBuffer = dataBuffer; + } + + public static long getRequiredSize(byte[][] byteArrays) { +// First include the magic header, length field and offsets array. +long length = INDEX_ARRAY_START_OFFSET + Integer.BYTES * (byteArrays.length); + +for (byte[] array: byteArrays) { + length += array.length; +} +return length; + } + + public static boolean isVarLengthBytesDictBuffer(PinotDataBuffer buffer) { +// If the buffer is smaller than header + numElements size, it's not var length dictionary. +if (buffer.size() > INDEX_ARRAY_START_OFFSET) { + byte[] header = new byte[MAGIC_HEADER_LENGTH]; + buffer.copyTo(0, header, 0, MAGIC_HEADER_LENGTH); + + if (Arrays.equals(MAGIC_HEADER, header)) { +// Also verify that there is a valid numElements value. +int numElements = buffer.getInt(MAGIC_HEADER_LENGTH); +return numElements > 0; + } +} + +return false; + } + + private void writeHeader() { +for (int offset = 0; offset < MAGIC_HEADER_LENGTH; offset++) { + _dataBuffer.putByte(offset, MAGIC_HEADER[offset]); +} + } + + public void init(byte[][] byteArrays) { +this._numElements = byteArrays.length; + +writeHeader(); + +// Add the number of elements as the first field in the buffer. +_dataBuffer.putInt(MAGIC_HEADER_LENGTH, byteArrays.length); + +// Then write the offset of each of the byte array in the data buffer. +int nextOffset = INDEX_ARRAY_START_OFFSET; +int nextArrayStartOffset = INDEX_ARRAY_START_OFFSET + Integer.BYTES * byteArrays.length; +for (byte[] array: byteArrays) { + _dataBuffer.putInt(nextOffset, nextArrayStartOffset); + nextOffset += Integer.BYTES; + nextArrayStartOffset += array.length; +} + +// Finally write the byte arrays. +nextArrayStartOffset = INDEX_ARRAY_START_OFFSET + Integer.BYTES * byteArrays.length; +for (byte[] array: byteArrays) { + _dataBuffer.readFrom(nextArrayStartOffset, array); + nextArrayStartOffset += array.length; +} + } + + int getNumElements() { +// Lazily initialize the numElements. +if (_numElements == 0) { + _numElements = _dataBuffer.getInt(MAGIC_HEADER_LENGTH); +
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r299712221 ## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/VarLengthBytesValueReaderWriter.java ## @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import com.google.common.base.Preconditions; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import org.apache.pinot.common.utils.StringUtil; +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + +/** + * Implementation of {@link ValueReader} that will allow each value to be of variable + * length and there by avoiding the unnecessary padding. + * + * The layout of the file is as follows: + * Header Section: + * + *Magic header byte sequence. + * + * + * Values: + * + *Integer offsets to start position of byte arrays. + *All byte arrays. + * + * + * Only sequential writes are supported. + * + * @see FixedByteValueReaderWriter + */ +public class VarLengthBytesValueReaderWriter implements Closeable, ValueReader { + + /** + * Header used to identify the dictionary files written in variable length bytes + * format. Keeping this a mix of alphanumeric with special character to avoid + * collisions with the regular int/string dictionary values written in fixed size + * format. + */ + private static final byte[] MAGIC_HEADER = StringUtil.encodeUtf8("vl1;"); + private static final int MAGIC_HEADER_LENGTH = MAGIC_HEADER.length; + private static final int INDEX_ARRAY_START_OFFSET = MAGIC_HEADER.length + Integer.BYTES; + + private final PinotDataBuffer _dataBuffer; + private transient int _numElements; + + public VarLengthBytesValueReaderWriter(PinotDataBuffer dataBuffer) { +this._dataBuffer = dataBuffer; + } + + public static long getRequiredSize(byte[][] byteArrays) { +// First include the magic header, length field and offsets array. +long length = INDEX_ARRAY_START_OFFSET + Integer.BYTES * (byteArrays.length); + +for (byte[] array: byteArrays) { + length += array.length; +} +return length; + } + + public static boolean isVarLengthBytesDictBuffer(PinotDataBuffer buffer) { +// If the buffer is smaller than header + numElements size, it's not var length dictionary. +if (buffer.size() > INDEX_ARRAY_START_OFFSET) { + byte[] header = new byte[MAGIC_HEADER_LENGTH]; + buffer.copyTo(0, header, 0, MAGIC_HEADER_LENGTH); + + if (Arrays.equals(MAGIC_HEADER, header)) { +// Also verify that there is a valid numElements value. +int numElements = buffer.getInt(MAGIC_HEADER_LENGTH); +return numElements > 0; + } +} + +return false; + } + + private void writeHeader() { +for (int offset = 0; offset < MAGIC_HEADER_LENGTH; offset++) { + _dataBuffer.putByte(offset, MAGIC_HEADER[offset]); +} + } + + public void init(byte[][] byteArrays) { +this._numElements = byteArrays.length; + +writeHeader(); + +// Add the number of elements as the first field in the buffer. +_dataBuffer.putInt(MAGIC_HEADER_LENGTH, byteArrays.length); + +// Then write the offset of each of the byte array in the data buffer. +int nextOffset = INDEX_ARRAY_START_OFFSET; +int nextArrayStartOffset = INDEX_ARRAY_START_OFFSET + Integer.BYTES * byteArrays.length; +for (byte[] array: byteArrays) { + _dataBuffer.putInt(nextOffset, nextArrayStartOffset); + nextOffset += Integer.BYTES; + nextArrayStartOffset += array.length; +} + +// Finally write the byte arrays. +nextArrayStartOffset = INDEX_ARRAY_START_OFFSET + Integer.BYTES * byteArrays.length; +for (byte[] array: byteArrays) { + _dataBuffer.readFrom(nextArrayStartOffset, array); + nextArrayStartOffset += array.length; +} + } + + int getNumElements() { +// Lazily initialize the numElements. +if (_numElements == 0) { + _numElements = _dataBuffer.getInt(MAGIC_HEADER_LENGTH); +
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r299708186 ## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/VarLengthBytesValueReaderWriter.java ## @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import com.google.common.base.Preconditions; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import org.apache.pinot.common.utils.StringUtil; +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + +/** + * Implementation of {@link ValueReader} that will allow each value to be of variable + * length and there by avoiding the unnecessary padding. + * + * The layout of the file is as follows: + * Header Section: + * + *Magic header byte sequence. + * + * + * Values: + * + *Integer offsets to start position of byte arrays. + *All byte arrays. + * + * + * Only sequential writes are supported. + * + * @see FixedByteValueReaderWriter + */ +public class VarLengthBytesValueReaderWriter implements Closeable, ValueReader { Review comment: Yes. I've discuss this with @kishoreg in-person but the gist of it is, this new class is named exactly on the lines of an existing class `FixedByteValueReaderWriter` to be consistent. I'll do a new PR cleaning up both of these. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r299710857 ## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/VarLengthBytesValueReaderWriter.java ## @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import com.google.common.base.Preconditions; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import org.apache.pinot.common.utils.StringUtil; +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + +/** + * Implementation of {@link ValueReader} that will allow each value to be of variable + * length and there by avoiding the unnecessary padding. + * + * The layout of the file is as follows: + * Header Section: + * + *Magic header byte sequence. + * + * + * Values: + * + *Integer offsets to start position of byte arrays. + *All byte arrays. + * + * + * Only sequential writes are supported. + * + * @see FixedByteValueReaderWriter + */ +public class VarLengthBytesValueReaderWriter implements Closeable, ValueReader { + + /** + * Header used to identify the dictionary files written in variable length bytes + * format. Keeping this a mix of alphanumeric with special character to avoid + * collisions with the regular int/string dictionary values written in fixed size + * format. + */ + private static final byte[] MAGIC_HEADER = StringUtil.encodeUtf8("vl1;"); + private static final int MAGIC_HEADER_LENGTH = MAGIC_HEADER.length; + private static final int INDEX_ARRAY_START_OFFSET = MAGIC_HEADER.length + Integer.BYTES; + + private final PinotDataBuffer _dataBuffer; + private transient int _numElements; + + public VarLengthBytesValueReaderWriter(PinotDataBuffer dataBuffer) { +this._dataBuffer = dataBuffer; + } + + public static long getRequiredSize(byte[][] byteArrays) { +// First include the magic header, length field and offsets array. +long length = INDEX_ARRAY_START_OFFSET + Integer.BYTES * (byteArrays.length); + +for (byte[] array: byteArrays) { + length += array.length; +} +return length; + } + + public static boolean isVarLengthBytesDictBuffer(PinotDataBuffer buffer) { +// If the buffer is smaller than header + numElements size, it's not var length dictionary. +if (buffer.size() > INDEX_ARRAY_START_OFFSET) { + byte[] header = new byte[MAGIC_HEADER_LENGTH]; + buffer.copyTo(0, header, 0, MAGIC_HEADER_LENGTH); + + if (Arrays.equals(MAGIC_HEADER, header)) { +// Also verify that there is a valid numElements value. +int numElements = buffer.getInt(MAGIC_HEADER_LENGTH); +return numElements > 0; + } +} + +return false; + } + + private void writeHeader() { +for (int offset = 0; offset < MAGIC_HEADER_LENGTH; offset++) { + _dataBuffer.putByte(offset, MAGIC_HEADER[offset]); +} + } + + public void init(byte[][] byteArrays) { +this._numElements = byteArrays.length; + +writeHeader(); + +// Add the number of elements as the first field in the buffer. +_dataBuffer.putInt(MAGIC_HEADER_LENGTH, byteArrays.length); + +// Then write the offset of each of the byte array in the data buffer. +int nextOffset = INDEX_ARRAY_START_OFFSET; +int nextArrayStartOffset = INDEX_ARRAY_START_OFFSET + Integer.BYTES * byteArrays.length; +for (byte[] array: byteArrays) { + _dataBuffer.putInt(nextOffset, nextArrayStartOffset); + nextOffset += Integer.BYTES; + nextArrayStartOffset += array.length; +} + +// Finally write the byte arrays. +nextArrayStartOffset = INDEX_ARRAY_START_OFFSET + Integer.BYTES * byteArrays.length; +for (byte[] array: byteArrays) { + _dataBuffer.readFrom(nextArrayStartOffset, array); + nextArrayStartOffset += array.length; +} + } + + int getNumElements() { +// Lazily initialize the numElements. +if (_numElements == 0) { + _numElements = _dataBuffer.getInt(MAGIC_HEADER_LENGTH); +
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r299713698 ## File path: pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentDictionaryCreator.java ## @@ -211,6 +203,39 @@ public void build() } } + private void writeBytesValueDictionary(int numValues, byte[][] sortedByteArrays) throws IOException { + +if (_useVarLengthDictionary) { + // Backward-compatible: index file is always big-endian + long size = VarLengthBytesValueReaderWriter.getRequiredSize(sortedByteArrays); + try (PinotDataBuffer dataBuffer = PinotDataBuffer + .mapFile(_dictionaryFile, false, 0, size, ByteOrder.BIG_ENDIAN, + getClass().getSimpleName()); + VarLengthBytesValueReaderWriter writer = new VarLengthBytesValueReaderWriter( + dataBuffer)) { +writer.init(sortedByteArrays); Review comment: I've renamed `init()` to `write()` to be clear. Please see my above comment for more details. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r299708993 ## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/VarLengthBytesValueReaderWriter.java ## @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import com.google.common.base.Preconditions; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import org.apache.pinot.common.utils.StringUtil; +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + +/** + * Implementation of {@link ValueReader} that will allow each value to be of variable + * length and there by avoiding the unnecessary padding. + * + * The layout of the file is as follows: + * Header Section: + * + *Magic header byte sequence. + * + * + * Values: + * + *Integer offsets to start position of byte arrays. + *All byte arrays. + * + * + * Only sequential writes are supported. + * + * @see FixedByteValueReaderWriter + */ +public class VarLengthBytesValueReaderWriter implements Closeable, ValueReader { + + /** + * Header used to identify the dictionary files written in variable length bytes + * format. Keeping this a mix of alphanumeric with special character to avoid + * collisions with the regular int/string dictionary values written in fixed size + * format. + */ + private static final byte[] MAGIC_HEADER = StringUtil.encodeUtf8("vl1;"); + private static final int MAGIC_HEADER_LENGTH = MAGIC_HEADER.length; + private static final int INDEX_ARRAY_START_OFFSET = MAGIC_HEADER.length + Integer.BYTES; + + private final PinotDataBuffer _dataBuffer; + private transient int _numElements; + + public VarLengthBytesValueReaderWriter(PinotDataBuffer dataBuffer) { +this._dataBuffer = dataBuffer; + } + + public static long getRequiredSize(byte[][] byteArrays) { +// First include the magic header, length field and offsets array. +long length = INDEX_ARRAY_START_OFFSET + Integer.BYTES * (byteArrays.length); + +for (byte[] array: byteArrays) { + length += array.length; +} +return length; + } + + public static boolean isVarLengthBytesDictBuffer(PinotDataBuffer buffer) { +// If the buffer is smaller than header + numElements size, it's not var length dictionary. +if (buffer.size() > INDEX_ARRAY_START_OFFSET) { + byte[] header = new byte[MAGIC_HEADER_LENGTH]; + buffer.copyTo(0, header, 0, MAGIC_HEADER_LENGTH); + + if (Arrays.equals(MAGIC_HEADER, header)) { +// Also verify that there is a valid numElements value. +int numElements = buffer.getInt(MAGIC_HEADER_LENGTH); +return numElements > 0; + } +} + +return false; + } + + private void writeHeader() { +for (int offset = 0; offset < MAGIC_HEADER_LENGTH; offset++) { + _dataBuffer.putByte(offset, MAGIC_HEADER[offset]); +} + } + + public void init(byte[][] byteArrays) { +this._numElements = byteArrays.length; + +writeHeader(); + +// Add the number of elements as the first field in the buffer. +_dataBuffer.putInt(MAGIC_HEADER_LENGTH, byteArrays.length); + +// Then write the offset of each of the byte array in the data buffer. +int nextOffset = INDEX_ARRAY_START_OFFSET; +int nextArrayStartOffset = INDEX_ARRAY_START_OFFSET + Integer.BYTES * byteArrays.length; +for (byte[] array: byteArrays) { + _dataBuffer.putInt(nextOffset, nextArrayStartOffset); + nextOffset += Integer.BYTES; + nextArrayStartOffset += array.length; +} + +// Finally write the byte arrays. +nextArrayStartOffset = INDEX_ARRAY_START_OFFSET + Integer.BYTES * byteArrays.length; +for (byte[] array: byteArrays) { + _dataBuffer.readFrom(nextArrayStartOffset, array); + nextArrayStartOffset += array.length; +} + } + + int getNumElements() { +// Lazily initialize the numElements. +if (_numElements == 0) { Review comment: Initializing this to `-1` now. @mcvsubbu we can do that too but I'm
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r299713449 ## File path: pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentDictionaryCreator.java ## @@ -211,6 +203,39 @@ public void build() } } + private void writeBytesValueDictionary(int numValues, byte[][] sortedByteArrays) throws IOException { + +if (_useVarLengthDictionary) { Review comment: Strictly speaking, `VarLengthBytesValueReaderWriter` and `FixedByteValueReaderWriter` are two implementations of readers but there is no common `Writer` interface here and hence the way the `byte[]` are written isn't uniform either. Like I mentioned in another comment, I'm planning to do another PR to rename the classes and `ValueReader.java` interface so I'll see if this needs more cleanup then. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r299692430 ## File path: pinot-core/src/main/java/org/apache/pinot/core/segment/index/loader/defaultcolumn/BaseDefaultColumnHandler.java ## @@ -311,12 +311,14 @@ protected void createColumnV1Indices(String column) totalDocs, maxNumberOfMultiValueElements); ColumnIndexCreationInfo columnIndexCreationInfo = -new ColumnIndexCreationInfo(columnStatistics, true/*createDictionary*/, ForwardIndexType.FIXED_BIT_COMPRESSED, +new ColumnIndexCreationInfo(columnStatistics, true/*createDictionary*/, Review comment: I'm passing `false` for 3rd parameter here. I'll reformat using Pinot style now. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r299698415 ## File path: pinot-core/src/main/java/org/apache/pinot/core/data/manager/realtime/LLRealtimeSegmentDataManager.java ## @@ -237,6 +237,7 @@ public void deleteSegmentFile() { private final String _timeColumnName; private final List _invertedIndexColumns; private final List _noDictionaryColumns; + private final List _varLengthDictionaryColumns; Review comment: Need this while converting Realtime segments to offline segments. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r299698286 ## File path: pinot-core/src/main/java/org/apache/pinot/core/data/manager/realtime/HLRealtimeSegmentDataManager.java ## @@ -145,6 +146,8 @@ public HLRealtimeSegmentDataManager(final RealtimeSegmentZKMetadata realtimeSegm // No DictionaryColumns noDictionaryColumns = new ArrayList<>(indexLoadingConfig.getNoDictionaryColumns()); +varLengthDictionaryColumns = new ArrayList<>(indexLoadingConfig.getVarLengthDictionaryColumns()); Review comment: That's because we need the configuration of whether to use a variable length dictionary for a column when we are converting Realtime segments into Offline segments. LMK if you know of any better ways of getting this config instead of carrying in realtime classes. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r299691978 ## File path: pinot-core/src/main/java/org/apache/pinot/core/segment/index/readers/ImmutableDictionaryReader.java ## @@ -244,6 +252,6 @@ protected String getPaddedString(int dictId, byte[] buffer) { } protected byte[] getBuffer() { -return new byte[_numBytesPerValue]; +return (_valueReader instanceof VarLengthBytesValueReaderWriter) ? null : new byte[_numBytesPerValue]; Review comment: Fixed. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r299690774 ## File path: pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentDictionaryCreator.java ## @@ -211,6 +203,39 @@ public void build() } } + private void writeBytesValueDictionary(int numValues, byte[][] sortedByteArrays) throws IOException { Review comment: Done. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r299690213 ## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/VarLengthBytesValueReaderWriter.java ## @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import com.google.common.base.Preconditions; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import org.apache.pinot.common.utils.StringUtil; +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + +/** + * Implementation of {@link ValueReader} that will allow each value to be of variable + * length and there by avoiding the unnecessary padding. + * + * The layout of the file is as follows: + * Header Section: + * + *Magic header byte sequence. + * + * + * Values: + * + *Integer offsets to start position of byte arrays. + *All byte arrays. + * + * + * Only sequential writes are supported. + * + * @see FixedByteValueReaderWriter + */ +public class VarLengthBytesValueReaderWriter implements Closeable, ValueReader { + + /** + * Header used to identify the dictionary files written in variable length bytes + * format. Keeping this a mix of alphanumeric with special character to avoid + * collisions with the regular int/string dictionary values written in fixed size + * format. + */ + private static final byte[] MAGIC_HEADER = StringUtil.encodeUtf8("vl1;"); + private static final int MAGIC_HEADER_LENGTH = MAGIC_HEADER.length; + private static final int INDEX_ARRAY_START_OFFSET = MAGIC_HEADER.length + Integer.BYTES; + + private final PinotDataBuffer _dataBuffer; + private transient int _numElements; + + public VarLengthBytesValueReaderWriter(PinotDataBuffer dataBuffer) { +this._dataBuffer = dataBuffer; + } + + public static long getRequiredSize(byte[][] byteArrays) { +// First include the magic header, length field and offsets array. +long length = INDEX_ARRAY_START_OFFSET + Integer.BYTES * (byteArrays.length); + +for (byte[] array: byteArrays) { + length += array.length; +} +return length; + } + + public static boolean isVarLengthBytesDictBuffer(PinotDataBuffer buffer) { +// If the buffer is smaller than header + numElements size, it's not var length dictionary. +if (buffer.size() > INDEX_ARRAY_START_OFFSET) { + byte[] header = new byte[MAGIC_HEADER_LENGTH]; + buffer.copyTo(0, header, 0, MAGIC_HEADER_LENGTH); + + if (Arrays.equals(MAGIC_HEADER, header)) { +// Also verify that there is a valid numElements value. +int numElements = buffer.getInt(MAGIC_HEADER_LENGTH); +return numElements > 0; + } +} + +return false; + } + + private void writeHeader() { +for (int offset = 0; offset < MAGIC_HEADER_LENGTH; offset++) { + _dataBuffer.putByte(offset, MAGIC_HEADER[offset]); +} + } + + public void init(byte[][] byteArrays) { +this._numElements = byteArrays.length; + +writeHeader(); + +// Add the number of elements as the first field in the buffer. +_dataBuffer.putInt(MAGIC_HEADER_LENGTH, byteArrays.length); + +// Then write the offset of each of the byte array in the data buffer. +int nextOffset = INDEX_ARRAY_START_OFFSET; +int nextArrayStartOffset = INDEX_ARRAY_START_OFFSET + Integer.BYTES * byteArrays.length; +for (byte[] array: byteArrays) { + _dataBuffer.putInt(nextOffset, nextArrayStartOffset); + nextOffset += Integer.BYTES; + nextArrayStartOffset += array.length; +} + +// Finally write the byte arrays. +nextArrayStartOffset = INDEX_ARRAY_START_OFFSET + Integer.BYTES * byteArrays.length; +for (byte[] array: byteArrays) { + _dataBuffer.readFrom(nextArrayStartOffset, array); + nextArrayStartOffset += array.length; +} + } + + int getNumElements() { +// Lazily initialize the numElements. +if (_numElements == 0) { + _numElements = _dataBuffer.getInt(MAGIC_HEADER_LENGTH); +
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r298916629 ## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/VarLengthBytesValueReaderWriter.java ## @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import com.google.common.base.Preconditions; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import org.apache.pinot.common.utils.StringUtil; +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + +/** + * Implementation of {@link ValueReader} that will allow each value to be of variable + * length and there by avoiding the unnecessary padding. + * + * The layout of the file is as follows: + * Header Section: + * + *Magic header byte sequence. + * + * + * Values: + * + *Integer offsets to start position of byte arrays. + *All byte arrays. + * + * + * Only sequential writes are supported. + * + * @see FixedByteValueReaderWriter + */ +public class VarLengthBytesValueReaderWriter implements Closeable, ValueReader { + + /** + * Header used to identify the dictionary files written in variable length bytes + * format. Keeping this a mix of alphanumeric with special character to avoid + * collisions with the regular int/string dictionary values written in fixed size + * format. + */ + private static final byte[] MAGIC_HEADER = StringUtil.encodeUtf8("vl1;"); + private static final int MAGIC_HEADER_LENGTH = MAGIC_HEADER.length; + private static final int INDEX_ARRAY_START_OFFSET = MAGIC_HEADER.length + Integer.BYTES; + + private final PinotDataBuffer _dataBuffer; + private transient int _numElements; + + public VarLengthBytesValueReaderWriter(PinotDataBuffer dataBuffer) { +this._dataBuffer = dataBuffer; + } + + public static long getRequiredSize(byte[][] byteArrays) { +// First include the magic header, length field and offsets array. +long length = INDEX_ARRAY_START_OFFSET + Integer.BYTES * (byteArrays.length); + +for (byte[] array: byteArrays) { + length += array.length; +} +return length; + } + + public static boolean isVarLengthBytesDictBuffer(PinotDataBuffer buffer) { +// If the buffer is smaller than header + numElements size, it's not var length dictionary. +if (buffer.size() > INDEX_ARRAY_START_OFFSET) { + byte[] header = new byte[MAGIC_HEADER_LENGTH]; + buffer.copyTo(0, header, 0, MAGIC_HEADER_LENGTH); + + if (Arrays.equals(MAGIC_HEADER, header)) { +// Also verify that there is a valid numElements value. +int numElements = buffer.getInt(MAGIC_HEADER_LENGTH); +return numElements > 0; + } +} + +return false; + } + + private void writeHeader() { +for (int offset = 0; offset < MAGIC_HEADER_LENGTH; offset++) { + _dataBuffer.putByte(offset, MAGIC_HEADER[offset]); +} + } + + public void init(byte[][] byteArrays) { +this._numElements = byteArrays.length; + +writeHeader(); + +// Add the number of elements as the first field in the buffer. +_dataBuffer.putInt(MAGIC_HEADER_LENGTH, byteArrays.length); + +// Then write the offset of each of the byte array in the data buffer. +int nextOffset = INDEX_ARRAY_START_OFFSET; +int nextArrayStartOffset = INDEX_ARRAY_START_OFFSET + Integer.BYTES * byteArrays.length; +for (byte[] array: byteArrays) { + _dataBuffer.putInt(nextOffset, nextArrayStartOffset); + nextOffset += Integer.BYTES; + nextArrayStartOffset += array.length; +} + +// Finally write the byte arrays. +nextArrayStartOffset = INDEX_ARRAY_START_OFFSET + Integer.BYTES * byteArrays.length; +for (byte[] array: byteArrays) { + _dataBuffer.readFrom(nextArrayStartOffset, array); + nextArrayStartOffset += array.length; +} + } + + int getNumElements() { +// Lazily initialize the numElements. +if (_numElements == 0) { + _numElements = _dataBuffer.getInt(MAGIC_HEADER_LENGTH); +
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r298915186 ## File path: pinot-core/src/main/java/org/apache/pinot/core/segment/creator/impl/SegmentDictionaryCreator.java ## @@ -234,7 +259,7 @@ public int indexOfSV(Object value) { } } - public int[] indexOfMV(Object value) { + int[] indexOfMV(Object value) { Review comment: I don't think the `public` scope is needed, hence removed but i'll revert that change. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r298913207 ## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/VarLengthBytesValueReaderWriter.java ## @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import com.google.common.base.Preconditions; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import org.apache.pinot.common.utils.StringUtil; +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + +/** + * Implementation of {@link ValueReader} that will allow each value to be of variable + * length and there by avoiding the unnecessary padding. + * + * The layout of the file is as follows: + * Header Section: + * + *Magic header byte sequence. + * + * + * Values: + * + *Integer offsets to start position of byte arrays. + *All byte arrays. + * + * + * Only sequential writes are supported. + * + * @see FixedByteValueReaderWriter + */ +public class VarLengthBytesValueReaderWriter implements Closeable, ValueReader { + + /** + * Header used to identify the dictionary files written in variable length bytes + * format. Keeping this a mix of alphanumeric with special character to avoid + * collisions with the regular int/string dictionary values written in fixed size + * format. + */ + private static final byte[] MAGIC_HEADER = StringUtil.encodeUtf8("vl1;"); + private static final int MAGIC_HEADER_LENGTH = MAGIC_HEADER.length; + private static final int INDEX_ARRAY_START_OFFSET = MAGIC_HEADER.length + Integer.BYTES; + + private final PinotDataBuffer _dataBuffer; + private transient int _numElements; + + public VarLengthBytesValueReaderWriter(PinotDataBuffer dataBuffer) { +this._dataBuffer = dataBuffer; + } + + public static long getRequiredSize(byte[][] byteArrays) { +// First include the magic header, length field and offsets array. +long length = INDEX_ARRAY_START_OFFSET + Integer.BYTES * (byteArrays.length); + +for (byte[] array: byteArrays) { + length += array.length; +} +return length; + } + + public static boolean isVarLengthBytesDictBuffer(PinotDataBuffer buffer) { +// If the buffer is smaller than header + numElements size, it's not var length dictionary. +if (buffer.size() > INDEX_ARRAY_START_OFFSET) { + byte[] header = new byte[MAGIC_HEADER_LENGTH]; + buffer.copyTo(0, header, 0, MAGIC_HEADER_LENGTH); + + if (Arrays.equals(MAGIC_HEADER, header)) { +// Also verify that there is a valid numElements value. +int numElements = buffer.getInt(MAGIC_HEADER_LENGTH); +return numElements > 0; + } +} + +return false; + } + + private void writeHeader() { +for (int offset = 0; offset < MAGIC_HEADER_LENGTH; offset++) { + _dataBuffer.putByte(offset, MAGIC_HEADER[offset]); +} + } + + public void init(byte[][] byteArrays) { +this._numElements = byteArrays.length; + +writeHeader(); + +// Add the number of elements as the first field in the buffer. Review comment: Added javadoc for this as well. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r298909190 ## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/VarLengthBytesValueReaderWriter.java ## @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import com.google.common.base.Preconditions; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import org.apache.pinot.common.utils.StringUtil; +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + +/** + * Implementation of {@link ValueReader} that will allow each value to be of variable + * length and there by avoiding the unnecessary padding. + * + * The layout of the file is as follows: + * Header Section: + * + *Magic header byte sequence. + * + * + * Values: + * + *Integer offsets to start position of byte arrays. + *All byte arrays. + * + * + * Only sequential writes are supported. + * + * @see FixedByteValueReaderWriter + */ +public class VarLengthBytesValueReaderWriter implements Closeable, ValueReader { + + /** + * Header used to identify the dictionary files written in variable length bytes + * format. Keeping this a mix of alphanumeric with special character to avoid + * collisions with the regular int/string dictionary values written in fixed size + * format. + */ + private static final byte[] MAGIC_HEADER = StringUtil.encodeUtf8("vl1;"); + private static final int MAGIC_HEADER_LENGTH = MAGIC_HEADER.length; + private static final int INDEX_ARRAY_START_OFFSET = MAGIC_HEADER.length + Integer.BYTES; Review comment: Handled. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r298893980 ## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/VarLengthBytesValueReaderWriter.java ## @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import com.google.common.base.Preconditions; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import org.apache.pinot.common.utils.StringUtil; +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + +/** + * Implementation of {@link ValueReader} that will allow each value to be of variable + * length and there by avoiding the unnecessary padding. + * + * The layout of the file is as follows: + * Header Section: + * + *Magic header byte sequence. + * + * + * Values: + * + *Integer offsets to start position of byte arrays. + *All byte arrays. + * + * + * Only sequential writes are supported. + * + * @see FixedByteValueReaderWriter + */ +public class VarLengthBytesValueReaderWriter implements Closeable, ValueReader { + + /** + * Header used to identify the dictionary files written in variable length bytes + * format. Keeping this a mix of alphanumeric with special character to avoid + * collisions with the regular int/string dictionary values written in fixed size + * format. + */ + private static final byte[] MAGIC_HEADER = StringUtil.encodeUtf8("vl1;"); + private static final int MAGIC_HEADER_LENGTH = MAGIC_HEADER.length; + private static final int INDEX_ARRAY_START_OFFSET = MAGIC_HEADER.length + Integer.BYTES; + + private final PinotDataBuffer _dataBuffer; + private transient int _numElements; + + public VarLengthBytesValueReaderWriter(PinotDataBuffer dataBuffer) { +this._dataBuffer = dataBuffer; Review comment: Done. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r298892922 ## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/VarLengthBytesValueReaderWriter.java ## @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import com.google.common.base.Preconditions; +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import org.apache.pinot.common.utils.StringUtil; +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + +/** + * Implementation of {@link ValueReader} that will allow each value to be of variable + * length and there by avoiding the unnecessary padding. + * + * The layout of the file is as follows: + * Header Section: + * + *Magic header byte sequence. + * + * + * Values: + * + *Integer offsets to start position of byte arrays. + *All byte arrays. + * + * + * Only sequential writes are supported. + * + * @see FixedByteValueReaderWriter + */ +public class VarLengthBytesValueReaderWriter implements Closeable, ValueReader { + + /** + * Header used to identify the dictionary files written in variable length bytes + * format. Keeping this a mix of alphanumeric with special character to avoid + * collisions with the regular int/string dictionary values written in fixed size + * format. + */ + private static final byte[] MAGIC_HEADER = StringUtil.encodeUtf8("vl1;"); + private static final int MAGIC_HEADER_LENGTH = MAGIC_HEADER.length; + private static final int INDEX_ARRAY_START_OFFSET = MAGIC_HEADER.length + Integer.BYTES; + + private final PinotDataBuffer _dataBuffer; + private transient int _numElements; + + public VarLengthBytesValueReaderWriter(PinotDataBuffer dataBuffer) { +this._dataBuffer = dataBuffer; + } + + public static long getRequiredSize(byte[][] byteArrays) { +// First include the magic header, length field and offsets array. +long length = INDEX_ARRAY_START_OFFSET + Integer.BYTES * (byteArrays.length); + +for (byte[] array: byteArrays) { + length += array.length; +} +return length; + } + + public static boolean isVarLengthBytesDictBuffer(PinotDataBuffer buffer) { +// If the buffer is smaller than header + numElements size, it's not var length dictionary. +if (buffer.size() > INDEX_ARRAY_START_OFFSET) { + byte[] header = new byte[MAGIC_HEADER_LENGTH]; + buffer.copyTo(0, header, 0, MAGIC_HEADER_LENGTH); + + if (Arrays.equals(MAGIC_HEADER, header)) { +// Also verify that there is a valid numElements value. +int numElements = buffer.getInt(MAGIC_HEADER_LENGTH); +return numElements > 0; + } +} + +return false; + } + + private void writeHeader() { +for (int offset = 0; offset < MAGIC_HEADER_LENGTH; offset++) { + _dataBuffer.putByte(offset, MAGIC_HEADER[offset]); +} + } + + public void init(byte[][] byteArrays) { +this._numElements = byteArrays.length; + +writeHeader(); + +// Add the number of elements as the first field in the buffer. +_dataBuffer.putInt(MAGIC_HEADER_LENGTH, byteArrays.length); + +// Then write the offset of each of the byte array in the data buffer. +int nextOffset = INDEX_ARRAY_START_OFFSET; +int nextArrayStartOffset = INDEX_ARRAY_START_OFFSET + Integer.BYTES * byteArrays.length; +for (byte[] array: byteArrays) { + _dataBuffer.putInt(nextOffset, nextArrayStartOffset); + nextOffset += Integer.BYTES; + nextArrayStartOffset += array.length; +} + +// Finally write the byte arrays. +nextArrayStartOffset = INDEX_ARRAY_START_OFFSET + Integer.BYTES * byteArrays.length; +for (byte[] array: byteArrays) { + _dataBuffer.readFrom(nextArrayStartOffset, array); + nextArrayStartOffset += array.length; +} + } + + int getNumElements() { +// Lazily initialize the numElements. +if (_numElements == 0) { + _numElements = _dataBuffer.getInt(MAGIC_HEADER_LENGTH); +
[GitHub] [incubator-pinot] buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts.
buchireddy commented on a change in pull request #4321: #4317 Feature/variable length bytes offline dictionary for indexing bytes and string dicts. URL: https://github.com/apache/incubator-pinot/pull/4321#discussion_r298892663 ## File path: pinot-common/src/main/java/org/apache/pinot/common/config/IndexingConfig.java ## @@ -223,6 +231,14 @@ public boolean isAggregateMetrics() { return _aggregateMetrics; } + public List getVarLengthDictionaryColumns() { +return _varLengthDictionaryColumns; + } + + public void setVarLengthDictionaryColumns(List _varLengthDictionaryColumns) { +this._varLengthDictionaryColumns = _varLengthDictionaryColumns; Review comment: Done. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org