snleee commented on code in PR #12032: URL: https://github.com/apache/pinot/pull/12032#discussion_r1401274397
########## pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SpecialValueTransformer.java: ########## @@ -0,0 +1,96 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.recordtransformer; + +import java.util.HashSet; +import org.apache.pinot.spi.data.FieldSpec; +import org.apache.pinot.spi.data.FieldSpec.DataType; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.data.readers.GenericRow; + + +/** + * The {@code SpecialValueTransformer} class will transform special values according to the following rules: + * <ul> + * <li>Negative zero (-0.0) should be converted to 0.0</li> + * <li>NaN should be converted to default null</li> + * </ul> + * <p>NOTE: should put this after the {@link DataTypeTransformer} so that all values follow the data types in + * {@link FieldSpec}. + */ +public class SpecialValueTransformer implements RecordTransformer { + private final HashSet<String> _specialValuesKeySet = new HashSet<>(); + + public SpecialValueTransformer(Schema schema) { + for (FieldSpec fieldSpec : schema.getAllFieldSpecs()) { + if (!fieldSpec.isVirtualColumn() && (fieldSpec.getDataType() == DataType.FLOAT + || fieldSpec.getDataType() == DataType.DOUBLE)) { + _specialValuesKeySet.add(fieldSpec.getName()); + } + } + } + + private Object transformNegativeZero(Object value) { + if ((value instanceof Float) && (Float.floatToRawIntBits((float) value) == Float.floatToRawIntBits(-0.0f))) { + value = 0.0f; + } else if ((value instanceof Double) && (Double.doubleToLongBits((double) value) == Double.doubleToLongBits( + -0.0d))) { + value = 0.0d; + } + return value; + } + + private Object transformNaN(Object value) { + if ((value instanceof Float) && ((Float) value).isNaN()) { + value = FieldSpec.DEFAULT_DIMENSION_NULL_VALUE_OF_FLOAT; Review Comment: +1 on Jackie's comment to set this value to null instead of we explicitly try to assign NULL value in this transformer. ########## pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SpecialValueTransformer.java: ########## @@ -0,0 +1,96 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.recordtransformer; + +import java.util.HashSet; +import org.apache.pinot.spi.data.FieldSpec; +import org.apache.pinot.spi.data.FieldSpec.DataType; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.data.readers.GenericRow; + + +/** + * The {@code SpecialValueTransformer} class will transform special values according to the following rules: + * <ul> + * <li>Negative zero (-0.0) should be converted to 0.0</li> + * <li>NaN should be converted to default null</li> + * </ul> + * <p>NOTE: should put this after the {@link DataTypeTransformer} so that all values follow the data types in + * {@link FieldSpec}. + */ +public class SpecialValueTransformer implements RecordTransformer { + private final HashSet<String> _specialValuesKeySet = new HashSet<>(); + + public SpecialValueTransformer(Schema schema) { + for (FieldSpec fieldSpec : schema.getAllFieldSpecs()) { + if (!fieldSpec.isVirtualColumn() && (fieldSpec.getDataType() == DataType.FLOAT + || fieldSpec.getDataType() == DataType.DOUBLE)) { + _specialValuesKeySet.add(fieldSpec.getName()); + } + } + } + + private Object transformNegativeZero(Object value) { + if ((value instanceof Float) && (Float.floatToRawIntBits((float) value) == Float.floatToRawIntBits(-0.0f))) { + value = 0.0f; + } else if ((value instanceof Double) && (Double.doubleToLongBits((double) value) == Double.doubleToLongBits( + -0.0d))) { + value = 0.0d; + } + return value; + } + + private Object transformNaN(Object value) { + if ((value instanceof Float) && ((Float) value).isNaN()) { + value = FieldSpec.DEFAULT_DIMENSION_NULL_VALUE_OF_FLOAT; + } else if ((value instanceof Double) && ((Double) value).isNaN()) { + value = FieldSpec.DEFAULT_DIMENSION_NULL_VALUE_OF_DOUBLE; + } + return value; + } + + @Override + public boolean isNoOp() { + return _specialValuesKeySet.isEmpty(); + } + + @Override + public GenericRow transform(GenericRow record) { + for (String element : _specialValuesKeySet) { + Object value = record.getValue(element); + if (value instanceof Float || value instanceof Double) { + // Single-valued column. + Object zeroTransformedValue = transformNegativeZero(value); + Object nanTransformedValue = transformNaN(zeroTransformedValue); + if (nanTransformedValue != value) { + record.putValue(element, nanTransformedValue); + } + } else if (value instanceof Object[]) { + // Multi-valued column. Review Comment: Currently, our null value handler only takes care of the case when the entire value for MV column is null. @Jackie-Jiang Do you have any recommendation on the case where one of the values in the mv column is `NaN`? Also, do we allow `null` for mv column value? ########## pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/CompositeTransformer.java: ########## @@ -68,13 +68,18 @@ public class CompositeTransformer implements RecordTransformer { * Optional {@link SanitizationTransformer} after {@link NullValueTransformer} so that before sanitation, all * values are non-null and follow the data types defined in the schema * </li> + * <li> + * {@link SpecialValueTransformer} after {@link DataTypeTransformer} so that we already have the values complying + * with the schema before handling special values + * </li> * </ul> */ public static List<RecordTransformer> getDefaultTransformers(TableConfig tableConfig, Schema schema) { return Stream.of(new ExpressionTransformer(tableConfig, schema), new FilterTransformer(tableConfig), - new SchemaConformingTransformer(tableConfig, schema), new DataTypeTransformer(tableConfig, schema), - new TimeValidationTransformer(tableConfig, schema), new NullValueTransformer(tableConfig, schema), - new SanitizationTransformer(schema)).filter(t -> !t.isNoOp()).collect(Collectors.toList()); + new SchemaConformingTransformer(tableConfig, schema), new DataTypeTransformer(tableConfig, schema), Review Comment: +1 on adding the unit test for order of the transformer. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
