Github user meiercaleb commented on a diff in the pull request:

    https://github.com/apache/incubator-rya/pull/153#discussion_r133817921
  
    --- Diff: 
extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/DuplicateDataDetector.java
 ---
    @@ -0,0 +1,1066 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership.  The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *   http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing,
    + * software distributed under the License is distributed on an
    + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
    + * KIND, either express or implied.  See the License for the
    + * specific language governing permissions and limitations
    + * under the License.
    + */
    +package org.apache.rya.indexing.smarturi.duplication;
    +
    +import static java.util.Objects.requireNonNull;
    +
    +import java.math.BigDecimal;
    +import java.util.ArrayList;
    +import java.util.Date;
    +import java.util.HashMap;
    +import java.util.LinkedHashMap;
    +import java.util.List;
    +import java.util.Map;
    +import java.util.Map.Entry;
    +import java.util.Objects;
    +import java.util.Optional;
    +import java.util.Set;
    +import java.util.TreeSet;
    +
    +import org.apache.commons.configuration.ConfigurationException;
    +import org.apache.commons.lang.StringUtils;
    +import org.apache.rya.api.domain.RyaType;
    +import org.apache.rya.api.domain.RyaURI;
    +import org.apache.rya.api.resolver.impl.DateTimeRyaTypeResolver;
    +import org.apache.rya.indexing.entity.model.Entity;
    +import org.apache.rya.indexing.entity.model.Property;
    +import org.apache.rya.indexing.smarturi.SmartUriAdapter;
    +import org.apache.rya.indexing.smarturi.SmartUriException;
    +import 
org.apache.rya.indexing.smarturi.duplication.conf.DuplicateDataConfig;
    +import org.calrissian.mango.types.exception.TypeEncodingException;
    +import org.joda.time.DateTime;
    +import org.openrdf.model.URI;
    +import org.openrdf.model.impl.URIImpl;
    +import org.openrdf.model.vocabulary.XMLSchema;
    +
    +import com.google.common.collect.ImmutableMap;
    +
    +/**
    + * Detects if two entities contain data that's nearly identical based on a 
set
    + * tolerance for each field's type. Two entities are considered nearly
    + * identical if all their properties are equal and/or within the specified
    + * tolerance for the property's object type. Setting all object type 
tolerances
    + * to 0 means that the objects need to be exactly equal to each other to be
    + * considered duplicates. Duplicate data detection can be enabled/disabled
    + * through configuration and each object type can have a tolerance based on
    + * either the difference or the percentage difference between the objects 
being
    + * compared.
    + */
    +public class DuplicateDataDetector {
    +    // Detectors keyed by the XML schema datatype URI each one reports through
    +    // getXmlSchemaUri(); consulted by compareEntities().
    +    private final Map<URI, ApproxEqualsDetector<?>> uriMap = new 
HashMap<>();
    +    // The same detectors keyed by the class each one reports through
    +    // getTypeClass(); consulted by getDetectorForType().
    +    private final Map<Class<?>, ApproxEqualsDetector<?>> classMap = new 
HashMap<>();
    +
    +    // Whether detection is enabled; assigned in init() and exposed through
    +    // isDetectionEnabled().
    +    private boolean isDetectionEnabled;
    +
    +    /**
    +     * Creates a new instance of {@link DuplicateDataDetector} from a
    +     * {@link DuplicateDataConfig}. The per-type tolerances, the equivalent
    +     * terms map and the detection-enabled flag are all read from the supplied
    +     * configuration and forwarded to the full constructor.
    +     * @param duplicateDataConfig the {@link DuplicateDataConfig} to read the
    +     * settings from. (not {@code null}; it is dereferenced immediately)
    +     */
    +    public DuplicateDataDetector(final DuplicateDataConfig 
duplicateDataConfig) {
    +        this(duplicateDataConfig.getBooleanTolerance(),
    +            duplicateDataConfig.getByteTolerance(),
    +            duplicateDataConfig.getDateTolerance(),
    +            duplicateDataConfig.getDoubleTolerance(),
    +            duplicateDataConfig.getFloatTolerance(),
    +            duplicateDataConfig.getIntegerTolerance(),
    +            duplicateDataConfig.getLongTolerance(),
    +            duplicateDataConfig.getShortTolerance(),
    +            duplicateDataConfig.getStringTolerance(),
    +            duplicateDataConfig.getUriTolerance(),
    +            duplicateDataConfig.getEquivalentTermsMap(),
    +            duplicateDataConfig.isDetectionEnabled()
    +        );
    +    }
    +
    +    /**
    +     * Creates a new instance of {@link DuplicateDataDetector} using a
    +     * {@link DuplicateDataConfig} created with its no-argument constructor.
    +     * @throws ConfigurationException if the {@link DuplicateDataConfig}
    +     * could not be constructed.
    +     */
    +    public DuplicateDataDetector() throws ConfigurationException {
    +        this(new DuplicateDataConfig());
    +    }
    +
    +    /**
    +     * Creates a new instance of {@link DuplicateDataDetector} that uses one
    +     * difference-based tolerance for every type and an empty equivalent terms
    +     * map.
    +     * @param tolerance the tolerance value to assign to all types. It is
    +     * wrapped in a {@link Tolerance} of type {@link ToleranceType#DIFFERENCE}.
    +     */
    +    public DuplicateDataDetector(final double tolerance) {
    +        this(new Tolerance(tolerance, ToleranceType.DIFFERENCE), new 
LinkedHashMap<>());
    +    }
    +
    +    /**
    +     * Creates a new instance of {@link DuplicateDataDetector} that uses the
    +     * same tolerance for every type. Detection is always enabled for
    +     * instances created through this constructor.
    +     * @param tolerance the {@link Tolerance} to assign to all types.
    +     * @param equivalentTermsMap the {@link Map} of terms that are considered
    +     * equivalent to each other. (not {@code null})
    +     */
    +    public DuplicateDataDetector(final Tolerance tolerance, final 
Map<String, List<String>> equivalentTermsMap) {
    +        // The single tolerance fills all ten per-type slots; the trailing
    +        // "true" turns detection on.
    +        this(tolerance, tolerance, tolerance, tolerance, tolerance,
    +            tolerance, tolerance, tolerance, tolerance, tolerance , 
equivalentTermsMap, true);
    +    }
    +
    +    /**
    +     * Creates a new instance of {@link DuplicateDataDetector}. One
    +     * approx-equals detector is built per supported type from the supplied
    +     * tolerances and registered for lookup by XML schema URI and by class.
    +     * @param booleanTolerance the {@link Boolean} tolerance value or
    +     * {@code null} if not specified.
    +     * @param byteTolerance the {@link Byte} tolerance value or {@code 
null} if
    +     * not specified.
    +     * @param dateTolerance the {@link Date} tolerance value or {@code 
null} if
    +     * not specified.
    +     * @param doubleTolerance the {@link Double} tolerance value or {@code 
null}
    +     * if not specified.
    +     * @param floatTolerance the {@link Float} tolerance value or {@code 
null}
    +     * if not specified.
    +     * @param integerTolerance the {@link Integer} tolerance value or
    +     * {@code null} if not specified.
    +     * @param longTolerance the {@link Long} tolerance value or {@code 
null} if
    +     * not specified.
    +     * @param shortTolerance the {@link Short} tolerance value or {@code 
null}
    +     * if not specified.
    +     * @param stringTolerance the {@link String} tolerance value or {@code 
null}
    +     * if not specified.
    +     * @param uriTolerance the {@link URI} tolerance value or {@code null} 
if
    +     * not specified.
    +     * @param equivalentTermsMap the {@link Map} of terms that are 
considered
    +     * equivalent to each other. (not {@code null})
    +     * @param isDetectionEnabled {@code true} to enable detection. {@code 
false}
    +     * to disable detection.
    +     */
    +    public DuplicateDataDetector(final Tolerance booleanTolerance, final 
Tolerance byteTolerance,
    +            final Tolerance dateTolerance, final Tolerance 
doubleTolerance, final Tolerance floatTolerance,
    +            final Tolerance integerTolerance, final Tolerance 
longTolerance, final Tolerance shortTolerance,
    +            final Tolerance stringTolerance, final Tolerance uriTolerance, 
final Map<String, List<String>> equivalentTermsMap,
    +            final boolean isDetectionEnabled)
    +    {
    +        // All set-up lives in init(), which builds the detectors, fills the
    +        // lookup maps and stores the detection flag.
    +        init(booleanTolerance, byteTolerance, dateTolerance, 
doubleTolerance, floatTolerance,
    +            integerTolerance, longTolerance, shortTolerance, 
stringTolerance, uriTolerance, equivalentTermsMap, isDetectionEnabled);
    +    }
    +
    +    private void init(final Tolerance booleanTolerance, final Tolerance 
byteTolerance,
    +            final Tolerance dateTolerance, final Tolerance 
doubleTolerance, final Tolerance floatTolerance,
    +            final Tolerance integerTolerance, final Tolerance 
longTolerance, final Tolerance shortTolerance,
    +            final Tolerance stringTolerance, final Tolerance uriTolerance, 
final Map<String, List<String>> equivalentTermsMap,
    +            final boolean isDetectionEnabled)
    +    {
    +        final List<ApproxEqualsDetector<?>> detectors = new ArrayList<>();
    +        detectors.add(new BooleanApproxEqualsDetector(booleanTolerance));
    +        detectors.add(new ByteApproxEqualsDetector(byteTolerance));
    +        detectors.add(new DateApproxEqualsDetector(dateTolerance));
    +        detectors.add(new DateTimeApproxEqualsDetector(dateTolerance));
    +        detectors.add(new DoubleApproxEqualsDetector(doubleTolerance));
    +        detectors.add(new FloatApproxEqualsDetector(floatTolerance));
    +        detectors.add(new IntegerApproxEqualsDetector(integerTolerance));
    +        detectors.add(new LongApproxEqualsDetector(longTolerance));
    +        detectors.add(new ShortApproxEqualsDetector(shortTolerance));
    +        detectors.add(new StringApproxEqualsDetector(stringTolerance, 
equivalentTermsMap));
    +        detectors.add(new UriApproxEqualsDetector(uriTolerance));
    +
    +        for (final ApproxEqualsDetector<?> approxEqualsDetector : 
detectors) {
    +            uriMap.put(approxEqualsDetector.getXmlSchemaUri(), 
approxEqualsDetector);
    +            classMap.put(approxEqualsDetector.getTypeClass(), 
approxEqualsDetector);
    +        }
    +
    +        this.isDetectionEnabled = isDetectionEnabled;
    +    }
    +
    +    /**
    +     * Reports whether duplicate data detection is turned on.
    +     * @return {@code true} if detection is enabled. {@code false} if it is
    +     * disabled.
    +     */
    +    public boolean isDetectionEnabled() {
    +        return this.isDetectionEnabled;
    +    }
    +
    +    /**
    +     * Removes duplicate (nearly identical) entities from the supplied list in
    +     * place. Every entity is compared against each entity that follows it in
    +     * the list; whenever a pair is reported as duplicates the later one of the
    +     * pair is removed, so the earliest occurrence is kept.
    +     * @param entities the {@link List} of {@link Entity}s to prune. It is
    +     * modified directly. (not {@code null})
    +     * @throws SmartUriException if comparing two entities fails.
    +     */
    +    public void removeDuplicatesFromCollection(final List<Entity> entities) throws SmartUriException {
    +        requireNonNull(entities);
    +        final int entityCount = entities.size();
    +
    +        // Positions are kept in descending order so that removing one never
    +        // shifts a position that is still waiting to be removed.
    +        final Set<Integer> duplicatePositions = new TreeSet<>((a, b) -> Integer.compare(b, a));
    +
    +        // Each unordered pair is visited exactly once and an entity is never
    +        // compared to itself. Nothing runs for lists of fewer than 2 entities.
    +        for (int earlier = 0; earlier < entityCount - 1; earlier++) {
    +            final Entity keptCandidate = entities.get(earlier);
    +            for (int later = entityCount - 1; later > earlier; later--) {
    +                final Entity duplicateCandidate = entities.get(later);
    +                if (compareEntities(keptCandidate, duplicateCandidate)) {
    +                    duplicatePositions.add(later);
    +                }
    +            }
    +        }
    +
    +        // The loop variable is a primitive int on purpose: it selects
    +        // List.remove(int index) rather than List.remove(Object).
    +        for (final int position : duplicatePositions) {
    +            entities.remove(position);
    +        }
    +    }
    +
    +    /**
    +     * Determines whether two Smart URIs carry nearly identical data. Both
    +     * URIs are deserialized into entities with {@link SmartUriAdapter} and the
    +     * resulting entities are compared.
    +     * @param uri1 the first Smart {@link URI}. (not {@code null})
    +     * @param uri2 the second Smart {@link URI}. (not {@code null})
    +     * @return {@code true} if the two Smart URIs have nearly identical data.
    +     * {@code false} otherwise.
    +     * @throws SmartUriException if deserializing a URI or comparing the
    +     * entities fails.
    +     */
    +    public boolean compareSmartUris(final URI uri1, final URI uri2) throws SmartUriException {
    +        // Both arguments are validated before any deserialization starts.
    +        requireNonNull(uri1);
    +        requireNonNull(uri2);
    +        return compareEntities(
    +            SmartUriAdapter.deserializeUriEntity(uri1),
    +            SmartUriAdapter.deserializeUriEntity(uri2));
    +    }
    +
    +    /**
    +     * Determines whether two entities carry nearly identical data.
    +     * <p>
    +     * The comparison fails fast: it returns {@code false} as soon as the
    +     * explicit type check fails, a property of {@code entity1} is missing
    +     * from {@code entity2}, or a pair of values is not approximately equal.
    +     * The detector for each value pair is chosen by the data type of
    +     * {@code entity1}'s value.
    +     * <p>
    +     * NOTE(review): the check looks one-directional. Only
    +     * {@code types1.containsAll(types2)} is tested and only the properties of
    +     * {@code entity1} are iterated, so swapping the arguments may give a
    +     * different answer. Confirm whether that is intended.
    +     * @param entity1 the first {@link Entity}. (not {@code null})
    +     * @param entity2 the second {@link Entity}. (not {@code null})
    +     * @return {@code true} if the two entities have nearly identical data.
    +     * {@code false} otherwise.
    +     * @throws SmartUriException if no detector is registered for the data
    +     * type of one of {@code entity1}'s property values.
    +     */
    +    public boolean compareEntities(final Entity entity1, final Entity entity2) throws SmartUriException {
    +        requireNonNull(entity1);
    +        requireNonNull(entity2);
    +
    +        final List<RyaURI> firstTypeIds = entity1.getExplicitTypeIds();
    +        final List<RyaURI> secondTypeIds = entity2.getExplicitTypeIds();
    +        if (!firstTypeIds.containsAll(secondTypeIds)) {
    +            return false;
    +        }
    +
    +        for (final Entry<RyaURI, ImmutableMap<RyaURI, Property>> typeEntry : entity1.getProperties().entrySet()) {
    +            final RyaURI typeId = typeEntry.getKey();
    +            for (final Entry<RyaURI, Property> propertyEntry : typeEntry.getValue().entrySet()) {
    +                final RyaURI propertyName = propertyEntry.getKey();
    +                final Property firstProperty = propertyEntry.getValue();
    +
    +                final Optional<Property> counterpart = entity2.lookupTypeProperty(typeId, propertyName);
    +                if (!counterpart.isPresent()) {
    +                    // entity2 has no value for this property at all.
    +                    return false;
    +                }
    +
    +                final RyaType firstValue = firstProperty.getValue();
    +                final RyaType secondValue = counterpart.get().getValue();
    +                final String firstData = firstValue.getData();
    +                final String secondData = secondValue.getData();
    +
    +                // The detector is selected from the first value's data type only.
    +                final URI firstDataType = firstValue.getDataType();
    +                final ApproxEqualsDetector<?> detector = uriMap.get(firstDataType);
    +                if (detector == null) {
    +                    throw new SmartUriException("No appropriate detector found for the type: " + firstDataType);
    +                }
    +
    +                if (!detector.areApproxEquals(firstData, secondData)) {
    +                    return false;
    +                }
    +            }
    +        }
    +        return true;
    +    }
    +
    +    /**
    +     * Looks up the {@link ApproxEqualsDetector} registered for the specified
    +     * class.
    +     * @param clazz the {@link Class} to find an {@link ApproxEqualsDetector}
    +     * for.
    +     * @return the {@link ApproxEqualsDetector} registered for the class or
    +     * {@code null} if none is registered.
    +     */
    +    public ApproxEqualsDetector<?> getDetectorForType(final Class<?> clazz) {
    +        final ApproxEqualsDetector<?> registeredDetector = classMap.get(clazz);
    +        return registeredDetector;
    +    }
    +
    +    private static boolean isOnlyOneNull(final Object lhs, final Object 
rhs) {
    +        return (lhs == null && rhs != null) || (lhs != null && rhs == 
null);
    +    }
    +
    +    /**
    +     * Detector that decides whether two {@link Boolean}s are approximately
    +     * equal. With a tolerance value of 0 the booleans must be exactly equal;
    +     * with any other tolerance value every pair counts as equal.
    +     */
    +    public static class BooleanApproxEqualsDetector implements ApproxEqualsDetector<Boolean> {
    +        private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(0.0, ToleranceType.DIFFERENCE);
    +        private final Tolerance tolerance;
    +
    +        /**
    +         * Creates a new instance of {@link BooleanApproxEqualsDetector}.
    +         * @param tolerance the {@link Tolerance} to use. When {@code null}
    +         * the default tolerance is used instead.
    +         */
    +        public BooleanApproxEqualsDetector(final Tolerance tolerance) {
    +            if (tolerance != null) {
    +                this.tolerance = tolerance;
    +            } else {
    +                this.tolerance = getDefaultTolerance();
    +            }
    +        }
    +
    +        /**
    +         * Compares two booleans under the configured tolerance.
    +         * @param lhs the left hand side {@link Boolean}.
    +         * @param rhs the right hand side {@link Boolean}.
    +         * @return {@code true} if the tolerance value is not 0, or if it is 0
    +         * and both booleans are equal. {@code false} otherwise.
    +         */
    +        @Override
    +        public boolean areObjectsApproxEquals(final Boolean lhs, final Boolean rhs) {
    +            // A boolean has no "almost equal": zero tolerance demands an
    +            // exact match, any other tolerance accepts everything.
    +            return tolerance.getValue() != 0 || Objects.equals(lhs, rhs);
    +        }
    +
    +        /**
    +         * @return the default {@link Tolerance}: a value of 0.0 with type
    +         * {@link ToleranceType#DIFFERENCE}.
    +         */
    +        @Override
    +        public Tolerance getDefaultTolerance() {
    +            return DEFAULT_TOLERANCE;
    +        }
    +
    +        /**
    +         * Parses a string with {@link Boolean#valueOf(String)}.
    +         * @param string the text to convert.
    +         * @return {@link Boolean#TRUE} if the text equals "true" ignoring
    +         * case. {@link Boolean#FALSE} for any other text, including
    +         * {@code null}.
    +         */
    +        @Override
    +        public Boolean convertStringToObject(final String string) throws SmartUriException {
    +            final Boolean parsed = Boolean.valueOf(string);
    +            return parsed;
    +        }
    +
    +        /**
    +         * @return the class this detector handles, {@link Boolean}.
    +         */
    +        @Override
    +        public Class<?> getTypeClass() {
    +            return Boolean.class;
    +        }
    +
    +        /**
    +         * @return the XML schema URI this detector handles,
    +         * {@link XMLSchema#BOOLEAN}.
    +         */
    +        @Override
    +        public URI getXmlSchemaUri() {
    +            return XMLSchema.BOOLEAN;
    +        }
    +    }
    +
    +    /**
    +     * Class to detect if two bytes are considered approximately equal to 
each
    +     * other.
    +     */
    +    public static class ByteApproxEqualsDetector implements 
ApproxEqualsDetector<Byte> {
    +        private static final Tolerance DEFAULT_TOLERANCE = new 
Tolerance(0.0, ToleranceType.DIFFERENCE);
    +        private final Tolerance tolerance;
    +
    +        /**
    +         * Creates a new instance of {@link ByteApproxEqualsDetector}.
    +         * @param tolerance the {@link Tolerance}.
    +         */
    +        public ByteApproxEqualsDetector(final Tolerance tolerance) {
    +            this.tolerance = tolerance != null ? tolerance : 
getDefaultTolerance();
    +        }
    +
    +        @Override
    +        public boolean areObjectsApproxEquals(final Byte lhs, final Byte 
rhs) {
    --- End diff --
    
    Okay. Got it.  Was thinking about it outside of that context.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

Reply via email to