[ https://issues.apache.org/jira/browse/RYA-250?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16130673#comment-16130673 ]
ASF GitHub Bot commented on RYA-250: ------------------------------------ Github user meiercaleb commented on a diff in the pull request: https://github.com/apache/incubator-rya/pull/153#discussion_r133742379 --- Diff: extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/DuplicateDataDetector.java --- @@ -0,0 +1,1066 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.rya.indexing.smarturi.duplication; + +import static java.util.Objects.requireNonNull; + +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.commons.configuration.ConfigurationException; +import org.apache.commons.lang.StringUtils; +import org.apache.rya.api.domain.RyaType; +import org.apache.rya.api.domain.RyaURI; +import org.apache.rya.api.resolver.impl.DateTimeRyaTypeResolver; +import org.apache.rya.indexing.entity.model.Entity; +import org.apache.rya.indexing.entity.model.Property; +import org.apache.rya.indexing.smarturi.SmartUriAdapter; +import org.apache.rya.indexing.smarturi.SmartUriException; +import org.apache.rya.indexing.smarturi.duplication.conf.DuplicateDataConfig; +import org.calrissian.mango.types.exception.TypeEncodingException; +import org.joda.time.DateTime; +import org.openrdf.model.URI; +import org.openrdf.model.impl.URIImpl; +import org.openrdf.model.vocabulary.XMLSchema; + +import com.google.common.collect.ImmutableMap; + +/** + * Detects if two entities contain data that's nearly identical based on a set + * tolerance for each field's type. Two entities are considered nearly + * identical if all their properties are equal and/or within the specified + * tolerance for the property's object type. Setting all object type tolerances + * to 0 means that the objects need to be exactly equal to each other to be + * considered duplicates. Duplicate data detection can be enabled/disabled + * through configuration and each object type can have a tolerance based on + * either the difference or the percentage difference between the objects being + * compared. 
+ */ +public class DuplicateDataDetector { + private final Map<URI, ApproxEqualsDetector<?>> uriMap = new HashMap<>(); + private final Map<Class<?>, ApproxEqualsDetector<?>> classMap = new HashMap<>(); + + private boolean isDetectionEnabled; + + /** + * Creates a new instance of {@link DuplicateDataDetector} with the + * values provided by the configuration file. + * @param duplicateDataConfig the {@link DuplicateDataConfig} + */ + public DuplicateDataDetector(final DuplicateDataConfig duplicateDataConfig) { + this(duplicateDataConfig.getBooleanTolerance(), + duplicateDataConfig.getByteTolerance(), + duplicateDataConfig.getDateTolerance(), + duplicateDataConfig.getDoubleTolerance(), + duplicateDataConfig.getFloatTolerance(), + duplicateDataConfig.getIntegerTolerance(), + duplicateDataConfig.getLongTolerance(), + duplicateDataConfig.getShortTolerance(), + duplicateDataConfig.getStringTolerance(), + duplicateDataConfig.getUriTolerance(), + duplicateDataConfig.getEquivalentTermsMap(), + duplicateDataConfig.isDetectionEnabled() + ); + } + + /** + * Creates a new instance of {@link DuplicateDataDetector} with the values + * from the config. + * @throws ConfigurationException + */ + public DuplicateDataDetector() throws ConfigurationException { + this(new DuplicateDataConfig()); + } + + /** + * Creates a new instance of {@link DuplicateDataDetector}. + * @param tolerance the tolerance to assign to all types. + */ + public DuplicateDataDetector(final double tolerance) { + this(new Tolerance(tolerance, ToleranceType.DIFFERENCE), new LinkedHashMap<>()); + } + + /** + * Creates a new instance of {@link DuplicateDataDetector}. + * @param tolerance the tolerance to assign to all types. + * @param equivalentTermsMap the {@link Map} of terms that are considered + * equivalent to each other. 
(not {@code null}) + */ + public DuplicateDataDetector(final Tolerance tolerance, final Map<String, List<String>> equivalentTermsMap) { + this(tolerance, tolerance, tolerance, tolerance, tolerance, + tolerance, tolerance, tolerance, tolerance, tolerance , equivalentTermsMap, true); + } + + /** + * Creates a new instance of {@link DuplicateDataDetector}. + * @param booleanTolerance the {@link Boolean} tolerance value or + * {@code null} if not specified. + * @param byteTolerance the {@link Byte} tolerance value or {@code null} if + * not specified. + * @param dateTolerance the {@link Date} tolerance value or {@code null} if + * not specified. + * @param doubleTolerance the {@link Double} tolerance value or {@code null} + * if not specified. + * @param floatTolerance the {@link Float} tolerance value or {@code null} + * if not specified. + * @param integerTolerance the {@link Integer} tolerance value or + * {@code null} if not specified. + * @param longTolerance the {@link Long} tolerance value or {@code null} if + * not specified. + * @param shortTolerance the {@link Short} tolerance value or {@code null} + * if not specified. + * @param stringTolerance the {@link String} tolerance value or {@code null} + * if not specified. + * @param uriTolerance the {@link URI} tolerance value or {@code null} if + * not specified. + * @param equivalentTermsMap the {@link Map} of terms that are considered + * equivalent to each other. (not {@code null}) + * @param isDetectionEnabled {@code true} to enable detection. {@code false} + * to disable detection. 
+ */ + public DuplicateDataDetector(final Tolerance booleanTolerance, final Tolerance byteTolerance, + final Tolerance dateTolerance, final Tolerance doubleTolerance, final Tolerance floatTolerance, + final Tolerance integerTolerance, final Tolerance longTolerance, final Tolerance shortTolerance, + final Tolerance stringTolerance, final Tolerance uriTolerance, final Map<String, List<String>> equivalentTermsMap, + final boolean isDetectionEnabled) + { + init(booleanTolerance, byteTolerance, dateTolerance, doubleTolerance, floatTolerance, + integerTolerance, longTolerance, shortTolerance, stringTolerance, uriTolerance, equivalentTermsMap, isDetectionEnabled); + } + + private void init(final Tolerance booleanTolerance, final Tolerance byteTolerance, + final Tolerance dateTolerance, final Tolerance doubleTolerance, final Tolerance floatTolerance, + final Tolerance integerTolerance, final Tolerance longTolerance, final Tolerance shortTolerance, + final Tolerance stringTolerance, final Tolerance uriTolerance, final Map<String, List<String>> equivalentTermsMap, + final boolean isDetectionEnabled) + { + final List<ApproxEqualsDetector<?>> detectors = new ArrayList<>(); + detectors.add(new BooleanApproxEqualsDetector(booleanTolerance)); + detectors.add(new ByteApproxEqualsDetector(byteTolerance)); + detectors.add(new DateApproxEqualsDetector(dateTolerance)); + detectors.add(new DateTimeApproxEqualsDetector(dateTolerance)); + detectors.add(new DoubleApproxEqualsDetector(doubleTolerance)); + detectors.add(new FloatApproxEqualsDetector(floatTolerance)); + detectors.add(new IntegerApproxEqualsDetector(integerTolerance)); + detectors.add(new LongApproxEqualsDetector(longTolerance)); + detectors.add(new ShortApproxEqualsDetector(shortTolerance)); + detectors.add(new StringApproxEqualsDetector(stringTolerance, equivalentTermsMap)); + detectors.add(new UriApproxEqualsDetector(uriTolerance)); + + for (final ApproxEqualsDetector<?> approxEqualsDetector : detectors) { + 
uriMap.put(approxEqualsDetector.getXmlSchemaUri(), approxEqualsDetector); + classMap.put(approxEqualsDetector.getTypeClass(), approxEqualsDetector); + } + + this.isDetectionEnabled = isDetectionEnabled; + } + + /** + * @return {@code true} to enable detection. {@code false} to disable + * detection. + */ + public boolean isDetectionEnabled() { + return isDetectionEnabled; + } + + /** + * Removes any duplicate (nearly identical) entities from the collection + * of entities. + * @param entities the {@link List} of {@link Entity}s. (not {@code null}) + * @throws SmartUriException + */ + public void removeDuplicatesFromCollection(final List<Entity> entities) throws SmartUriException { + requireNonNull(entities); + // Use a Sorted Set in reverse order to hold the indices + final Set<Integer> indicesToRemove = new TreeSet<>((a, b) -> Integer.compare(b, a)); + if (entities != null && entities.size() > 1) { + // Compare all entities to each other while avoiding making the + // same comparisons again and not comparing an entity to itself. + for (int i = 0; i < entities.size() - 1; i++) { + final Entity entity1 = entities.get(i); + for (int j = entities.size() - 1; j > i; j--) { + final Entity entity2 = entities.get(j); + final boolean areDuplicates = compareEntities(entity1, entity2); + if (areDuplicates) { + indicesToRemove.add(j); + } + } + } + } + if (!indicesToRemove.isEmpty()) { + // Remove indices in reverse order (already sorted in descending + // order so just loop through them) + for (final int index : indicesToRemove) { + entities.remove(index); + } + } + } + + /** + * Compares two Smart URI's to determine if they have nearly identical data. + * @param uri1 the first Smart {@link URI}. (not {@code null}) + * @param uri2 the second Smart {@link URI}. (not {@code null}) + * @return {@code true} if the two Smart URI's have nearly identical data. + * {@code false} otherwise. 
+ * @throws SmartUriException + */ + public boolean compareSmartUris(final URI uri1, final URI uri2) throws SmartUriException { + requireNonNull(uri1); + requireNonNull(uri2); + final Entity entity1 = SmartUriAdapter.deserializeUriEntity(uri1); + final Entity entity2 = SmartUriAdapter.deserializeUriEntity(uri2); + return compareEntities(entity1, entity2); + } + + /** + * Compares two entities to determine if they have nearly identical data. + * @param entity1 the first {@link Entity}. (not {@code null}) + * @param entity2 the second {@link Entity}. (not {@code null}) + * @return {@code true} if the two entities have nearly identical data. + * {@code false} otherwise. + * @throws SmartUriException + */ + public boolean compareEntities(final Entity entity1, final Entity entity2) throws SmartUriException { + requireNonNull(entity1); + requireNonNull(entity2); + boolean allValuesNearlyEqual = true; + + final List<RyaURI> types1 = entity1.getExplicitTypeIds(); + final List<RyaURI> types2 = entity2.getExplicitTypeIds(); + final boolean doBothHaveSameTypes = types1.containsAll(types2); + if (!doBothHaveSameTypes) { + return false; + } + for (final Entry<RyaURI, ImmutableMap<RyaURI, Property>> entry : entity1.getProperties().entrySet()) { + final RyaURI typeIdUri = entry.getKey(); + for (final Entry<RyaURI, Property> typeProperty : entry.getValue().entrySet()) { + final RyaURI propertyNameUri = typeProperty.getKey(); + final Property property1 = typeProperty.getValue(); + + final Optional<Property> p2 = entity2.lookupTypeProperty(typeIdUri, propertyNameUri); + if (p2.isPresent()) { + final Property property2 = p2.get(); + final RyaType value1 = property1.getValue(); + final RyaType value2 = property2.getValue(); + final String data1 = value1.getData(); + final String data2 = value2.getData(); + final URI xmlSchemaUri1 = value1.getDataType(); + final ApproxEqualsDetector<?> approxEqualsDetector = uriMap.get(xmlSchemaUri1); + if (approxEqualsDetector == null) { + throw new 
SmartUriException("No appropriate detector found for the type: " + xmlSchemaUri1); + } + final boolean approxEquals = approxEqualsDetector.areApproxEquals(data1, data2); + if (!approxEquals) { + allValuesNearlyEqual = false; + break; + } + } else { + allValuesNearlyEqual = false; + break; + } + } + if (!allValuesNearlyEqual) { + break; + } + } + return allValuesNearlyEqual; + } + + /** + * Gets the appropriate {@link ApproxEqualsDetector} for the specified + * class. + * @param clazz the {@link Class} to find an {@link ApproxEqualsDetector} + * for. + * @return the {@link ApproxEqualsDetector} for the class or {@code null} if + * none could be found. + */ + public ApproxEqualsDetector<?> getDetectorForType(final Class<?> clazz) { + return classMap.get(clazz); + } + + private static boolean isOnlyOneNull(final Object lhs, final Object rhs) { + return (lhs == null && rhs != null) || (lhs != null && rhs == null); + } + + /** + * Class to detect if two booleans are considered approximately equal to + * each other. + */ + public static class BooleanApproxEqualsDetector implements ApproxEqualsDetector<Boolean> { + private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(0.0, ToleranceType.DIFFERENCE); + private final Tolerance tolerance; + + /** + * Creates a new instance of {@link BooleanApproxEqualsDetector}. + * @param tolerance the {@link Tolerance}. + */ + public BooleanApproxEqualsDetector(final Tolerance tolerance) { + this.tolerance = tolerance != null ? tolerance : getDefaultTolerance(); + } + + @Override + public boolean areObjectsApproxEquals(final Boolean lhs, final Boolean rhs) { + // Should never be almost equals when tolerance is 0, only exactly equals + // Otherwise if there's any tolerance specified everything is equal + return tolerance.getValue() == 0 ? 
Objects.equals(lhs, rhs) : true; + } + + @Override + public Tolerance getDefaultTolerance() { + return DEFAULT_TOLERANCE; + } + + @Override + public Boolean convertStringToObject(final String string) throws SmartUriException { + return Boolean.valueOf(string); + } + + @Override + public Class<?> getTypeClass() { + return Boolean.class; + } + + @Override + public URI getXmlSchemaUri() { + return XMLSchema.BOOLEAN; + } + } + + /** + * Class to detect if two bytes are considered approximately equal to each + * other. + */ + public static class ByteApproxEqualsDetector implements ApproxEqualsDetector<Byte> { + private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(0.0, ToleranceType.DIFFERENCE); + private final Tolerance tolerance; + + /** + * Creates a new instance of {@link ByteApproxEqualsDetector}. + * @param tolerance the {@link Tolerance}. + */ + public ByteApproxEqualsDetector(final Tolerance tolerance) { + this.tolerance = tolerance != null ? tolerance : getDefaultTolerance(); + } + + @Override + public boolean areObjectsApproxEquals(final Byte lhs, final Byte rhs) { --- End diff -- Seems like this is Serialization dependent. Maybe it would be a good idea to include a similar method that takes in an implementation of a standard Serialization interface along with two java objects? Or maybe just two Java serializables? I guess this doesn't work in the context of this particular implementation. Maybe it would be good to have an ApproxEqualsDetector class with a Serializable parameter? That way, you know how things are being serialized. That could just delegate to this class under the hood. > Smart URI avoid data duplication > -------------------------------- > > Key: RYA-250 > URL: https://issues.apache.org/jira/browse/RYA-250 > Project: Rya > Issue Type: Task > Components: dao > Affects Versions: 3.2.10 > Reporter: Eric White > Assignee: Eric White > Fix For: 3.2.10 > > > Implement Smart URI methods for avoiding data duplication. 
-- This message was sent by Atlassian JIRA (v6.4.14#64029)