[ https://issues.apache.org/jira/browse/RYA-250?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16131229#comment-16131229 ]

ASF GitHub Bot commented on RYA-250:
------------------------------------

Github user meiercaleb commented on a diff in the pull request:

    https://github.com/apache/incubator-rya/pull/153#discussion_r133817921
  
    --- Diff: extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/DuplicateDataDetector.java ---
    @@ -0,0 +1,1066 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership.  The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *   http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing,
    + * software distributed under the License is distributed on an
    + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
    + * KIND, either express or implied.  See the License for the
    + * specific language governing permissions and limitations
    + * under the License.
    + */
    +package org.apache.rya.indexing.smarturi.duplication;
    +
    +import static java.util.Objects.requireNonNull;
    +
    +import java.math.BigDecimal;
    +import java.util.ArrayList;
    +import java.util.Date;
    +import java.util.HashMap;
    +import java.util.LinkedHashMap;
    +import java.util.List;
    +import java.util.Map;
    +import java.util.Map.Entry;
    +import java.util.Objects;
    +import java.util.Optional;
    +import java.util.Set;
    +import java.util.TreeSet;
    +
    +import org.apache.commons.configuration.ConfigurationException;
    +import org.apache.commons.lang.StringUtils;
    +import org.apache.rya.api.domain.RyaType;
    +import org.apache.rya.api.domain.RyaURI;
    +import org.apache.rya.api.resolver.impl.DateTimeRyaTypeResolver;
    +import org.apache.rya.indexing.entity.model.Entity;
    +import org.apache.rya.indexing.entity.model.Property;
    +import org.apache.rya.indexing.smarturi.SmartUriAdapter;
    +import org.apache.rya.indexing.smarturi.SmartUriException;
    +import 
org.apache.rya.indexing.smarturi.duplication.conf.DuplicateDataConfig;
    +import org.calrissian.mango.types.exception.TypeEncodingException;
    +import org.joda.time.DateTime;
    +import org.openrdf.model.URI;
    +import org.openrdf.model.impl.URIImpl;
    +import org.openrdf.model.vocabulary.XMLSchema;
    +
    +import com.google.common.collect.ImmutableMap;
    +
    +/**
    + * Detects if two entities contain data that's nearly identical based on a 
set
    + * tolerance for each field's type. Two entities are considered nearly
    + * identical if all their properties are equal and/or within the specified
    + * tolerance for the property's object type. Setting all object type 
tolerances
    + * to 0 means that the objects need to be exactly equal to each other to be
    + * considered duplicates. Duplicate data detection can be enabled/disabled
    + * through configuration and each object type can have a tolerance based on
    + * either the difference or the percentage difference between the objects 
being
    + * compared.
    + */
    +public class DuplicateDataDetector {
    +    private final Map<URI, ApproxEqualsDetector<?>> uriMap = new 
HashMap<>();
    +    private final Map<Class<?>, ApproxEqualsDetector<?>> classMap = new 
HashMap<>();
    +
    +    private boolean isDetectionEnabled;
    +
    +    /**
    +     * Creates a new instance of {@link DuplicateDataDetector} with the
    +     * values provided by the configuration file.
    +     * @param duplicateDataConfig the {@link DuplicateDataConfig}
    +     */
    +    public DuplicateDataDetector(final DuplicateDataConfig 
duplicateDataConfig) {
    +        this(duplicateDataConfig.getBooleanTolerance(),
    +            duplicateDataConfig.getByteTolerance(),
    +            duplicateDataConfig.getDateTolerance(),
    +            duplicateDataConfig.getDoubleTolerance(),
    +            duplicateDataConfig.getFloatTolerance(),
    +            duplicateDataConfig.getIntegerTolerance(),
    +            duplicateDataConfig.getLongTolerance(),
    +            duplicateDataConfig.getShortTolerance(),
    +            duplicateDataConfig.getStringTolerance(),
    +            duplicateDataConfig.getUriTolerance(),
    +            duplicateDataConfig.getEquivalentTermsMap(),
    +            duplicateDataConfig.isDetectionEnabled()
    +        );
    +    }
    +
    +    /**
    +     * Creates a new instance of {@link DuplicateDataDetector} with the 
values
    +     * from the config.
    +     * @throws ConfigurationException
    +     */
    +    public DuplicateDataDetector() throws ConfigurationException {
    +        this(new DuplicateDataConfig());
    +    }
    +
    +    /**
    +     * Creates a new instance of {@link DuplicateDataDetector}.
    +     * @param tolerance the tolerance to assign to all types.
    +     */
    +    public DuplicateDataDetector(final double tolerance) {
    +        this(new Tolerance(tolerance, ToleranceType.DIFFERENCE), new 
LinkedHashMap<>());
    +    }
    +
    +    /**
    +     * Creates a new instance of {@link DuplicateDataDetector}.
    +     * @param tolerance the tolerance to assign to all types.
    +     * @param equivalentTermsMap the {@link Map} of terms that are 
considered
    +     * equivalent to each other. (not {@code null})
    +     */
    +    public DuplicateDataDetector(final Tolerance tolerance, final 
Map<String, List<String>> equivalentTermsMap) {
    +        this(tolerance, tolerance, tolerance, tolerance, tolerance,
    +            tolerance, tolerance, tolerance, tolerance, tolerance , 
equivalentTermsMap, true);
    +    }
    +
    +    /**
    +     * Creates a new instance of {@link DuplicateDataDetector}.
    +     * @param booleanTolerance the {@link Boolean} tolerance value or
    +     * {@code null} if not specified.
    +     * @param byteTolerance the {@link Byte} tolerance value or {@code 
null} if
    +     * not specified.
    +     * @param dateTolerance the {@link Date} tolerance value or {@code 
null} if
    +     * not specified.
    +     * @param doubleTolerance the {@link Double} tolerance value or {@code 
null}
    +     * if not specified.
    +     * @param floatTolerance the {@link Float} tolerance value or {@code 
null}
    +     * if not specified.
    +     * @param integerTolerance the {@link Integer} tolerance value or
    +     * {@code null} if not specified.
    +     * @param longTolerance the {@link Long} tolerance value or {@code 
null} if
    +     * not specified.
    +     * @param shortTolerance the {@link Short} tolerance value or {@code 
null}
    +     * if not specified.
    +     * @param stringTolerance the {@link String} tolerance value or {@code 
null}
    +     * if not specified.
    +     * @param uriTolerance the {@link URI} tolerance value or {@code null} 
if
    +     * not specified.
    +     * @param equivalentTermsMap the {@link Map} of terms that are 
considered
    +     * equivalent to each other. (not {@code null})
    +     * @param isDetectionEnabled {@code true} to enable detection. {@code 
false}
    +     * to disable detection.
    +     */
    +    public DuplicateDataDetector(final Tolerance booleanTolerance, final 
Tolerance byteTolerance,
    +            final Tolerance dateTolerance, final Tolerance 
doubleTolerance, final Tolerance floatTolerance,
    +            final Tolerance integerTolerance, final Tolerance 
longTolerance, final Tolerance shortTolerance,
    +            final Tolerance stringTolerance, final Tolerance uriTolerance, 
final Map<String, List<String>> equivalentTermsMap,
    +            final boolean isDetectionEnabled)
    +    {
    +        init(booleanTolerance, byteTolerance, dateTolerance, 
doubleTolerance, floatTolerance,
    +            integerTolerance, longTolerance, shortTolerance, 
stringTolerance, uriTolerance, equivalentTermsMap, isDetectionEnabled);
    +    }
    +
    +    private void init(final Tolerance booleanTolerance, final Tolerance 
byteTolerance,
    +            final Tolerance dateTolerance, final Tolerance 
doubleTolerance, final Tolerance floatTolerance,
    +            final Tolerance integerTolerance, final Tolerance 
longTolerance, final Tolerance shortTolerance,
    +            final Tolerance stringTolerance, final Tolerance uriTolerance, 
final Map<String, List<String>> equivalentTermsMap,
    +            final boolean isDetectionEnabled)
    +    {
    +        final List<ApproxEqualsDetector<?>> detectors = new ArrayList<>();
    +        detectors.add(new BooleanApproxEqualsDetector(booleanTolerance));
    +        detectors.add(new ByteApproxEqualsDetector(byteTolerance));
    +        detectors.add(new DateApproxEqualsDetector(dateTolerance));
    +        detectors.add(new DateTimeApproxEqualsDetector(dateTolerance));
    +        detectors.add(new DoubleApproxEqualsDetector(doubleTolerance));
    +        detectors.add(new FloatApproxEqualsDetector(floatTolerance));
    +        detectors.add(new IntegerApproxEqualsDetector(integerTolerance));
    +        detectors.add(new LongApproxEqualsDetector(longTolerance));
    +        detectors.add(new ShortApproxEqualsDetector(shortTolerance));
    +        detectors.add(new StringApproxEqualsDetector(stringTolerance, 
equivalentTermsMap));
    +        detectors.add(new UriApproxEqualsDetector(uriTolerance));
    +
    +        for (final ApproxEqualsDetector<?> approxEqualsDetector : 
detectors) {
    +            uriMap.put(approxEqualsDetector.getXmlSchemaUri(), 
approxEqualsDetector);
    +            classMap.put(approxEqualsDetector.getTypeClass(), 
approxEqualsDetector);
    +        }
    +
    +        this.isDetectionEnabled = isDetectionEnabled;
    +    }
    +
    +    /**
    +     * @return {@code true} to enable detection. {@code false} to disable
    +     * detection.
    +     */
    +    public boolean isDetectionEnabled() {
    +        return isDetectionEnabled;
    +    }
    +
    +    /**
    +     * Removes any duplicate (nearly identical) entities from the 
collection
    +     * of entities.
    +     * @param entities the {@link List} of {@link Entity}s. (not {@code 
null})
    +     * @throws SmartUriException
    +     */
    +    public void removeDuplicatesFromCollection(final List<Entity> 
entities) throws SmartUriException {
    +        requireNonNull(entities);
    +        // Use a Sorted Set in reverse order to hold the indices
    +        final Set<Integer> indicesToRemove = new TreeSet<>((a, b) -> 
Integer.compare(b, a));
    +        if (entities != null && entities.size() > 1) {
    +            // Compare all entities to each other while avoiding making the
    +            // same comparisons again and not comparing an entity to 
itself.
    +            for (int i = 0; i < entities.size() - 1; i++) {
    +                final Entity entity1 = entities.get(i);
    +                for (int j = entities.size() - 1; j > i; j--) {
    +                    final Entity entity2 = entities.get(j);
    +                    final boolean areDuplicates = compareEntities(entity1, 
entity2);
    +                    if (areDuplicates) {
    +                        indicesToRemove.add(j);
    +                    }
    +                }
    +            }
    +        }
    +        if (!indicesToRemove.isEmpty()) {
    +            // Remove indices in reverse order (already sorted in 
descending
    +            // order so just loop through them)
    +            for (final int index : indicesToRemove) {
    +                entities.remove(index);
    +            }
    +        }
    +    }
    +
    +    /**
    +     * Compares two Smart URI's to determine if they have nearly identical 
data.
    +     * @param uri1 the first Smart {@link URI}. (not {@code null})
    +     * @param uri2 the second Smart {@link URI}. (not {@code null})
    +     * @return {@code true} if the two Smart URI's have nearly identical 
data.
    +     * {@code false} otherwise.
    +     * @throws SmartUriException
    +     */
    +    public boolean compareSmartUris(final URI uri1, final URI uri2) throws 
SmartUriException {
    +        requireNonNull(uri1);
    +        requireNonNull(uri2);
    +        final Entity entity1 = SmartUriAdapter.deserializeUriEntity(uri1);
    +        final Entity entity2 = SmartUriAdapter.deserializeUriEntity(uri2);
    +        return compareEntities(entity1, entity2);
    +    }
    +
    +    /**
    +     * Compares two entities to determine if they have nearly identical 
data.
    +     * @param entity1 the first {@link Entity}. (not {@code null})
    +     * @param entity2 the second {@link Entity}. (not {@code null})
    +     * @return {@code true} if the two entities have nearly identical data.
    +     * {@code false} otherwise.
    +     * @throws SmartUriException
    +     */
    +    public boolean compareEntities(final Entity entity1, final Entity 
entity2) throws SmartUriException {
    +        requireNonNull(entity1);
    +        requireNonNull(entity2);
    +        boolean allValuesNearlyEqual = true;
    +
    +        final List<RyaURI> types1 = entity1.getExplicitTypeIds();
    +        final List<RyaURI> types2 = entity2.getExplicitTypeIds();
    +        final boolean doBothHaveSameTypes = types1.containsAll(types2);
    +        if (!doBothHaveSameTypes) {
    +            return false;
    +        }
    +        for (final Entry<RyaURI, ImmutableMap<RyaURI, Property>> entry : 
entity1.getProperties().entrySet()) {
    +            final RyaURI typeIdUri = entry.getKey();
    +            for (final Entry<RyaURI, Property> typeProperty : 
entry.getValue().entrySet()) {
    +                final RyaURI propertyNameUri = typeProperty.getKey();
    +                final Property property1 = typeProperty.getValue();
    +
    +                final Optional<Property> p2 = 
entity2.lookupTypeProperty(typeIdUri, propertyNameUri);
    +                if (p2.isPresent()) {
    +                    final Property property2 = p2.get();
    +                    final RyaType value1 = property1.getValue();
    +                    final RyaType value2 = property2.getValue();
    +                    final String data1 = value1.getData();
    +                    final String data2 = value2.getData();
    +                    final URI xmlSchemaUri1 = value1.getDataType();
    +                    final ApproxEqualsDetector<?> approxEqualsDetector = 
uriMap.get(xmlSchemaUri1);
    +                    if (approxEqualsDetector == null) {
    +                        throw new SmartUriException("No appropriate 
detector found for the type: " + xmlSchemaUri1);
    +                    }
    +                    final boolean approxEquals = 
approxEqualsDetector.areApproxEquals(data1, data2);
    +                    if (!approxEquals) {
    +                        allValuesNearlyEqual = false;
    +                        break;
    +                    }
    +                } else {
    +                    allValuesNearlyEqual = false;
    +                    break;
    +                }
    +            }
    +            if (!allValuesNearlyEqual) {
    +                break;
    +            }
    +        }
    +        return allValuesNearlyEqual;
    +    }
    +
    +    /**
    +     * Gets the appropriate {@link ApproxEqualsDetector} for the specified
    +     * class.
    +     * @param clazz the {@link Class} to find an {@link 
ApproxEqualsDetector}
    +     * for.
    +     * @return the {@link ApproxEqualsDetector} for the class or {@code 
null} if
    +     * none could be found.
    +     */
    +    public ApproxEqualsDetector<?> getDetectorForType(final Class<?> 
clazz) {
    +        return classMap.get(clazz);
    +    }
    +
    +    private static boolean isOnlyOneNull(final Object lhs, final Object 
rhs) {
    +        return (lhs == null && rhs != null) || (lhs != null && rhs == 
null);
    +    }
    +
    +    /**
    +     * Class to detect if two booleans are considered approximately equal 
to
    +     * each other.
    +     */
    +    public static class BooleanApproxEqualsDetector implements 
ApproxEqualsDetector<Boolean> {
    +        private static final Tolerance DEFAULT_TOLERANCE = new 
Tolerance(0.0, ToleranceType.DIFFERENCE);
    +        private final Tolerance tolerance;
    +
    +        /**
    +         * Creates a new instance of {@link BooleanApproxEqualsDetector}.
    +         * @param tolerance the {@link Tolerance}.
    +         */
    +        public BooleanApproxEqualsDetector(final Tolerance tolerance) {
    +            this.tolerance = tolerance != null ? tolerance : 
getDefaultTolerance();
    +        }
    +
    +        @Override
    +        public boolean areObjectsApproxEquals(final Boolean lhs, final 
Boolean rhs) {
    +            // Should never be almost equals when tolerance is 0, only 
exactly equals
    +            // Otherwise if there's any tolerance specified everything is 
equal
    +            return tolerance.getValue() == 0 ? Objects.equals(lhs, rhs) : 
true;
    +        }
    +
    +        @Override
    +        public Tolerance getDefaultTolerance() {
    +            return DEFAULT_TOLERANCE;
    +        }
    +
    +        @Override
    +        public Boolean convertStringToObject(final String string) throws 
SmartUriException {
    +            return Boolean.valueOf(string);
    +        }
    +
    +        @Override
    +        public Class<?> getTypeClass() {
    +            return Boolean.class;
    +        }
    +
    +        @Override
    +        public URI getXmlSchemaUri() {
    +            return XMLSchema.BOOLEAN;
    +        }
    +    }
    +
    +    /**
    +     * Class to detect if two bytes are considered approximately equal to 
each
    +     * other.
    +     */
    +    public static class ByteApproxEqualsDetector implements 
ApproxEqualsDetector<Byte> {
    +        private static final Tolerance DEFAULT_TOLERANCE = new 
Tolerance(0.0, ToleranceType.DIFFERENCE);
    +        private final Tolerance tolerance;
    +
    +        /**
    +         * Creates a new instance of {@link ByteApproxEqualsDetector}.
    +         * @param tolerance the {@link Tolerance}.
    +         */
    +        public ByteApproxEqualsDetector(final Tolerance tolerance) {
    +            this.tolerance = tolerance != null ? tolerance : 
getDefaultTolerance();
    +        }
    +
    +        @Override
    +        public boolean areObjectsApproxEquals(final Byte lhs, final Byte 
rhs) {
    --- End diff --
    
    Okay. Got it.  Was thinking about it outside of that context.


> Smart URI avoid data duplication
> --------------------------------
>
>                 Key: RYA-250
>                 URL: https://issues.apache.org/jira/browse/RYA-250
>             Project: Rya
>          Issue Type: Task
>          Components: dao
>    Affects Versions: 3.2.10
>            Reporter: Eric White
>            Assignee: Eric White
>             Fix For: 3.2.10
>
>
> Implement Smart URI methods for avoiding data duplication.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to