http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/b319365e/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/DuplicateDataDetector.java ---------------------------------------------------------------------- diff --git a/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/DuplicateDataDetector.java b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/DuplicateDataDetector.java new file mode 100644 index 0000000..220db30 --- /dev/null +++ b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/DuplicateDataDetector.java @@ -0,0 +1,1066 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.rya.indexing.smarturi.duplication; + +import static java.util.Objects.requireNonNull; + +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.commons.configuration.ConfigurationException; +import org.apache.commons.lang.StringUtils; +import org.apache.rya.api.domain.RyaType; +import org.apache.rya.api.domain.RyaURI; +import org.apache.rya.api.resolver.impl.DateTimeRyaTypeResolver; +import org.apache.rya.indexing.entity.model.Entity; +import org.apache.rya.indexing.entity.model.Property; +import org.apache.rya.indexing.smarturi.SmartUriAdapter; +import org.apache.rya.indexing.smarturi.SmartUriException; +import org.apache.rya.indexing.smarturi.duplication.conf.DuplicateDataConfig; +import org.calrissian.mango.types.exception.TypeEncodingException; +import org.joda.time.DateTime; +import org.openrdf.model.URI; +import org.openrdf.model.impl.URIImpl; +import org.openrdf.model.vocabulary.XMLSchema; + +import com.google.common.collect.ImmutableMap; + +/** + * Detects if two entities contain data that's nearly identical based on a set + * tolerance for each field's type. Two entities are considered nearly + * identical if all their properties are equal and/or within the specified + * tolerance for the property's object type. Setting all object type tolerances + * to 0 means that the objects need to be exactly equal to each other to be + * considered duplicates. Duplicate data detection can be enabled/disabled + * through configuration and each object type can have a tolerance based on + * either the difference or the percentage difference between the objects being + * compared. 
+ */ +public class DuplicateDataDetector { + private final Map<URI, ApproxEqualsDetector<?>> uriMap = new HashMap<>(); + private final Map<Class<?>, ApproxEqualsDetector<?>> classMap = new HashMap<>(); + + private boolean isDetectionEnabled; + + /** + * Creates a new instance of {@link DuplicateDataDetector} with the + * values provided by the configuration file. + * @param duplicateDataConfig the {@link DuplicateDataConfig} + */ + public DuplicateDataDetector(final DuplicateDataConfig duplicateDataConfig) { + this(duplicateDataConfig.getBooleanTolerance(), + duplicateDataConfig.getByteTolerance(), + duplicateDataConfig.getDateTolerance(), + duplicateDataConfig.getDoubleTolerance(), + duplicateDataConfig.getFloatTolerance(), + duplicateDataConfig.getIntegerTolerance(), + duplicateDataConfig.getLongTolerance(), + duplicateDataConfig.getShortTolerance(), + duplicateDataConfig.getStringTolerance(), + duplicateDataConfig.getUriTolerance(), + duplicateDataConfig.getEquivalentTermsMap(), + duplicateDataConfig.isDetectionEnabled() + ); + } + + /** + * Creates a new instance of {@link DuplicateDataDetector} with the values + * from the config. + * @throws ConfigurationException + */ + public DuplicateDataDetector() throws ConfigurationException { + this(new DuplicateDataConfig()); + } + + /** + * Creates a new instance of {@link DuplicateDataDetector}. + * @param tolerance the tolerance to assign to all types. + */ + public DuplicateDataDetector(final double tolerance) { + this(new Tolerance(tolerance, ToleranceType.DIFFERENCE), new LinkedHashMap<>()); + } + + /** + * Creates a new instance of {@link DuplicateDataDetector}. + * @param tolerance the tolerance to assign to all types. + * @param equivalentTermsMap the {@link Map} of terms that are considered + * equivalent to each other. 
(not {@code null}) + */ + public DuplicateDataDetector(final Tolerance tolerance, final Map<String, List<String>> equivalentTermsMap) { + this(tolerance, tolerance, tolerance, tolerance, tolerance, + tolerance, tolerance, tolerance, tolerance, tolerance , equivalentTermsMap, true); + } + + /** + * Creates a new instance of {@link DuplicateDataDetector}. + * @param booleanTolerance the {@link Boolean} tolerance value or + * {@code null} if not specified. + * @param byteTolerance the {@link Byte} tolerance value or {@code null} if + * not specified. + * @param dateTolerance the {@link Date} tolerance value or {@code null} if + * not specified. + * @param doubleTolerance the {@link Double} tolerance value or {@code null} + * if not specified. + * @param floatTolerance the {@link Float} tolerance value or {@code null} + * if not specified. + * @param integerTolerance the {@link Integer} tolerance value or + * {@code null} if not specified. + * @param longTolerance the {@link Long} tolerance value or {@code null} if + * not specified. + * @param shortTolerance the {@link Short} tolerance value or {@code null} + * if not specified. + * @param stringTolerance the {@link String} tolerance value or {@code null} + * if not specified. + * @param uriTolerance the {@link URI} tolerance value or {@code null} if + * not specified. + * @param equivalentTermsMap the {@link Map} of terms that are considered + * equivalent to each other. (not {@code null}) + * @param isDetectionEnabled {@code true} to enable detection. {@code false} + * to disable detection. 
+ */ + public DuplicateDataDetector(final Tolerance booleanTolerance, final Tolerance byteTolerance, + final Tolerance dateTolerance, final Tolerance doubleTolerance, final Tolerance floatTolerance, + final Tolerance integerTolerance, final Tolerance longTolerance, final Tolerance shortTolerance, + final Tolerance stringTolerance, final Tolerance uriTolerance, final Map<String, List<String>> equivalentTermsMap, + final boolean isDetectionEnabled) + { + init(booleanTolerance, byteTolerance, dateTolerance, doubleTolerance, floatTolerance, + integerTolerance, longTolerance, shortTolerance, stringTolerance, uriTolerance, equivalentTermsMap, isDetectionEnabled); + } + + private void init(final Tolerance booleanTolerance, final Tolerance byteTolerance, + final Tolerance dateTolerance, final Tolerance doubleTolerance, final Tolerance floatTolerance, + final Tolerance integerTolerance, final Tolerance longTolerance, final Tolerance shortTolerance, + final Tolerance stringTolerance, final Tolerance uriTolerance, final Map<String, List<String>> equivalentTermsMap, + final boolean isDetectionEnabled) + { + final List<ApproxEqualsDetector<?>> detectors = new ArrayList<>(); + detectors.add(new BooleanApproxEqualsDetector(booleanTolerance)); + detectors.add(new ByteApproxEqualsDetector(byteTolerance)); + detectors.add(new DateApproxEqualsDetector(dateTolerance)); + detectors.add(new DateTimeApproxEqualsDetector(dateTolerance)); + detectors.add(new DoubleApproxEqualsDetector(doubleTolerance)); + detectors.add(new FloatApproxEqualsDetector(floatTolerance)); + detectors.add(new IntegerApproxEqualsDetector(integerTolerance)); + detectors.add(new LongApproxEqualsDetector(longTolerance)); + detectors.add(new ShortApproxEqualsDetector(shortTolerance)); + detectors.add(new StringApproxEqualsDetector(stringTolerance, equivalentTermsMap)); + detectors.add(new UriApproxEqualsDetector(uriTolerance)); + + for (final ApproxEqualsDetector<?> approxEqualsDetector : detectors) { + 
uriMap.put(approxEqualsDetector.getXmlSchemaUri(), approxEqualsDetector); + classMap.put(approxEqualsDetector.getTypeClass(), approxEqualsDetector); + } + + this.isDetectionEnabled = isDetectionEnabled; + } + + /** + * @return {@code true} to enable detection. {@code false} to disable + * detection. + */ + public boolean isDetectionEnabled() { + return isDetectionEnabled; + } + + /** + * Removes any duplicate (nearly identical) entities from the collection + * of entities. + * @param entities the {@link List} of {@link Entity}s. (not {@code null}) + * @throws SmartUriException + */ + public void removeDuplicatesFromCollection(final List<Entity> entities) throws SmartUriException { + requireNonNull(entities); + // Use a Sorted Set in reverse order to hold the indices + final Set<Integer> indicesToRemove = new TreeSet<>((a, b) -> Integer.compare(b, a)); + if (entities != null && entities.size() > 1) { + // Compare all entities to each other while avoiding making the + // same comparisons again and not comparing an entity to itself. + for (int i = 0; i < entities.size() - 1; i++) { + final Entity entity1 = entities.get(i); + for (int j = entities.size() - 1; j > i; j--) { + final Entity entity2 = entities.get(j); + final boolean areDuplicates = compareEntities(entity1, entity2); + if (areDuplicates) { + indicesToRemove.add(j); + } + } + } + } + if (!indicesToRemove.isEmpty()) { + // Remove indices in reverse order (already sorted in descending + // order so just loop through them) + for (final int index : indicesToRemove) { + entities.remove(index); + } + } + } + + /** + * Compares two Smart URI's to determine if they have nearly identical data. + * @param uri1 the first Smart {@link URI}. (not {@code null}) + * @param uri2 the second Smart {@link URI}. (not {@code null}) + * @return {@code true} if the two Smart URI's have nearly identical data. + * {@code false} otherwise. 
+ * @throws SmartUriException + */ + public boolean compareSmartUris(final URI uri1, final URI uri2) throws SmartUriException { + requireNonNull(uri1); + requireNonNull(uri2); + final Entity entity1 = SmartUriAdapter.deserializeUriEntity(uri1); + final Entity entity2 = SmartUriAdapter.deserializeUriEntity(uri2); + return compareEntities(entity1, entity2); + } + + /** + * Compares two entities to determine if they have nearly identical data. + * @param entity1 the first {@link Entity}. (not {@code null}) + * @param entity2 the second {@link Entity}. (not {@code null}) + * @return {@code true} if the two entities have nearly identical data. + * {@code false} otherwise. + * @throws SmartUriException + */ + public boolean compareEntities(final Entity entity1, final Entity entity2) throws SmartUriException { + requireNonNull(entity1); + requireNonNull(entity2); + boolean allValuesNearlyEqual = true; + + final List<RyaURI> types1 = entity1.getExplicitTypeIds(); + final List<RyaURI> types2 = entity2.getExplicitTypeIds(); + final boolean doBothHaveSameTypes = types1.containsAll(types2); + if (!doBothHaveSameTypes) { + return false; + } + for (final Entry<RyaURI, ImmutableMap<RyaURI, Property>> entry : entity1.getProperties().entrySet()) { + final RyaURI typeIdUri = entry.getKey(); + for (final Entry<RyaURI, Property> typeProperty : entry.getValue().entrySet()) { + final RyaURI propertyNameUri = typeProperty.getKey(); + final Property property1 = typeProperty.getValue(); + + final Optional<Property> p2 = entity2.lookupTypeProperty(typeIdUri, propertyNameUri); + if (p2.isPresent()) { + final Property property2 = p2.get(); + final RyaType value1 = property1.getValue(); + final RyaType value2 = property2.getValue(); + final String data1 = value1.getData(); + final String data2 = value2.getData(); + final URI xmlSchemaUri1 = value1.getDataType(); + final ApproxEqualsDetector<?> approxEqualsDetector = uriMap.get(xmlSchemaUri1); + if (approxEqualsDetector == null) { + throw new 
SmartUriException("No appropriate detector found for the type: " + xmlSchemaUri1); + } + final boolean approxEquals = approxEqualsDetector.areApproxEquals(data1, data2); + if (!approxEquals) { + allValuesNearlyEqual = false; + break; + } + } else { + allValuesNearlyEqual = false; + break; + } + } + if (!allValuesNearlyEqual) { + break; + } + } + return allValuesNearlyEqual; + } + + /** + * Gets the appropriate {@link ApproxEqualsDetector} for the specified + * class. + * @param clazz the {@link Class} to find an {@link ApproxEqualsDetector} + * for. + * @return the {@link ApproxEqualsDetector} for the class or {@code null} if + * none could be found. + */ + public ApproxEqualsDetector<?> getDetectorForType(final Class<?> clazz) { + return classMap.get(clazz); + } + + private static boolean isOnlyOneNull(final Object lhs, final Object rhs) { + return (lhs == null && rhs != null) || (lhs != null && rhs == null); + } + + /** + * Class to detect if two booleans are considered approximately equal to + * each other. + */ + public static class BooleanApproxEqualsDetector implements ApproxEqualsDetector<Boolean> { + private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(0.0, ToleranceType.DIFFERENCE); + private final Tolerance tolerance; + + /** + * Creates a new instance of {@link BooleanApproxEqualsDetector}. + * @param tolerance the {@link Tolerance}. + */ + public BooleanApproxEqualsDetector(final Tolerance tolerance) { + this.tolerance = tolerance != null ? tolerance : getDefaultTolerance(); + } + + @Override + public boolean areObjectsApproxEquals(final Boolean lhs, final Boolean rhs) { + // Should never be almost equals when tolerance is 0, only exactly equals + // Otherwise if there's any tolerance specified everything is equal + return tolerance.getValue() == 0 ? 
Objects.equals(lhs, rhs) : true; + } + + @Override + public Tolerance getDefaultTolerance() { + return DEFAULT_TOLERANCE; + } + + @Override + public Boolean convertStringToObject(final String string) throws SmartUriException { + return Boolean.valueOf(string); + } + + @Override + public Class<?> getTypeClass() { + return Boolean.class; + } + + @Override + public URI getXmlSchemaUri() { + return XMLSchema.BOOLEAN; + } + } + + /** + * Class to detect if two bytes are considered approximately equal to each + * other. + */ + public static class ByteApproxEqualsDetector implements ApproxEqualsDetector<Byte> { + private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(0.0, ToleranceType.DIFFERENCE); + private final Tolerance tolerance; + + /** + * Creates a new instance of {@link ByteApproxEqualsDetector}. + * @param tolerance the {@link Tolerance}. + */ + public ByteApproxEqualsDetector(final Tolerance tolerance) { + this.tolerance = tolerance != null ? tolerance : getDefaultTolerance(); + } + + @Override + public boolean areObjectsApproxEquals(final Byte lhs, final Byte rhs) { + if (isOnlyOneNull(lhs, rhs)) { + return false; + } + if (Objects.equals(lhs, rhs)) { + // They're exactly equals so get out + return true; + } else if (tolerance.getValue() == 0) { + // If they're not exactly equals with zero tolerance then get out + return false; + } + // Check based on tolerance + switch (tolerance.getToleranceType()) { + case PERCENTAGE: + if (lhs == 0) { + return lhs == rhs; + } + if (tolerance.getValue() >= 1) { + return true; + } + return ((double)Math.abs(lhs - rhs) / lhs) <= tolerance.getValue(); + case DIFFERENCE: + default: + return Math.abs(lhs - rhs) <= tolerance.getValue(); + } + } + + @Override + public Tolerance getDefaultTolerance() { + return DEFAULT_TOLERANCE; + } + + @Override + public Byte convertStringToObject(final String string) throws SmartUriException { + return Byte.valueOf(string); + } + + @Override + public Class<?> getTypeClass() { + return 
Byte.class; + } + + @Override + public URI getXmlSchemaUri() { + return XMLSchema.BYTE; + } + } + + /** + * Class to detect if two dates are considered approximately equal to each + * other. + */ + public static class DateApproxEqualsDetector implements ApproxEqualsDetector<Date> { + private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(500.0, ToleranceType.DIFFERENCE); // milliseconds + private final Tolerance tolerance; + + /** + * Creates a new instance of {@link DateApproxEqualsDetector}. + * @param tolerance the {@link Tolerance}. + */ + public DateApproxEqualsDetector(final Tolerance tolerance) { + this.tolerance = tolerance != null ? tolerance : getDefaultTolerance(); + } + + @Override + public boolean areObjectsApproxEquals(final Date lhs, final Date rhs) { + if (isOnlyOneNull(lhs, rhs)) { + return false; + } + if (Objects.equals(lhs, rhs)) { + // They're exactly equals so get out + return true; + } else if (tolerance.getValue() == 0) { + // If they're not exactly equals with zero tolerance then get out + return false; + } + // Check based on tolerance + final long lhsTime = lhs.getTime(); + final long rhsTime = rhs.getTime(); + switch (tolerance.getToleranceType()) { + case PERCENTAGE: + if (lhsTime == 0) { + return lhsTime == rhsTime; + } + if (tolerance.getValue() >= 1) { + return true; + } + return ((double)Math.abs(lhsTime - rhsTime) / lhsTime) <= tolerance.getValue(); + case DIFFERENCE: + default: + return Math.abs(lhsTime - rhsTime) <= tolerance.getValue(); + } + } + + @Override + public Tolerance getDefaultTolerance() { + return DEFAULT_TOLERANCE; + } + + @Override + public Date convertStringToObject(final String string) throws SmartUriException { + DateTime dateTime = null; + try { + dateTime = DateTime.parse(string, DateTimeRyaTypeResolver.XMLDATETIME_PARSER); + } catch (final TypeEncodingException e) { + throw new SmartUriException("Exception occurred serializing data[" + string + "]", e); + } + final Date date = dateTime.toDate(); + 
return date; + } + + @Override + public Class<?> getTypeClass() { + return Date.class; + } + + @Override + public URI getXmlSchemaUri() { + return XMLSchema.DATE; + } + } + + /** + * Class to detect if two datetimes are considered approximately equal to + * each other. + */ + public static class DateTimeApproxEqualsDetector implements ApproxEqualsDetector<DateTime> { + private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(500.0, ToleranceType.DIFFERENCE); // milliseconds + private final Tolerance tolerance; + + /** + * Creates a new instance of {@link DateTimeApproxEqualsDetector}. + * @param tolerance the {@link Tolerance}. + */ + public DateTimeApproxEqualsDetector(final Tolerance tolerance) { + this.tolerance = tolerance != null ? tolerance : getDefaultTolerance(); + } + + @Override + public boolean areObjectsApproxEquals(final DateTime lhs, final DateTime rhs) { + if (isOnlyOneNull(lhs, rhs)) { + return false; + } + if (Objects.equals(lhs, rhs)) { + // They're exactly equals so get out + return true; + } else if (tolerance.getValue() == 0) { + // If they're not exactly equals with zero tolerance then get out + return false; + } + // Check based on tolerance + final long lhsTime = lhs.getMillis(); + final long rhsTime = rhs.getMillis(); + switch (tolerance.getToleranceType()) { + case PERCENTAGE: + if (lhsTime == 0) { + return lhsTime == rhsTime; + } + if (tolerance.getValue() >= 1) { + return true; + } + return ((double)Math.abs(lhsTime - rhsTime) / lhsTime) <= tolerance.getValue(); + case DIFFERENCE: + default: + return Math.abs(lhsTime - rhsTime) <= tolerance.getValue(); + } + } + + @Override + public Tolerance getDefaultTolerance() { + return DEFAULT_TOLERANCE; + } + + @Override + public DateTime convertStringToObject(final String string) throws SmartUriException { + DateTime dateTime = null; + try { + dateTime = DateTime.parse(string, DateTimeRyaTypeResolver.XMLDATETIME_PARSER); + } catch (final TypeEncodingException e) { + throw new 
SmartUriException("Exception occurred serializing data[" + string + "]", e); + } + return dateTime; + } + + @Override + public Class<?> getTypeClass() { + return DateTime.class; + } + + @Override + public URI getXmlSchemaUri() { + return XMLSchema.DATETIME; + } + } + + /** + * Class to detect if two doubles are considered approximately equal to each + * other. + */ + public static class DoubleApproxEqualsDetector implements ApproxEqualsDetector<Double> { + private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(0.0001, ToleranceType.PERCENTAGE); + private final Tolerance tolerance; + + /** + * Creates a new instance of {@link DoubleApproxEqualsDetector}. + * @param tolerance the {@link Tolerance}. + */ + public DoubleApproxEqualsDetector(final Tolerance tolerance) { + this.tolerance = tolerance != null ? tolerance : getDefaultTolerance(); + } + + @Override + public boolean areObjectsApproxEquals(final Double lhs, final Double rhs) { + if (isOnlyOneNull(lhs, rhs)) { + return false; + } + if (Objects.equals(lhs, rhs)) { + // They're exactly equals so get out + return true; + } else if (tolerance.getValue() == 0) { + // If they're not exactly equals with zero tolerance then get out + return false; + } + // Doubles can be unpredictable with how they store a value + // like 0.1. So use BigDecimal with its String constructor + // to make things more predictable. + final BigDecimal lhsBd = new BigDecimal(String.valueOf(lhs)); + final BigDecimal rhsBd = new BigDecimal(String.valueOf(rhs)); + switch (tolerance.getToleranceType()) { + case PERCENTAGE: + if (lhs == 0) { + return lhs == rhs; + } + if (tolerance.getValue() >= 1) { + return true; + } + final BigDecimal absDiff = lhsBd.subtract(rhsBd).abs(); + try { + final BigDecimal percent = absDiff.divide(lhsBd); + return percent.doubleValue() <= tolerance.getValue(); + } catch (final ArithmeticException e) { + // BigDecimal quotient did not have a terminating + // decimal expansion. So, try without BigDecimal. 
+ return (Math.abs(lhs - rhs) / lhs) <= tolerance.getValue(); + } + case DIFFERENCE: + default: + final BigDecimal absDiff1 = lhsBd.subtract(rhsBd).abs(); + return absDiff1.doubleValue() <= tolerance.getValue(); + //return Math.abs(lhs - rhs) <= tolerance.getValue(); + } + } + + @Override + public Tolerance getDefaultTolerance() { + return DEFAULT_TOLERANCE; + } + + @Override + public Double convertStringToObject(final String string) throws SmartUriException { + return Double.valueOf(string); + } + + @Override + public Class<?> getTypeClass() { + return Double.class; + } + + @Override + public URI getXmlSchemaUri() { + return XMLSchema.DOUBLE; + } + } + + /** + * Class to detect if two floats are considered approximately equal to each + * other. + */ + public static class FloatApproxEqualsDetector implements ApproxEqualsDetector<Float> { + private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(0.0001, ToleranceType.PERCENTAGE); + private final Tolerance tolerance; + + /** + * Creates a new instance of {@link FloatApproxEqualsDetector}. + * @param tolerance the {@link Tolerance}. + */ + public FloatApproxEqualsDetector(final Tolerance tolerance) { + this.tolerance = tolerance != null ? tolerance : getDefaultTolerance(); + } + + @Override + public boolean areObjectsApproxEquals(final Float lhs, final Float rhs) { + if (isOnlyOneNull(lhs, rhs)) { + return false; + } + if (Objects.equals(lhs, rhs)) { + // They're exactly equals so get out + return true; + } else if (tolerance.getValue() == 0) { + // If they're not exactly equals with zero tolerance then get out + return false; + } + // Check based on tolerance + // Floats can be unpredictable with how they store a value + // like 0.1. So use BigDecimal with its String constructor + // to make things more predictable. 
+ final BigDecimal lhsBd = new BigDecimal(String.valueOf(lhs)); + final BigDecimal rhsBd = new BigDecimal(String.valueOf(rhs)); + switch (tolerance.getToleranceType()) { + case PERCENTAGE: + if (lhs == 0) { + return lhs == rhs; + } + if (tolerance.getValue() >= 1) { + return true; + } + final BigDecimal absDiff = lhsBd.subtract(rhsBd).abs(); + try { + final BigDecimal percent = absDiff.divide(lhsBd); + return percent.floatValue() <= tolerance.getValue(); + } catch (final ArithmeticException e) { + // BigDecimal quotient did not have a terminating + // decimal expansion. So, try without BigDecimal. + return ((double)Math.abs(lhs - rhs) / lhs) <= tolerance.getValue(); + } + case DIFFERENCE: + default: + final BigDecimal absDiff1 = lhsBd.subtract(rhsBd).abs(); + return absDiff1.floatValue() <= tolerance.getValue(); + //return Math.abs(lhs - rhs) <= tolerance.getValue(); + } + } + + @Override + public Tolerance getDefaultTolerance() { + return DEFAULT_TOLERANCE; + } + + @Override + public Float convertStringToObject(final String string) throws SmartUriException { + return Float.valueOf(string); + } + + @Override + public Class<?> getTypeClass() { + return Float.class; + } + + @Override + public URI getXmlSchemaUri() { + return XMLSchema.FLOAT; + } + } + + /** + * Class to detect if two integers are considered approximately equal to + * each other. + */ + public static class IntegerApproxEqualsDetector implements ApproxEqualsDetector<Integer> { + private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(1.0, ToleranceType.DIFFERENCE); + private final Tolerance tolerance; + + /** + * Creates a new instance of {@link IntegerApproxEqualsDetector}. + * @param tolerance the {@link Tolerance}. + */ + public IntegerApproxEqualsDetector(final Tolerance tolerance) { + this.tolerance = tolerance != null ? 
tolerance : getDefaultTolerance(); + } + + @Override + public boolean areObjectsApproxEquals(final Integer lhs, final Integer rhs) { + if (isOnlyOneNull(lhs, rhs)) { + return false; + } + if (Objects.equals(lhs, rhs)) { + // They're exactly equals so get out + return true; + } else if (tolerance.getValue() == 0) { + // If they're not exactly equals with zero tolerance then get out + return false; + } + // Check based on tolerance + switch (tolerance.getToleranceType()) { + case PERCENTAGE: + if (lhs == 0) { + return lhs == rhs; + } + if (tolerance.getValue() >= 1) { + return true; + } + return ((double)Math.abs(lhs - rhs) / lhs) <= tolerance.getValue(); + case DIFFERENCE: + default: + return Math.abs(lhs - rhs) <= tolerance.getValue(); + } + } + + @Override + public Tolerance getDefaultTolerance() { + return DEFAULT_TOLERANCE; + } + + @Override + public Integer convertStringToObject(final String string) throws SmartUriException { + return Integer.valueOf(string); + } + + @Override + public Class<?> getTypeClass() { + return Integer.class; + } + + @Override + public URI getXmlSchemaUri() { + return XMLSchema.INTEGER; + } + } + + /** + * Class to detect if two longs are considered approximately equal to + * each other. + */ + public static class LongApproxEqualsDetector implements ApproxEqualsDetector<Long> { + private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(1.0, ToleranceType.DIFFERENCE); + private final Tolerance tolerance; + + /** + * Creates a new instance of {@link LongApproxEqualsDetector}. + * @param tolerance the {@link Tolerance}. + */ + public LongApproxEqualsDetector(final Tolerance tolerance) { + this.tolerance = tolerance != null ? 
tolerance : getDefaultTolerance(); + } + + @Override + public boolean areObjectsApproxEquals(final Long lhs, final Long rhs) { + if (isOnlyOneNull(lhs, rhs)) { + return false; + } + if (Objects.equals(lhs, rhs)) { + // They're exactly equals so get out + return true; + } else if (tolerance.getValue() == 0) { + // If they're not exactly equals with zero tolerance then get out + return false; + } + // Check based on tolerance + switch (tolerance.getToleranceType()) { + case PERCENTAGE: + if (lhs == 0) { + return lhs == rhs; + } + if (tolerance.getValue() >= 1) { + return true; + } + return ((double)Math.abs(lhs - rhs) / lhs) <= tolerance.getValue(); + case DIFFERENCE: + default: + return Math.abs(lhs - rhs) <= tolerance.getValue(); + } + } + + @Override + public Tolerance getDefaultTolerance() { + return DEFAULT_TOLERANCE; + } + + @Override + public Long convertStringToObject(final String string) throws SmartUriException { + return Long.valueOf(string); + } + + @Override + public Class<?> getTypeClass() { + return Long.class; + } + + @Override + public URI getXmlSchemaUri() { + return XMLSchema.LONG; + } + } + + /** + * Class to detect if two shorts are considered approximately equal to each + * other. + */ + public static class ShortApproxEqualsDetector implements ApproxEqualsDetector<Short> { + private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(1.0, ToleranceType.DIFFERENCE); + private final Tolerance tolerance; + + /** + * Creates a new instance of {@link ShortApproxEqualsDetector}. + * @param tolerance the {@link Tolerance}. + */ + public ShortApproxEqualsDetector(final Tolerance tolerance) { + this.tolerance = tolerance != null ? 
tolerance : getDefaultTolerance(); + } + + @Override + public boolean areObjectsApproxEquals(final Short lhs, final Short rhs) { + if (isOnlyOneNull(lhs, rhs)) { + return false; + } + if (Objects.equals(lhs, rhs)) { + // They're exactly equals so get out + return true; + } else if (tolerance.getValue() == 0) { + // If they're not exactly equals with zero tolerance then get out + return false; + } + // Check based on tolerance + switch (tolerance.getToleranceType()) { + case PERCENTAGE: + if (lhs == 0) { + return lhs == rhs; + } + if (tolerance.getValue() >= 1) { + return true; + } + return ((double)Math.abs(lhs - rhs) / lhs) <= tolerance.getValue(); + case DIFFERENCE: + default: + return Math.abs(lhs - rhs) <= tolerance.getValue(); + } + } + + @Override + public Tolerance getDefaultTolerance() { + return DEFAULT_TOLERANCE; + } + + @Override + public Short convertStringToObject(final String string) throws SmartUriException { + return Short.valueOf(string); + } + + @Override + public Class<?> getTypeClass() { + return Short.class; + } + + @Override + public URI getXmlSchemaUri() { + return XMLSchema.SHORT; + } + } + + /** + * Class to detect if two string are considered approximately equal to each + * other. + */ + public static class StringApproxEqualsDetector implements ApproxEqualsDetector<String> { + private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(0.05, ToleranceType.PERCENTAGE); + private final Tolerance tolerance; + private final Map<String, List<String>> equivalentTermsMap; + + /** + * Creates a new instance of {@link StringApproxEqualsDetector}. + * @param tolerance the {@link Tolerance}. + */ + public StringApproxEqualsDetector(final Tolerance tolerance, final Map<String, List<String>> equivalentTermsMap) { + this.tolerance = tolerance != null ? 
tolerance : getDefaultTolerance(); + this.equivalentTermsMap = equivalentTermsMap; + } + + @Override + public boolean areObjectsApproxEquals(final String lhs, final String rhs) { + if (isOnlyOneNull(lhs, rhs)) { + return false; + } + if (StringUtils.equalsIgnoreCase(lhs, rhs)) { + // They're exactly equals so get out + return true; + } else if (tolerance.getValue() == 0) { + // If they're not exactly equals with zero tolerance then get out + return false; + } + + // Only check one-way. Terms are not bi-directionally equivalent + // unless specified. + final List<String> lhsTermEquivalents = equivalentTermsMap.get(lhs); + if (lhsTermEquivalents != null && lhsTermEquivalents.contains(rhs)) { + return true; + } + final int distance = StringUtils.getLevenshteinDistance(lhs, rhs); + // Check based on tolerance + switch (tolerance.getToleranceType()) { + case PERCENTAGE: + if (lhs.length() == 0) { + return lhs.length() == rhs.length(); + } + if (tolerance.getValue() >= 1) { + return true; + } + return ((double)distance / lhs.length()) <= tolerance.getValue(); + case DIFFERENCE: + default: + return distance <= tolerance.getValue(); + } + } + + @Override + public Tolerance getDefaultTolerance() { + return DEFAULT_TOLERANCE; + } + + @Override + public String convertStringToObject(final String string) throws SmartUriException { + return string; + } + + @Override + public Class<?> getTypeClass() { + return String.class; + } + + @Override + public URI getXmlSchemaUri() { + return XMLSchema.STRING; + } + } + + /** + * Class to detect if two URIs are considered approximately equal to each + * other. + */ + public static class UriApproxEqualsDetector implements ApproxEqualsDetector<URI> { + private static final Tolerance DEFAULT_TOLERANCE = new Tolerance(1.0, ToleranceType.DIFFERENCE); + private final Tolerance tolerance; + + /** + * Creates a new instance of {@link UriApproxEqualsDetector}. + * @param tolerance the {@link Tolerance}. 
+ */ + public UriApproxEqualsDetector(final Tolerance tolerance) { + this.tolerance = tolerance != null ? tolerance : getDefaultTolerance(); + } + + @Override + public boolean areObjectsApproxEquals(final URI lhs, final URI rhs) { + if (isOnlyOneNull(lhs, rhs)) { + return false; + } + if (Objects.equals(lhs, rhs)) { + return true; + } + final String uriString1 = lhs.stringValue(); + final String uriString2 = rhs.stringValue(); + if (StringUtils.equalsIgnoreCase(uriString1, uriString2)) { + // They're exactly equals so get out + return true; + } else if (tolerance.getValue() == 0) { + // If they're not exactly equals with zero tolerance then get out + return false; + } + final int distance = StringUtils.getLevenshteinDistance(uriString1, uriString2); + // Check based on tolerance + switch (tolerance.getToleranceType()) { + case PERCENTAGE: + if (uriString1.length() == 0) { + return uriString1.length() == uriString2.length(); + } + if (tolerance.getValue() >= 1) { + return true; + } + return ((double)distance / uriString1.length()) <= tolerance.getValue(); + case DIFFERENCE: + default: + return distance <= tolerance.getValue(); + } + } + + @Override + public Tolerance getDefaultTolerance() { + return DEFAULT_TOLERANCE; + } + + @Override + public URI convertStringToObject(final String string) throws SmartUriException { + return new URIImpl(string); + } + + @Override + public Class<?> getTypeClass() { + return URI.class; + } + + @Override + public URI getXmlSchemaUri() { + return XMLSchema.ANYURI; + } + } +}
http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/b319365e/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/EntityNearDuplicateException.java ---------------------------------------------------------------------- diff --git a/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/EntityNearDuplicateException.java b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/EntityNearDuplicateException.java new file mode 100644 index 0000000..8bdf54f --- /dev/null +++ b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/EntityNearDuplicateException.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.rya.indexing.smarturi.duplication; + +import org.apache.rya.indexing.entity.model.Entity; +import org.apache.rya.indexing.entity.storage.EntityStorage.EntityStorageException; + +/** + * An {@link Entity} could not be created because another entity is a nearly + * identical duplicate based on the configured tolerances. 
+ */ +public class EntityNearDuplicateException extends EntityStorageException { + private static final long serialVersionUID = 1L; + + /** + * Creates a new instance of {@link EntityNearDuplicateException}. + * @param message the message to be displayed by the exception. + */ + public EntityNearDuplicateException(final String message) { + super(message); + } + + /** + * Creates a new instance of {@link EntityNearDuplicateException}. + * @param message the message to be displayed by the exception. + * @param throwable the source {@link Throwable} cause of the exception. + */ + public EntityNearDuplicateException(final String message, final Throwable throwable) { + super(message, throwable); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/b319365e/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/Tolerance.java ---------------------------------------------------------------------- diff --git a/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/Tolerance.java b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/Tolerance.java new file mode 100644 index 0000000..772522c --- /dev/null +++ b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/Tolerance.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.rya.indexing.smarturi.duplication; + +import static java.util.Objects.requireNonNull; + +import java.text.NumberFormat; + +/** + * Holds a tolerance value together with the {@link ToleranceType} that says + * how the value is to be applied. + */ +public class Tolerance { + private final Double value; + private final ToleranceType toleranceType; + + /** + * Creates a new instance of {@link Tolerance}. + * @param value the tolerance value. (not {@code null}) + * @param toleranceType the {@link ToleranceType}. (not {@code null}) + */ + public Tolerance(final Double value, final ToleranceType toleranceType) { + this.value = requireNonNull(value); + this.toleranceType = requireNonNull(toleranceType); + } + + /** + * @return the tolerance value. + */ + public Double getValue() { + return value; + } + + /** + * @return the {@link ToleranceType}.
+ */ + public ToleranceType getToleranceType() { + return toleranceType; + } + + @Override + public String toString() { + switch (toleranceType) { + case PERCENTAGE: + return NumberFormat.getPercentInstance().format(value); + case DIFFERENCE: + return value.toString(); + default: + return "Unknown Tolerance Type with value: " + value.toString(); + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/b319365e/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/ToleranceType.java ---------------------------------------------------------------------- diff --git a/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/ToleranceType.java b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/ToleranceType.java new file mode 100644 index 0000000..29faff1 --- /dev/null +++ b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/ToleranceType.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.rya.indexing.smarturi.duplication; + +import org.apache.commons.lang3.StringUtils; + +/** + * The types of methods available to use for calculating tolerance. 
+ */ +public enum ToleranceType { + /** + * Indicates that the difference between two values must be within the + * specified tolerance value to be accepted. + */ + DIFFERENCE, + /** + * Indicates that the difference between two values divided by the original + * value must fall within the specified tolerance percentage value to be + * accepted. + */ + PERCENTAGE; + + /** + * Returns the {@link ToleranceType} that matches the specified name. + * Letter case and leading/trailing whitespace are ignored. + * @param name the name to find. + * @return the {@link ToleranceType} or {@code null} if {@code name} is + * {@code null} or blank. + * @throws IllegalArgumentException if {@code name} is not blank and does + * not match any {@link ToleranceType}. + */ + public static ToleranceType getToleranceTypeByName(final String name) { + if (StringUtils.isNotBlank(name)) { + final String trimmedName = name.trim(); + for (final ToleranceType toleranceType : values()) { + if (toleranceType.name().equalsIgnoreCase(trimmedName)) { + return toleranceType; + } + } + // Keep failing loudly so that a misspelled type is not silently ignored + throw new IllegalArgumentException("No ToleranceType matches the name: " + name); + } + return null; + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/b319365e/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/conf/DuplicateDataConfig.java ---------------------------------------------------------------------- diff --git a/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/conf/DuplicateDataConfig.java b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/conf/DuplicateDataConfig.java new file mode 100644 index 0000000..98f65c7 --- /dev/null +++ b/extras/indexing/src/main/java/org/apache/rya/indexing/smarturi/duplication/conf/DuplicateDataConfig.java @@ -0,0 +1,337 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.rya.indexing.smarturi.duplication.conf; + +import static java.util.Objects.requireNonNull; + +import java.text.NumberFormat; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.configuration.ConfigurationException; +import org.apache.commons.configuration.XMLConfiguration; +import org.apache.rya.indexing.smarturi.duplication.Tolerance; +import org.apache.rya.indexing.smarturi.duplication.ToleranceType; + +/** + * Configuration options for data duplication. + */ +public class DuplicateDataConfig { + public static final String DEFAULT_CONFIG_FILE_PATH = "conf/duplicate_data_detection_config.xml"; + + private Tolerance booleanTolerance; + private Tolerance byteTolerance; + private Tolerance dateTolerance; + private Tolerance doubleTolerance; + private Tolerance floatTolerance; + private Tolerance integerTolerance; + private Tolerance longTolerance; + private Tolerance shortTolerance; + private Tolerance stringTolerance; + private Tolerance uriTolerance; + + private Map<String, List<String>> equivalentTermsMap; + + private boolean isDetectionEnabled; + + /** + * Creates a new instance of {@link DuplicateDataConfig}. + * @throws ConfigurationException + */ + public DuplicateDataConfig() throws ConfigurationException { + this(new XMLConfiguration(DEFAULT_CONFIG_FILE_PATH)); + } + + /** + * Creates a new instance of {@link DuplicateDataConfig}. 
+ * @param xmlFileLocation the config's XML file location. (not {@code null}) + * @throws ConfigurationException if the configuration could not be loaded + * or contains an invalid tolerance. + */ + public DuplicateDataConfig(final String xmlFileLocation) throws ConfigurationException { + this(new XMLConfiguration(requireNonNull(xmlFileLocation))); + } + + /** + * Creates a new instance of {@link DuplicateDataConfig}. + * @param xmlConfig the {@link XMLConfiguration} file. (not {@code null}) + * @throws ConfigurationException if the configuration contains an invalid + * tolerance. + */ + public DuplicateDataConfig(final XMLConfiguration xmlConfig) throws ConfigurationException { + requireNonNull(xmlConfig); + + final Tolerance booleanTolerance = parseTolerance("tolerances.booleanTolerance", xmlConfig); + final Tolerance byteTolerance = parseTolerance("tolerances.byteTolerance", xmlConfig); + final Tolerance dateTolerance = parseTolerance("tolerances.dateTolerance", xmlConfig); + final Tolerance doubleTolerance = parseTolerance("tolerances.doubleTolerance", xmlConfig); + final Tolerance floatTolerance = parseTolerance("tolerances.floatTolerance", xmlConfig); + final Tolerance integerTolerance = parseTolerance("tolerances.integerTolerance", xmlConfig); + final Tolerance longTolerance = parseTolerance("tolerances.longTolerance", xmlConfig); + final Tolerance shortTolerance = parseTolerance("tolerances.shortTolerance", xmlConfig); + final Tolerance stringTolerance = parseTolerance("tolerances.stringTolerance", xmlConfig); + final Tolerance uriTolerance = parseTolerance("tolerances.uriTolerance", xmlConfig); + + final Map<String, List<String>> equivalentTermsMap = parseEquivalentTermsMap(xmlConfig); + + final boolean isDetectionEnabled = xmlConfig.getBoolean("enableDetection", false); + init(booleanTolerance, byteTolerance, dateTolerance, doubleTolerance, floatTolerance, integerTolerance, longTolerance, shortTolerance, stringTolerance, uriTolerance, equivalentTermsMap, isDetectionEnabled); + } + + /** + * Creates a new instance of {@link DuplicateDataConfig}.
+ * @param booleanTolerance the {@link Boolean} tolerance value or + * {@code null} if not specified. + * @param byteTolerance the {@link Byte} tolerance value or {@code null} if + * not specified. + * @param dateTolerance the {@link Date} tolerance value or {@code null} if + * not specified. + * @param doubleTolerance the {@link Double} tolerance value or {@code null} + * if not specified. + * @param floatTolerance the {@link Float} tolerance value or {@code null} + * if not specified. + * @param integerTolerance the {@link Integer} tolerance value or + * {@code null} if not specified. + * @param longTolerance the {@link Long} tolerance value or {@code null} if + * not specified. + * @param shortTolerance the {@link Short} tolerance value or {@code null} + * if not specified. + * @param stringTolerance the {@link String} tolerance value or {@code null} + * if not specified. + * @param uriTolerance the {@link URI} tolerance value or {@code null} if + * not specified. + * @param equivalentTermsMap the {@link Map} of terms that are considered + * equivalent to each other. (not {@code null}) + * @param isDetectionEnabled {@code true} to enable detection. {@code false} + * to disable detection. 
+ */ + public DuplicateDataConfig(final Tolerance booleanTolerance, final Tolerance byteTolerance, + final Tolerance dateTolerance, final Tolerance doubleTolerance, final Tolerance floatTolerance, + final Tolerance integerTolerance, final Tolerance longTolerance, final Tolerance shortTolerance, + final Tolerance stringTolerance, final Tolerance uriTolerance, final Map<String, List<String>> equivalentTermsMap, + final boolean isDetectionEnabled) + { + init(booleanTolerance, byteTolerance, dateTolerance, doubleTolerance, floatTolerance, integerTolerance, longTolerance, shortTolerance, stringTolerance, uriTolerance, equivalentTermsMap, isDetectionEnabled); + } + + private void init(final Tolerance booleanTolerance, final Tolerance byteTolerance, + final Tolerance dateTolerance, final Tolerance doubleTolerance, final Tolerance floatTolerance, + final Tolerance integerTolerance, final Tolerance longTolerance, final Tolerance shortTolerance, + final Tolerance stringTolerance, final Tolerance uriTolerance, final Map<String, List<String>> equivalentTermsMap, + final boolean isDetectionEnabled) + { + this.booleanTolerance = booleanTolerance; + this.byteTolerance = byteTolerance; + this.dateTolerance= dateTolerance; + this.doubleTolerance = doubleTolerance; + this.floatTolerance = floatTolerance; + this.integerTolerance = integerTolerance; + this.longTolerance = longTolerance; + this.shortTolerance = shortTolerance; + this.stringTolerance = stringTolerance; + this.uriTolerance = uriTolerance; + this.equivalentTermsMap = requireNonNull(equivalentTermsMap); + this.isDetectionEnabled = isDetectionEnabled; + } + + private static Tolerance parseTolerance(final String key, final XMLConfiguration xmlConfig) throws ConfigurationException { + final String type = xmlConfig.getString(key + ".type", null); + final ToleranceType toleranceType = ToleranceType.getToleranceTypeByName(type); + Double doubleValue = null; + if (toleranceType != null) { + switch (toleranceType) { + case 
PERCENTAGE: + final String value = xmlConfig.getString(key + ".value", null); + if (value != null && value.contains("%")) { + try { + final Number number = NumberFormat.getPercentInstance().parse(value); + doubleValue = number.doubleValue(); + } catch (final ParseException e) { + throw new ConfigurationException(e); + } + } else { + doubleValue = xmlConfig.getDouble(key + ".value", null); + } + if (doubleValue != null) { + if (doubleValue < 0) { + throw new ConfigurationException("The " + toleranceType + " tolerance type for \"" + key + "\" must be a positive value. Found this value: " + doubleValue); + } + if (doubleValue > 1) { + throw new ConfigurationException("The " + toleranceType + " tolerance type for \"" + key + "\" can NOT be greater than 100%. Found this value: " + doubleValue); + } + } + break; + case DIFFERENCE: + doubleValue = xmlConfig.getDouble(key + ".value", null); + if (doubleValue != null && doubleValue < 0) { + throw new ConfigurationException("The " + toleranceType + " tolerance type for \"" + key + "\" must be a positive value. 
Found this value: " + doubleValue); + } + break; + default: + throw new ConfigurationException("Unknown Tolerance Type specified in config for <" + type + ">: " + toleranceType); + } + if (doubleValue != null) { + return new Tolerance(doubleValue, toleranceType); + } + } + return null; + } + + private static Map<String, List<String>> parseEquivalentTermsMap(final XMLConfiguration xmlConfig) { + final Map<String, List<String>> equivalentTermsMap = new LinkedHashMap<>(); + final Object prop = xmlConfig.getProperty("termMappings.termMapping.term"); + if (prop != null) { + if (prop instanceof Collection) { + final int size = ((Collection<?>) prop).size(); + for (int i = 0; i < size; i++) { + final String termElement = "termMappings.termMapping(" + i + ")"; + parseTermMapping(termElement, xmlConfig, equivalentTermsMap); + } + } else { + final String termElement = "termMappings.termMapping"; + parseTermMapping(termElement, xmlConfig, equivalentTermsMap); + } + } + return equivalentTermsMap; + } + + /** + * Parses a single term mapping element and, when it defines a term with at + * least one equivalent, adds the mapping to the supplied map. + * @param termElement the configuration key of the term mapping element. + * @param xmlConfig the {@link XMLConfiguration} to read from. + * @param equivalentTermsMap the {@link Map} that receives the parsed + * mapping. + */ + private static void parseTermMapping(final String termElement, final XMLConfiguration xmlConfig, final Map<String, List<String>> equivalentTermsMap) { + final String term = xmlConfig.getString(termElement + ".term"); + // Use getProperty() here: getString() always returns a single String, + // which would hide every equivalent after the first one. + final Object equivalentProp = xmlConfig.getProperty(termElement + ".equivalents.equivalent"); + if (equivalentProp instanceof Collection) { + final int equivalentSize = ((Collection<?>) equivalentProp).size(); + if (term != null && equivalentSize > 1) { + final List<String> equivalents = new ArrayList<>(); + for (int j = 0; j < equivalentSize; j++) { + final String equivalent = xmlConfig.getString(termElement + ".equivalents.equivalent(" + j + ")"); + if (equivalent != null) { + equivalents.add(equivalent); + } + } + equivalentTermsMap.put(term, equivalents); + } + } else { + final List<String> equivalents = new ArrayList<>(); + final String equivalent = xmlConfig.getString(termElement + ".equivalents.equivalent"); + if (equivalent != null) { +
equivalents.add(equivalent); + if (term != null) { + equivalentTermsMap.put(term, equivalents); + } + } + } + } + + /** + * @return the {@link Boolean} tolerance value or {@code null} if not + * specified. + */ + public Tolerance getBooleanTolerance() { + return booleanTolerance; + } + + /** + * @return the {@link Byte} tolerance value or {@code null} if not + * specified. + */ + public Tolerance getByteTolerance() { + return byteTolerance; + } + + /** + * @return the {@link Date} tolerance value or {@code null} if not + * specified. + */ + public Tolerance getDateTolerance() { + return dateTolerance; + } + + /** + * @return the {@link Double} tolerance value or {@code null} if not + * specified. + */ + public Tolerance getDoubleTolerance() { + return doubleTolerance; + } + + /** + * @return the {@link Float} tolerance value or {@code null} if not + * specified. + */ + public Tolerance getFloatTolerance() { + return floatTolerance; + } + + /** + * @return the {@link Integer} tolerance value or {@code null} if not + * specified. + */ + public Tolerance getIntegerTolerance() { + return integerTolerance; + } + + /** + * @return the {@link Long} tolerance value or {@code null} if not + * specified. + */ + public Tolerance getLongTolerance() { + return longTolerance; + } + + /** + * @return the {@link Short} tolerance value or {@code null} if not + * specified. + */ + public Tolerance getShortTolerance() { + return shortTolerance; + } + + /** + * @return the {@link String} tolerance value or {@code null} if not + * specified. + */ + public Tolerance getStringTolerance() { + return stringTolerance; + } + + /** + * @return the {@link URI} tolerance value or {@code null} if not specified. + */ + public Tolerance getUriTolerance() { + return uriTolerance; + } + + /** + * @return the {@link Map} of terms that are considered equivalent to each + * other. 
+ */ + public Map<String, List<String>> getEquivalentTermsMap() { + return equivalentTermsMap; + } + + /** + * @return {@code true} to enable detection. {@code false} to disable + * detection. + */ + public boolean isDetectionEnabled() { + return isDetectionEnabled; + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-rya/blob/b319365e/extras/indexing/src/test/java/org/apache/rya/indexing/mongo/MongoDbSmartUriTest.java ---------------------------------------------------------------------- diff --git a/extras/indexing/src/test/java/org/apache/rya/indexing/mongo/MongoDbSmartUriTest.java b/extras/indexing/src/test/java/org/apache/rya/indexing/mongo/MongoDbSmartUriTest.java index 60efbed..dff271f 100644 --- a/extras/indexing/src/test/java/org/apache/rya/indexing/mongo/MongoDbSmartUriTest.java +++ b/extras/indexing/src/test/java/org/apache/rya/indexing/mongo/MongoDbSmartUriTest.java @@ -245,7 +245,6 @@ public class MongoDbSmartUriTest { final Entity resultEntity = SmartUriAdapter.deserializeUriEntity(smartUri); System.out.println(resultEntity); assertEquals(BOB_ENTITY.getSubject(), resultEntity.getSubject()); - //assertTrue(Paths.get(BOB_ENTITY.getSubject().getData()).equals(Paths.get(resultEntity.getSubject().getData()))); } @Test
