Dominic.sauer has uploaded a new change for review.
https://gerrit.wikimedia.org/r/215008
Change subject: Add Levenshtein distance and data cleaning
......................................................................
Add Levenshtein distance and data cleaning
Change-Id: Ibe46d7b013f024c8749514f2115bbbad77926da6
---
M includes/CrossCheck/Comparer/StringComparer.php
M tests/phpunit/CrossCheck/Comparer/StringComparerTest.php
2 files changed, 146 insertions(+), 22 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikidataQualityExternalValidation refs/changes/08/215008/1
diff --git a/includes/CrossCheck/Comparer/StringComparer.php
b/includes/CrossCheck/Comparer/StringComparer.php
index fccf16d..8e5ffb6 100755
--- a/includes/CrossCheck/Comparer/StringComparer.php
+++ b/includes/CrossCheck/Comparer/StringComparer.php
@@ -15,7 +15,7 @@
class StringComparer {
/**
- * Threshold for matching compliance in prefix/postfix similarity checks
+ * Threshold for matching compliance in prefix/suffix similarity checks
*/
const SIMILARITY_THRESHOLD = 0.8;
@@ -29,6 +29,8 @@
public function compare( $value, $comparativeValue ) {
$this->assertIsString( $value, '$value' );
$this->assertIsString( $comparativeValue, '$comparativeValue' );
+ $value = $this->cleanDataString( $value );
+ $comparativeValue = $this->cleanDataString( $comparativeValue );
if ( $value === $comparativeValue ) {
return CompareResult::STATUS_MATCH;
@@ -49,6 +51,8 @@
public function compareArray( $value, array $comparativeValues ) {
$this->assertIsString( $value, '$value' );
$this->assertIsArrayOfStrings( $comparativeValues,
'$comparativeValues' );
+ $value = $this->cleanDataString($value);
+ $comparativeValues = $this->cleanDataArray( $comparativeValues );
if ( in_array( $value, $comparativeValues ) ) {
return CompareResult::STATUS_MATCH;
@@ -73,6 +77,8 @@
public function compareArrays( array $values, array $comparativeValues ) {
$this->assertIsArrayOfStrings( $values, '$values' );
$this->assertIsArrayOfStrings( $comparativeValues,
'$comparativeValues' );
+ $values = $this->cleanDataArray( $values );
+ $comparativeValues = $this->cleanDataArray( $comparativeValues );
if ( count( array_intersect( $values, $comparativeValues ) ) > 0 ) {
return CompareResult::STATUS_MATCH;
@@ -114,7 +120,7 @@
}
/**
- * Checks the similarity of two strings by prefix/postfix check.
+ * Checks the similarity of two strings by prefix/suffix check.
*
* @param string $value
* @param string $comparativeValue
@@ -123,24 +129,52 @@
private function checkSimilarity( $value, $comparativeValue ) {
return
$this->percentagePrefixSimilarity( $value, $comparativeValue ) >
self::SIMILARITY_THRESHOLD ||
- $this->percentagePostfixSimilarity( $value, $comparativeValue ) >
self::SIMILARITY_THRESHOLD;
+ $this->percentageSuffixSimilarity( $value, $comparativeValue ) >
self::SIMILARITY_THRESHOLD ||
+ $this->percentageLevenshteinDistance( $value, $comparativeValue )
> self::SIMILARITY_THRESHOLD;
+ }
+
+ /**
+ * Returns a cleaned copy of the given string: leading/trailing whitespace is trimmed and the result is converted to lowercase.
+ *
+ * @param string $value
+ *
+ * @return string
+ */
+ private function cleanDataString( $value ) {
+ $value = trim( $value );
+
+ return strtolower( $value );
+ }
+
+ /**
+ * Returns a cleaned copy of the given array of strings: each element has leading/trailing whitespace trimmed and is converted to lowercase.
+ *
+ * @param array $array
+ *
+ * @return string[]
+ */
+ private function cleanDataArray( array $array ) {
+
+ return array_map(
+ array( $this, 'cleanDataString'),
+ $array );
}
/**
* Returns percentage of local value prefix-matching the external values.
*
- * @param localValue - value to prefix-match with external value
- * @param $externalValue - value to prefix-match with local value
+ * @param $value - value to prefix-match with external value
+ * @param $comparativeValue - value to prefix-match with local value
*
* @return float
*/
- private function percentagePrefixSimilarity( $localValue, $externalValue )
{
+ private function percentagePrefixSimilarity( $value, $comparativeValue ) {
$prefixLength = 0; // common prefix length
- $localLength = strlen( $localValue );
- $externalLength = strlen( $externalValue );
+ $localLength = strlen( $value );
+ $externalLength = strlen( $comparativeValue );
while ( $prefixLength < $localLength ) {
- $c = $localValue[$prefixLength];
- if ( $externalLength > $prefixLength &&
$externalValue[$prefixLength] !== $c ) {
+ $c = $value[$prefixLength];
+ if ( $externalLength > $prefixLength &&
$comparativeValue[$prefixLength] !== $c ) {
break;
}
$prefixLength++;
@@ -150,25 +184,42 @@
}
/**
- * Returns percentage of local value postfix-matching the external values.
+ * Returns percentage of local value suffix-matching the external values.
*
- * @param $localValue - value to postfix-match with local value
- * @param $externalValue - value to postfix-match with external value
+ * @param $value - local value to suffix-match with the external value
+ * @param $comparativeValue - external value to suffix-match with the local value
*
* @return float
*/
- private function percentagePostfixSimilarity( $localValue, $externalValue
) {
- $postfixLength = 0; // common postfix length
- $localLength = strlen( $localValue );
- $externalLength = strlen( $externalValue );
- while ( $postfixLength < $localLength ) {
- $c = $localValue[$localLength - 1 - $postfixLength];
- if ( $externalLength > $postfixLength &&
$externalValue[$externalLength - 1 - $postfixLength] !== $c ) {
+ private function percentageSuffixSimilarity( $value, $comparativeValue ) {
+ $suffixLength = 0; // common suffix length
+ $localLength = strlen( $value );
+ $externalLength = strlen( $comparativeValue );
+ while ( $suffixLength < $localLength ) {
+ $c = $value[$localLength - 1 - $suffixLength];
+ if ( $externalLength > $suffixLength &&
$comparativeValue[$externalLength - 1 - $suffixLength] !== $c ) {
break;
}
- $postfixLength++;
+ $suffixLength++;
}
- return $postfixLength / $externalLength;
+ return $suffixLength / $externalLength;
}
+
+ /**
+ * Returns the similarity of two strings as a fraction (1.0 means identical), computed as 1 minus the Levenshtein distance divided by the length of the longer string.
+ *
+ * @param $value
+ * @param $comparativeValue
+ *
+ * @return float
+ */
+ private function percentageLevenshteinDistance( $value, $comparativeValue
) {
+ $distance = levenshtein( $value, $comparativeValue );
+ $percentage = 1.0 - $distance/max( strlen( $value ), strlen(
$comparativeValue ) );
+
+ return $percentage;
+ }
+
+
}
\ No newline at end of file
diff --git a/tests/phpunit/CrossCheck/Comparer/StringComparerTest.php
b/tests/phpunit/CrossCheck/Comparer/StringComparerTest.php
index 18965f8..c70804d 100755
--- a/tests/phpunit/CrossCheck/Comparer/StringComparerTest.php
+++ b/tests/phpunit/CrossCheck/Comparer/StringComparerTest.php
@@ -53,6 +53,37 @@
),
array(
'foobar',
+ 'Foobar',
+ CompareResult::STATUS_MATCH
+ ),
+ array(
+ 'foobar',
+ 'foObar',
+ CompareResult::STATUS_MATCH
+ ),
+ array(
+ 'foobar',
+ 'FOOBAR',
+ CompareResult::STATUS_MATCH
+ ),
+ array(
+ 'foobar',
+ ' foobar',
+ CompareResult::STATUS_MATCH
+ ),
+ array(
+ 'foobar',
+ 'foobar ',
+ CompareResult::STATUS_MATCH
+ ),
+ array(
+ ' foobar',
+ ' foobar ',
+ CompareResult::STATUS_MATCH
+ ),
+ // prefix/suffix partial match
+ array(
+ 'foobar',
'foobaz',
CompareResult::STATUS_PARTIAL_MATCH
),
@@ -81,6 +112,28 @@
'oobar',
CompareResult::STATUS_PARTIAL_MATCH
),
+ // levenshtein partial match
+ array(
+ 'foobar',
+ 'fooobar',
+ CompareResult::STATUS_PARTIAL_MATCH
+ ),
+ array(
+ 'fobar',
+ 'foobar',
+ CompareResult::STATUS_PARTIAL_MATCH
+ ),
+ array(
+ 'foubar',
+ 'foobar',
+ CompareResult::STATUS_PARTIAL_MATCH
+ ),
+ array(
+ 'Schlossstraße',
+ 'Schloßstraße',
+ CompareResult::STATUS_PARTIAL_MATCH
+ ),
+ // mismatches
array(
'fo',
'foobar',
@@ -153,6 +206,16 @@
array(
'foobar',
array( 'fo', 'foobar' ),
+ CompareResult::STATUS_MATCH
+ ),
+ array(
+ 'foobar',
+ array( 'fo', 'FOOBAR' ),
+ CompareResult::STATUS_MATCH
+ ),
+ array(
+ 'foobar',
+ array( 'fo', ' FOOBAR ' ),
CompareResult::STATUS_MATCH
),
array(
@@ -235,6 +298,16 @@
CompareResult::STATUS_PARTIAL_MATCH
),
array(
+ array( 'foobar', 'fubar' ),
+ array( 'bar', 'FOOBAR' ),
+ CompareResult::STATUS_MATCH
+ ),
+ array(
+ array( 'foobar', 'fubar' ),
+ array( 'bar', ' FOOBAR ' ),
+ CompareResult::STATUS_MATCH
+ ),
+ array(
array( 'foobar', 'foo' ),
array( 'fubar', 'baz' ),
CompareResult::STATUS_MISMATCH
--
To view, visit https://gerrit.wikimedia.org/r/215008
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ibe46d7b013f024c8749514f2115bbbad77926da6
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/WikidataQualityExternalValidation
Gerrit-Branch: master
Gerrit-Owner: Dominic.sauer <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits