Eileen has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/374951 )

Change subject: Resolve diacritic merge conflicts.
......................................................................

Resolve diacritic merge conflicts.

Use the script from http://php.net/manual/en/normalizer.normalize.php#92592 to
identify where names are equivalent but are failing to merge because of
diacritic (non-English) characters.

If they are the same, we use the PHP function to determine which one does not
contain diacritics. If they both do, we don't make a decision (we just take the
second). This feels like a rare issue, and one where it would be hard to choose
correctly given that the user has entered both at different times, but also one
where we don't lose much by randomly choosing one or the other of their
name-entry efforts.

Note: doing this makes me sad. I'm not enjoying adding yet more to this function.

It is nice to get a quick win, but I think the merge handling code should be
moved out into an extension in the next few iterations. I am unsure how many
challenges remain on the test side in doing that, though.

Bug: T149763

Change-Id: I7d91ec3afc21515e2eede2777b304a4155bfd0f3
---
M sites/all/modules/wmf_civicrm/tests/phpunit/MergeTest.php
M sites/all/modules/wmf_civicrm/wmf_civicrm.module
2 files changed, 148 insertions(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikimedia/fundraising/crm 
refs/changes/51/374951/1

diff --git a/sites/all/modules/wmf_civicrm/tests/phpunit/MergeTest.php 
b/sites/all/modules/wmf_civicrm/tests/phpunit/MergeTest.php
index f9ef27e..69c1dd3 100644
--- a/sites/all/modules/wmf_civicrm/tests/phpunit/MergeTest.php
+++ b/sites/all/modules/wmf_civicrm/tests/phpunit/MergeTest.php
@@ -524,6 +524,28 @@
   }
 
   /**
+   * Make sure José whomps Jose.
+   *
+   * Test diacritic matches are resolved to the one using 'authentic' 
characters.
+   */
+  public function testBatchMergeConflictNameDiacritic() {
+    $this->callAPISuccess('Contact', 'create', array('id' => $this->contactID, 
'first_name' => 'Jose', 'last_name' => 'Duck'));
+    // This will merge.
+    $this->callAPISuccess('Contact', 'create', array('id' => 
$this->contactID2, 'first_name' => 'José', 'last_name' => 'duck'));
+    // This will merge.
+    $this->breedDuck(array('first_name' => 'Josè', 'last_name' => 'Duck'));
+    // This will not merge.
+    $this->breedDuck(array('first_name' => 'Josà', 'last_name' => 'Duck'));
+    $result = $this->callAPISuccess('Job', 'process_batch_merge', array('mode' 
=> 'safe'));
+    $this->assertEquals(1, count($result['values']['skipped']));
+    $this->assertEquals(2, count($result['values']['merged']));
+
+    $contacts = $this->callAPISuccess('Contact', 'get', array('email' => 
'[email protected]', 'sequential' => 1));
+    $this->assertEquals('Josà', $contacts['values'][1]['first_name']);
+    $this->assertEquals('Josè', $contacts['values'][0]['first_name']);
+  }
+
+  /**
    * Get address combinations for the merge test.
    *
    * @return array
@@ -1175,7 +1197,11 @@
    * Also get rid of the nest.
    */
   protected function doDuckHunt() {
-    CRM_Core_DAO::executeQuery('DELETE FROM civicrm_contact WHERE display_name 
= "Donald Duck"');
+    CRM_Core_DAO::executeQuery('
+      DELETE c, e
+      FROM civicrm_contact c
+      LEFT JOIN civicrm_email e ON e.contact_id = c.id
+      WHERE display_name = "Donald Duck" OR email = "[email protected]"');
     CRM_Core_DAO::executeQuery('DELETE FROM civicrm_prevnext_cache');
   }
 
diff --git a/sites/all/modules/wmf_civicrm/wmf_civicrm.module 
b/sites/all/modules/wmf_civicrm/wmf_civicrm.module
index 81f2be6..3493c0b 100644
--- a/sites/all/modules/wmf_civicrm/wmf_civicrm.module
+++ b/sites/all/modules/wmf_civicrm/wmf_civicrm.module
@@ -2350,6 +2350,15 @@
                   continue;
               }
 
+              if (_wmf_civicrm_merge_resolve_diacritic_conflict(
+                str_replace('move_', '', $moveField),
+                $refs['migration_info'][$moveField],
+                $refs['migration_info']['rows'][$moveField]['other'],
+                $refs['migration_info']['rows'][$moveField]['main'])) {
+                unset($refs['fields_in_conflict'][$moveField]);
+                continue;
+              }
+
               if (substr($moveField, 0, 13) === 'move_location') {
                 if (!isset($refs['is_major_gift'])) {
                   $refs['is_major_gift'] = 
_wmf_civicrm_is_merged_contact_major_donor($mainId, $otherId);
@@ -2455,6 +2464,118 @@
 }
 
 /**
+ * Resolve conflicts that are only about diacritic vs english characters 
casing.
+ *
+ * If we have a conflict of 'Jose' vs 'José' choose the one with a diacritic 
character.
+ *
+ * If we find a match we need to alter the $moveFieldValue & $valueToKeep vars.
+ * That is not hugely logical but unfortunately the hook interaction has been 
organic
+ * and until we started using batch merge was not actually tested in core.
+ *
+ * @param string $fieldName
+ * @param string $moveFieldValue
+ * @param string $valueToKeep
+ * @param string $valueToOverwrite
+ *
+ * @return bool
+ */
+function _wmf_civicrm_merge_resolve_diacritic_conflict($fieldName, 
&$moveFieldValue, &$valueToKeep, &$valueToOverwrite) {
+  $fieldsToResolve = array('last_name', 'first_name');
+  if (!in_array(str_replace('move_', '', $fieldName), $fieldsToResolve)) {
+    return FALSE;
+  }
+  // Let's get rid of any preceding or trailing spaces.
+  $valueToKeep = trim($valueToKeep);
+  $valueToOverwrite = trim($valueToOverwrite);
+  $moveFieldValue = trim($moveFieldValue);
+  if (strtoupper(wmf_civicrm_normalizeUtf8String($valueToKeep)) != 
strtoupper(wmf_civicrm_normalizeUtf8String($valueToOverwrite))) {
+    return FALSE;
+  }
+
+  if (Normalizer::isNormalized($valueToOverwrite)) {
+    // We have a conflict to resolve and we are going to resolve it
+    // by keeping the default copy behaviour, since the value to overwrite is 
the one without diacritics
+    // (AKA isNormalised).
+    return TRUE;
+  }
+  // We have a conflict to resolve and we are going to resolve it by keeping
+  //the value that was to be overwritten.
+  $moveFieldValue = $valueToOverwrite;
+  $valueToKeep = $valueToOverwrite;
+  return TRUE;
+}
+
+/**
+ * From http://nz2.php.net/manual/en/normalizer.normalize.php
+ *
+ * @param $original_string
+ * @return mixed
+ */
+function wmf_civicrm_normalizeUtf8String( $original_string)
+{
+
+  // maps German (umlauts) and other European characters onto two characters 
before just removing diacritics
+  $s    = preg_replace( '@\x{00c4}@u'    , "AE",    $original_string );    // 
umlaut Ä => AE
+  $s    = preg_replace( '@\x{00d6}@u'    , "OE",    $s );    // umlaut Ö => OE
+  $s    = preg_replace( '@\x{00dc}@u'    , "UE",    $s );    // umlaut Ü => UE
+  $s    = preg_replace( '@\x{00e4}@u'    , "ae",    $s );    // umlaut ä => ae
+  $s    = preg_replace( '@\x{00f6}@u'    , "oe",    $s );    // umlaut ö => oe
+  $s    = preg_replace( '@\x{00fc}@u'    , "ue",    $s );    // umlaut ü => ue
+  $s    = preg_replace( '@\x{00f1}@u'    , "ny",    $s );    // ñ => ny
+  $s    = preg_replace( '@\x{00ff}@u'    , "yu",    $s );    // ÿ => yu
+
+
+  // maps special characters (characters with diacritics) on their 
base-character followed by the diacritical mark
+  // example:  Ú => U´,  á => a`
+  $s    = Normalizer::normalize( $s, Normalizer::FORM_D );
+
+
+  $s    = preg_replace( '@\pM@u'        , "",    $s );    // removes diacritics
+
+
+  $s    = preg_replace( '@\x{00df}@u'    , "ss",    $s );    // maps German ß 
onto ss
+  $s    = preg_replace( '@\x{00c6}@u'    , "AE",    $s );    // Æ => AE
+  $s    = preg_replace( '@\x{00e6}@u'    , "ae",    $s );    // æ => ae
+  $s    = preg_replace( '@\x{0132}@u'    , "IJ",    $s );    // ? => IJ
+  $s    = preg_replace( '@\x{0133}@u'    , "ij",    $s );    // ? => ij
+  $s    = preg_replace( '@\x{0152}@u'    , "OE",    $s );    // Œ => OE
+  $s    = preg_replace( '@\x{0153}@u'    , "oe",    $s );    // œ => oe
+
+  $s    = preg_replace( '@\x{00d0}@u'    , "D",    $s );    // Ð => D
+  $s    = preg_replace( '@\x{0110}@u'    , "D",    $s );    // Ð => D
+  $s    = preg_replace( '@\x{00f0}@u'    , "d",    $s );    // ð => d
+  $s    = preg_replace( '@\x{0111}@u'    , "d",    $s );    // d => d
+  $s    = preg_replace( '@\x{0126}@u'    , "H",    $s );    // H => H
+  $s    = preg_replace( '@\x{0127}@u'    , "h",    $s );    // h => h
+  $s    = preg_replace( '@\x{0131}@u'    , "i",    $s );    // i => i
+  $s    = preg_replace( '@\x{0138}@u'    , "k",    $s );    // ? => k
+  $s    = preg_replace( '@\x{013f}@u'    , "L",    $s );    // ? => L
+  $s    = preg_replace( '@\x{0141}@u'    , "L",    $s );    // L => L
+  $s    = preg_replace( '@\x{0140}@u'    , "l",    $s );    // ? => l
+  $s    = preg_replace( '@\x{0142}@u'    , "l",    $s );    // l => l
+  $s    = preg_replace( '@\x{014a}@u'    , "N",    $s );    // ? => N
+  $s    = preg_replace( '@\x{0149}@u'    , "n",    $s );    // ? => n
+  $s    = preg_replace( '@\x{014b}@u'    , "n",    $s );    // ? => n
+  $s    = preg_replace( '@\x{00d8}@u'    , "O",    $s );    // Ø => O
+  $s    = preg_replace( '@\x{00f8}@u'    , "o",    $s );    // ø => o
+  $s    = preg_replace( '@\x{017f}@u'    , "s",    $s );    // ? => s
+  $s    = preg_replace( '@\x{00de}@u'    , "T",    $s );    // Þ => T
+  $s    = preg_replace( '@\x{0166}@u'    , "T",    $s );    // T => T
+  $s    = preg_replace( '@\x{00fe}@u'    , "t",    $s );    // þ => t
+  $s    = preg_replace( '@\x{0167}@u'    , "t",    $s );    // t => t
+
+  // remove all non-ASCii characters
+  $s    = preg_replace( '@[^\0-\x80]@u'    , "",    $s );
+
+
+  // possible errors in UTF8-regular-expressions
+  if (empty($s))
+    return $original_string;
+  else
+    return $s;
+}
+
+/**
  * Resolve conflicts on preferred language.
  *
  * If the underlying language (first 2 letters) is the same we make a value 
choice.

-- 
To view, visit https://gerrit.wikimedia.org/r/374951
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I7d91ec3afc21515e2eede2777b304a4155bfd0f3
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/fundraising/crm
Gerrit-Branch: master
Gerrit-Owner: Eileen <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to