Eileen has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/374951 )
Change subject: Resolve diacritic merge conflicts. ...................................................................... Resolve diacritic merge conflicts. Use script from http://php.net/manual/en/normalizer.normalize.php#92592 to identify where names are equivalent and failing to merge on diacritic (not English) characters. If they are the same we use the php function to determine which one does not contain diacritics. If they both do we don't make a decision (just take the second). This feels like a rare issue & one where it would be hard to choose correctly given that the user has entered both at different times, but also one where we don't lose much by randomly choosing one or the other of their name-entry efforts. Note doing this makes me sad. I'm not enjoying adding yet more to this function. Nice to get a quick win, but I think the merge handling code should be moved out into an extension in the next few iterations. Am unsure how many challenges remain on the test side doing that though. Bug: T149763 Change-Id: I7d91ec3afc21515e2eede2777b304a4155bfd0f3 --- M sites/all/modules/wmf_civicrm/tests/phpunit/MergeTest.php M sites/all/modules/wmf_civicrm/wmf_civicrm.module 2 files changed, 148 insertions(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/wikimedia/fundraising/crm refs/changes/51/374951/1 diff --git a/sites/all/modules/wmf_civicrm/tests/phpunit/MergeTest.php b/sites/all/modules/wmf_civicrm/tests/phpunit/MergeTest.php index f9ef27e..69c1dd3 100644 --- a/sites/all/modules/wmf_civicrm/tests/phpunit/MergeTest.php +++ b/sites/all/modules/wmf_civicrm/tests/phpunit/MergeTest.php @@ -524,6 +524,28 @@ } /** + * Make sure José whomps Jose. + * + * Test diacritic matches are resolved to the one using 'authentic' characters. + */ + public function testBatchMergeConflictNameDiacritic() { + $this->callAPISuccess('Contact', 'create', array('id' => $this->contactID, 'first_name' => 'Jose', 'last_name' => 'Duck')); + // This will merge. 
+ $this->callAPISuccess('Contact', 'create', array('id' => $this->contactID2, 'first_name' => 'José', 'last_name' => 'duck')); + // This will merge. + $this->breedDuck(array('first_name' => 'Josè', 'last_name' => 'Duck')); + // This will not merge. + $this->breedDuck(array('first_name' => 'Josà', 'last_name' => 'Duck')); + $result = $this->callAPISuccess('Job', 'process_batch_merge', array('mode' => 'safe')); + $this->assertEquals(1, count($result['values']['skipped'])); + $this->assertEquals(2, count($result['values']['merged'])); + + $contacts = $this->callAPISuccess('Contact', 'get', array('email' => '[email protected]', 'sequential' => 1)); + $this->assertEquals('Josà', $contacts['values'][1]['first_name']); + $this->assertEquals('Josè', $contacts['values'][0]['first_name']); + } + + /** * Get address combinations for the merge test. * * @return array @@ -1175,7 +1197,11 @@ * Also get rid of the nest. */ protected function doDuckHunt() { - CRM_Core_DAO::executeQuery('DELETE FROM civicrm_contact WHERE display_name = "Donald Duck"'); + CRM_Core_DAO::executeQuery(' + DELETE c, e + FROM civicrm_contact c + LEFT JOIN civicrm_email e ON e.contact_id = c.id + WHERE display_name = "Donald Duck" OR email = "[email protected]"'); CRM_Core_DAO::executeQuery('DELETE FROM civicrm_prevnext_cache'); } diff --git a/sites/all/modules/wmf_civicrm/wmf_civicrm.module b/sites/all/modules/wmf_civicrm/wmf_civicrm.module index 81f2be6..3493c0b 100644 --- a/sites/all/modules/wmf_civicrm/wmf_civicrm.module +++ b/sites/all/modules/wmf_civicrm/wmf_civicrm.module @@ -2350,6 +2350,15 @@ continue; } + if (_wmf_civicrm_merge_resolve_diacritic_conflict( + str_replace('move_', '', $moveField), + $refs['migration_info'][$moveField], + $refs['migration_info']['rows'][$moveField]['other'], + $refs['migration_info']['rows'][$moveField]['main'])) { + unset($refs['fields_in_conflict'][$moveField]); + continue; + } + if (substr($moveField, 0, 13) === 'move_location') { if 
(!isset($refs['is_major_gift'])) { $refs['is_major_gift'] = _wmf_civicrm_is_merged_contact_major_donor($mainId, $otherId); @@ -2455,6 +2464,118 @@ } /** + * Resolve conflicts that are only about diacritic vs English characters or casing. + * + * If we have a conflict of 'Jose' vs 'José' choose the one with a diacritic character. + * + * If we find a match we need to alter the $moveFieldValue & $valueToKeep vars. + * That is not hugely logical but unfortunately the hook interaction has been organic + * and until we started using batch merge was not actually tested in core. + * + * @param string $fieldName + * @param string $moveFieldValue + * @param string $valueToKeep + * @param string $valueToOverwrite + * + * @return bool + */ +function _wmf_civicrm_merge_resolve_diacritic_conflict($fieldName, &$moveFieldValue, &$valueToKeep, &$valueToOverwrite) { + $fieldsToResolve = array('last_name', 'first_name'); + if (!in_array(str_replace('move_', '', $fieldName), $fieldsToResolve)) { + return FALSE; + } + // Let's get rid of any preceding or trailing spaces. + $valueToKeep = trim($valueToKeep); + $valueToOverwrite = trim($valueToOverwrite); + $moveFieldValue = trim($moveFieldValue); + if (strtoupper(wmf_civicrm_normalizeUtf8String($valueToKeep)) != strtoupper(wmf_civicrm_normalizeUtf8String($valueToOverwrite))) { + return FALSE; + } + + if (Normalizer::isNormalized($valueToOverwrite)) { + // We have a conflict to resolve and we are going to resolve it + // by keeping the default copy behaviour, since the value to overwrite is the one without diacritics + // (AKA isNormalised). + return TRUE; + } + // We have a conflict to resolve and we are going to resolve it by keeping + // the value that was to be overwritten. 
+ $moveFieldValue = $valueToOverwrite; + $valueToKeep = $valueToOverwrite; + return TRUE; +} + +/** + * From http://nz2.php.net/manual/en/normalizer.normalize.php + * + * @param $original_string + * @return mixed + */ +function wmf_civicrm_normalizeUtf8String( $original_string) +{ + + // maps German (umlauts) and other European characters onto two characters before just removing diacritics + $s = preg_replace( '@\x{00c4}@u' , "AE", $original_string ); // umlaut Ä => AE + $s = preg_replace( '@\x{00d6}@u' , "OE", $s ); // umlaut Ö => OE + $s = preg_replace( '@\x{00dc}@u' , "UE", $s ); // umlaut Ü => UE + $s = preg_replace( '@\x{00e4}@u' , "ae", $s ); // umlaut ä => ae + $s = preg_replace( '@\x{00f6}@u' , "oe", $s ); // umlaut ö => oe + $s = preg_replace( '@\x{00fc}@u' , "ue", $s ); // umlaut ü => ue + $s = preg_replace( '@\x{00f1}@u' , "ny", $s ); // ñ => ny + $s = preg_replace( '@\x{00ff}@u' , "yu", $s ); // ÿ => yu + + + // maps special characters (characters with diacritics) on their base-character followed by the diacritical mark + // example: Ú => U´, á => a` + $s = Normalizer::normalize( $s, Normalizer::FORM_D ); + + + $s = preg_replace( '@\pM@u' , "", $s ); // removes diacritics + + + $s = preg_replace( '@\x{00df}@u' , "ss", $s ); // maps German ß onto ss + $s = preg_replace( '@\x{00c6}@u' , "AE", $s ); // Æ => AE + $s = preg_replace( '@\x{00e6}@u' , "ae", $s ); // æ => ae + $s = preg_replace( '@\x{0132}@u' , "IJ", $s ); // Ĳ => IJ + $s = preg_replace( '@\x{0133}@u' , "ij", $s ); // ĳ 
=> ij + $s = preg_replace( '@\x{0152}@u' , "OE", $s ); // Œ => OE + $s = preg_replace( '@\x{0153}@u' , "oe", $s ); // œ => oe + + $s = preg_replace( '@\x{00d0}@u' , "D", $s ); // Ð => D + $s = preg_replace( '@\x{0110}@u' , "D", $s ); // Đ => D + $s = preg_replace( '@\x{00f0}@u' , "d", $s ); // ð => d + $s = preg_replace( '@\x{0111}@u' , "d", $s ); // đ => d + $s = preg_replace( '@\x{0126}@u' , "H", $s ); // Ħ => H + $s = preg_replace( '@\x{0127}@u' , "h", $s ); // ħ => h + $s = preg_replace( '@\x{0131}@u' , "i", $s ); // ı => i + $s = preg_replace( '@\x{0138}@u' , "k", $s ); // ĸ => k + $s = preg_replace( '@\x{013f}@u' , "L", $s ); // Ŀ => L + $s = preg_replace( '@\x{0141}@u' , "L", $s ); // Ł => L + $s = preg_replace( '@\x{0140}@u' , "l", $s ); // ŀ => l + $s = preg_replace( '@\x{0142}@u' , "l", $s ); // ł => l + $s = preg_replace( '@\x{014a}@u' , "N", $s ); // Ŋ => N + $s = preg_replace( '@\x{0149}@u' , "n", $s ); // ŉ => n + $s = preg_replace( '@\x{014b}@u' , "n", $s ); // ŋ => n + $s = preg_replace( '@\x{00d8}@u' , "O", $s ); // Ø => O + $s = preg_replace( '@\x{00f8}@u' , "o", $s ); // ø => o + $s = preg_replace( '@\x{017f}@u' , "s", $s ); // ſ => s + $s = preg_replace( '@\x{00de}@u' , "T", $s ); // Þ => T + $s = preg_replace( '@\x{0166}@u' , "T", $s ); // Ŧ => T + $s = preg_replace( '@\x{00fe}@u' , "t", $s ); // þ => t + $s = preg_replace( '@\x{0167}@u' , "t", $s ); // ŧ => t + + // remove all non-ASCII characters + $s = preg_replace( '@[^\0-\x80]@u' , "", $s ); + + + // possible errors in UTF8-regular-expressions + if (empty($s)) + return $original_string; + else + return $s; +} + +/** * Resolve conflicts on preferred language. * * If the underlying language (first 2 letters) is the same we make a value choice. 
-- To view, visit https://gerrit.wikimedia.org/r/374951 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I7d91ec3afc21515e2eede2777b304a4155bfd0f3 Gerrit-PatchSet: 1 Gerrit-Project: wikimedia/fundraising/crm Gerrit-Branch: master Gerrit-Owner: Eileen <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
