Hi Greg,
I should look at my code before I write comments from memory...
The result is a *double *value being the sum of:
· number of times the same letter appears in both strings
· 10 times the number of times the same two letters appears in both
strings
· 100 times the number of times the same three letters appears in
both strings
*Which is then divided by the length of the two strings to sort of
“normalise” the result.*
Mixed case is ignored, only compares letters A-Z and 0-9, everything else
is excluded.
I added a Greg unit test to better show the results which is following…
Regards
Greg Harris
[TestMethod] public void Test_10_Compare3_ForGregKeogh()
{
// 123456789-123456789-123456789-123456789-12456
string lTestLine1 = "Lovelock - Trumpet Concerto (SSO
Concert).mp3";
string lTestLine2 = "Trumpet Concerto (William Lovelock).mp3";
double lExpected = 3033/(36.0 + 33.0); // = 43.9
double lResult = lTestLine1.CompareSoundsLike( lTestLine2 );
Assert.AreEqual<double>( lExpected, lResult );
// This is an example of exactly the same string, so will get the
best posible match
lTestLine1 = "Lovelock - Trumpet Concerto (SSO Concert).mp3";
lTestLine2 = "Lovelock - Trumpet Concerto (SSO Concert).mp3";
lExpected = 5256/(36.0 + 36.0); // = 73.0
lResult = lTestLine1.CompareSoundsLike( lTestLine2 );
Assert.AreEqual<double>( lExpected, lResult );
// This is an example of exactly the same string, with case
difference, which is ignored,
// so will also get the best possible match
lTestLine1 = "Lovelock - Trumpet Concerto (SSO Concert).mp3";
lTestLine2 = "LOVELOCK - TRUMPET CONCERTO (SSO CONCERT).mp3";
lExpected = 5256/(36.0 + 36.0); // = 73.0
lResult = lTestLine1.CompareSoundsLike( lTestLine2 );
Assert.AreEqual<double>( lExpected, lResult );
// This is an example of a spelling/typing mistake, so will get a
very good match
lTestLine1 = "Lovelock - Trumpet Concerto (SSO Concert).mp3";
lTestLine2 = "Lovelock - Trumpet Concerto (SoSo Concert).mp3";
lExpected = 5272/(36.0 + 37.0); // = 72.2
lResult = lTestLine1.CompareSoundsLike( lTestLine2 );
Assert.AreEqual<double>( lExpected, lResult );
// This is an example of a truncation, so will get a poor match
lTestLine1 = "Lovelock - Trumpet Concerto (SSO Concert).mp3";
lTestLine2 = "Lovelock - Trumpet Concerto.mp3";
lExpected = 3237/(36.0 + 26.0); // = 52.2
lResult = lTestLine1.CompareSoundsLike( lTestLine2 );
Assert.AreEqual<double>( lExpected, lResult );
// This will get a match on William and a little else...
lTestLine1 = "Trumpet Concerto (William Lovelock).mp3";
lTestLine2 = "The Complete Works of William Shakespeare.txt";
lExpected = 1202/(33.0 + 39.0); // = 16.69
lResult = lTestLine1.CompareSoundsLike( lTestLine2 );
Assert.AreEqual<double>( lExpected, lResult );
// This will get a match on each of the letters, but no double letters
lTestLine1 = "QWERTY";
lTestLine2 = "ytrewq";
lExpected = 6/(6.0 + 6.0); // = 0.5
lResult = lTestLine1.CompareSoundsLike( lTestLine2 );
Assert.AreEqual<double>( lExpected, lResult );
// This will get a match on nothing
lTestLine1 = "QWERTY";
lTestLine2 = "ASDFGHJKL";
lExpected = 0/(6.0 + 9.0); // = 0.0
lResult = lTestLine1.CompareSoundsLike( lTestLine2 );
Assert.AreEqual<double>( lExpected, lResult );
}
On Sat, Nov 29, 2014 at 10:16 AM, Greg Harris <
[email protected]> wrote:
> Hi Greg,
>
> Please find following what I have used in the past.
> It is very expensive, but I can not see a better way of doing it.
> It returns an integer which is the sum of:
>
> - number of times the same letter appears in both strings
> - 10 times the number of times the same two letters appears in both
> strings
> - 100 times the number of times the same three letters appears in both
> strings
>
> Once you get your results, sort them, the most similar strings will have
> higher results.
> I used this many years ago and not used it since.
> There may be (far) better ways to do this.
>
> Regards
> Greg Harris
>
> public static string CleanStr ( this string aText )
>
> {
>
> int diff = 'A' - 'a';
>
> StringBuilder result = new StringBuilder();
>
> foreach ( char ch in aText )
>
> {
>
> if ( ( ch >= '0' && ch <= '9' )
>
> || ( ch >= 'A' && ch <= 'Z' ) )
>
> {
>
> result.Append(ch);
>
> }
>
> else
>
> {
>
> if ( ch >= 'a' && ch <= 'z' )
>
> {
>
> result.Append((char)(ch+diff));
>
> }
>
>
>
> }
>
> }
>
> return result.ToString();
>
> }
>
> /// <summary>
>
> /// Do a sounds like compare, the higher the result, the more the
> words/phrases sound the same
>
> /// </summary>
>
> /// <param name="aStr1">First word / phrase</param>
>
> /// <param name="aStr2">Second word / phrase</param>
>
> /// <returns>Score</returns>
>
> public static double CompareSoundsLike ( this string aStr1,
> string aStr2 )
>
> {
>
> aStr1 = aStr1.CleanStr();
>
> aStr2 = aStr2.CleanStr();
>
> double result = 0;
>
> for (int i = 0; i < aStr1.Length; i++)
>
> {
>
> char outerChar = aStr1[i];
>
> for (int j = 0; j < aStr2.Length; j++)
>
> {
>
> char innerInner = aStr2[j];
>
> if ( outerChar == innerInner )
>
> {
>
> result++;
>
> if ( ( i < aStr1.Length-1 ) && ( j < aStr2.Length-1 ) && (
> aStr1[i+1] == aStr2[j+1] ) ) result += 10 ;
>
> if ( ( i < aStr1.Length-2 ) && ( j < aStr2.Length-2 ) && (
> aStr1[i+2] == aStr2[j+2] ) ) result += 100;
>
> }
>
> }
>
> }
>
> return result / ( aStr1.Length + aStr2.Length );
>
> }
>
>
>
>
>
>
>
> [TestMethod] public void Test_10_Compare1()
>
> {
>
> // 123456
>
> string lTestLine1 = "qwerty";
>
> string lTestLine2 = "QWERTY";
>
> double lExpected = 456/(6+6);
>
> double lResult = lTestLine1.CompareSoundsLike( lTestLine2 );
>
> Assert.AreEqual<double>( lExpected, lResult );
>
> }
>
> [TestMethod] public void Test_10_Compare2()
>
> {
>
> //
> 123456789-123456789-123456789-123456789-12xxx
>
> string lTestLine1 = "The quick brown fox jumped over the !@#$
> dog!";
>
> string lTestLine2 = "T H E - Q U I C K - B R O W N - F O
> X - J U M P E D - O V E R - T H E - D O G";
>
> double lExpected = 3856.0/(32.0 + 32.0);
>
> double lResult = lTestLine1.CompareSoundsLike( lTestLine2 );
>
> Assert.AreEqual<double>( lExpected, lResult );
>
> }
>
>
>
>
> On Sat, Nov 29, 2014 at 9:46 AM, Greg Keogh <[email protected]> wrote:
>
>> Folks, I was about this write some utility code to search through my
>> 20,000 audio files looking for probable duplicates. I say "probable"
>> because I found file names like these:
>>
>> Lovelock - Trumpet Concerto (SSO Concert).mp3
>> Trumpet Concerto (William Lovelock).mp3
>>
>> There are many other duplicates with rearranged, abbreviated or misspelt
>> words in the names. I was about to click "New Project" and start typing but
>> I suddenly realised I had no idea what algorithm to use to find probable
>> duplicates and rate them. Has anyone done this sort of thing before or know
>> where to find a description of a suitable algorithm?
>>
>> *Greg K*
>>
>
>