Hi Greg,
Please find below what I have used in the past.
It is very expensive, but I cannot see a better way of doing it.
It returns a score which is the sum of the following, divided by the
combined length of the two cleaned strings:
- number of times the same letter appears in both strings
- 10 times the number of times the same two letters appear in both
strings
- 100 times the number of times the same three letters appear in both
strings
Once you get your results, sort them; the most similar strings will have
the highest scores.
I used this many years ago and have not used it since.
There may be (far) better ways to do this.
Regards
Greg Harris
/// <summary>
/// Reduces a string to its ASCII digits and letters, converting lower-case
/// letters to upper case. Every other character (whitespace, punctuation,
/// non-ASCII characters) is dropped.
/// </summary>
/// <param name="aText">Text to normalise</param>
/// <returns>The retained characters, in their original order</returns>
public static string CleanStr ( this string aText )
{
    StringBuilder cleaned = new StringBuilder();
    for ( int pos = 0; pos < aText.Length; pos++ )
    {
        char current = aText[pos];
        bool isDigit = current >= '0' && current <= '9';
        bool isUpper = current >= 'A' && current <= 'Z';
        bool isLower = current >= 'a' && current <= 'z';

        if ( isDigit || isUpper )
        {
            // Already in canonical form: keep as-is.
            cleaned.Append(current);
        }
        else if ( isLower )
        {
            // Shift 'a'..'z' onto 'A'..'Z'.
            cleaned.Append((char)(current - 'a' + 'A'));
        }
        // Anything else is intentionally discarded.
    }
    return cleaned.ToString();
}
/// <summary>
/// Do a sounds like compare, the higher the result, the more the
/// words/phrases sound the same.
/// Both inputs are first normalised with CleanStr. Every pair of positions
/// (one in each string) holding the same character scores 1; it scores a
/// further 10 when the next characters also match, and a further 100 when
/// the characters after those match as well (i.e. a run of three identical
/// characters). The total is divided by the combined length of the two
/// cleaned strings.
/// </summary>
/// <param name="aStr1">First word / phrase</param>
/// <param name="aStr2">Second word / phrase</param>
/// <returns>Score; 0 when neither cleaned input contains any character</returns>
public static double CompareSoundsLike ( this string aStr1, string aStr2 )
{
    aStr1 = aStr1.CleanStr();
    aStr2 = aStr2.CleanStr();

    int combinedLength = aStr1.Length + aStr2.Length;
    if ( combinedLength == 0 )
    {
        // Nothing to compare; returning here avoids 0.0 / 0, which is NaN
        // and would not sort sensibly against real scores.
        return 0;
    }

    double result = 0;
    for ( int i = 0; i < aStr1.Length; i++ )
    {
        for ( int j = 0; j < aStr2.Length; j++ )
        {
            if ( aStr1[i] != aStr2[j] )
            {
                continue;
            }

            // Same single character at (i, j).
            result++;

            bool secondMatches = ( i < aStr1.Length - 1 )
                              && ( j < aStr2.Length - 1 )
                              && ( aStr1[i + 1] == aStr2[j + 1] );
            if ( !secondMatches )
            {
                // Without a two-character run there cannot be a
                // three-character run starting here.
                continue;
            }
            result += 10;

            bool thirdMatches = ( i < aStr1.Length - 2 )
                             && ( j < aStr2.Length - 2 )
                             && ( aStr1[i + 2] == aStr2[j + 2] );
            if ( thirdMatches )
            {
                result += 100;
            }
        }
    }
    return result / combinedLength;
}
/// <summary>
/// Two six-letter strings that differ only in case must score as identical:
/// 6 single matches + 5 two-letter runs * 10 + 4 three-letter runs * 100 = 456,
/// divided by the combined length of 12.
/// </summary>
[TestMethod] public void Test_10_Compare1()
{
    //                   123456
    string lTestLine1 = "qwerty";
    string lTestLine2 = "QWERTY";
    // Floating-point literals so the expected value is never truncated by
    // integer division.
    double lExpected = 456.0 / ( 6.0 + 6.0 );
    double lResult = lTestLine1.CompareSoundsLike( lTestLine2 );
    // Compare with a tolerance rather than exact floating-point equality.
    Assert.AreEqual( lExpected, lResult, 1e-9 );
}
/// <summary>
/// Punctuation, spacing and case must not affect the score: both inputs
/// contain the same 32 letters in the same order once cleaned.
/// </summary>
[TestMethod] public void Test_10_Compare2()
{
    //                   123456789-123456789-123456789-123456789-12xxx
    string lTestLine1 = "The quick brown fox jumped over the !@#$ dog!";
    string lTestLine2 = "T H E - Q U I C K - B R O W N - F O X - J U M P E D - O V E R - T H E - D O G";
    // 66 single matches + 39 two-letter runs * 10 + 34 three-letter runs * 100
    // = 3856, divided by the combined cleaned length (32 + 32).
    double lExpected = 3856.0 / ( 32.0 + 32.0 );
    double lResult = lTestLine1.CompareSoundsLike( lTestLine2 );
    // Compare with a tolerance rather than exact floating-point equality.
    Assert.AreEqual( lExpected, lResult, 1e-9 );
}
On Sat, Nov 29, 2014 at 9:46 AM, Greg Keogh <[email protected]> wrote:
> Folks, I was about to write some utility code to search through my
> 20,000 audio files looking for probable duplicates. I say "probable"
> because I found file names like these:
>
> Lovelock - Trumpet Concerto (SSO Concert).mp3
> Trumpet Concerto (William Lovelock).mp3
>
> There are many other duplicates with rearranged, abbreviated or misspelt
> words in the names. I was about to click "New Project" and start typing but
> I suddenly realised I had no idea what algorithm to use to find probable
> duplicates and rate them. Has anyone done this sort of thing before, or does
> anyone know where to find a description of a suitable algorithm?
>
> *Greg K*
>