Hi Greg,

Please find below what I have used in the past.
It is computationally expensive, but I cannot see a better way of doing it.
It returns a score (a double): the sum of the following, divided by the
combined length of the two cleaned strings:

   - number of times the same letter appears in both strings
   - 10 times the number of times the same two letters appear in both
   strings
   - 100 times the number of times the same three letters appear in both
   strings

Once you get your results, sort them; the most similar strings will have
the highest scores.
I used this many years ago and have not used it since.
There may be (far) better ways to do this.

Regards
Greg Harris

    /// <summary>
    /// Normalises text for comparison: keeps only the ASCII digits and letters,
    /// converting lower-case letters to upper case. Every other character is dropped.
    /// </summary>
    /// <param name="aText">Text to normalise.</param>
    /// <returns>The retained characters, in their original order.</returns>
    public static string CleanStr( this string aText )
    {
      // Offset from an ASCII lower-case letter to its upper-case counterpart.
      const int lowerToUpper = 'A' - 'a';

      StringBuilder kept = new StringBuilder();

      foreach ( char c in aText )
      {
        bool isLower = c >= 'a' && c <= 'z';
        if ( isLower )
        {
          kept.Append( (char)( c + lowerToUpper ) );
          continue;
        }

        bool isDigit = c >= '0' && c <= '9';
        bool isUpper = c >= 'A' && c <= 'Z';
        if ( isDigit || isUpper )
        {
          kept.Append( c );
        }
        // Anything else (spaces, punctuation, non-ASCII) is skipped.
      }

      return kept.ToString();
    }

    /// <summary>
    /// Do a sounds like compare, the higher the result, the more the words/phrases sound the same.
    /// </summary>
    /// <remarks>
    /// Both inputs are first normalised with CleanStr. Every pair of positions (i, j) that
    /// holds the same character adds 1 to the score, plus 10 when the characters at i+1 and
    /// j+1 also match, plus 100 when the characters at i+2 and j+2 match. The +100 check does
    /// not look at the characters at i+1 and j+1. The total is divided by the combined length
    /// of the two cleaned strings. Every position of one string is compared with every
    /// position of the other.
    /// </remarks>
    /// <param name="aStr1">First word / phrase</param>
    /// <param name="aStr2">Second word / phrase</param>
    /// <returns>Score; 0 when both cleaned inputs are empty.</returns>
    public static double   CompareSoundsLike     ( this string aStr1, string aStr2 )
    {
      aStr1 = aStr1.CleanStr();
      aStr2 = aStr2.CleanStr();

      int totalLength = aStr1.Length + aStr2.Length;
      if ( totalLength == 0 )
      {
        // Nothing to compare. Without this guard the final division is 0.0 / 0, which is NaN.
        return 0;
      }

      double result = 0;

      for (int i = 0; i < aStr1.Length; i++)
      {
        char outerChar = aStr1[i];

        for (int j = 0; j < aStr2.Length; j++)
        {
          char innerChar = aStr2[j];

          if ( outerChar != innerChar )
          {
            continue;
          }

          // Same character at i and j.
          result++;

          // Following character also matches (bounds-checked on both strings).
          if ( ( i < aStr1.Length-1 ) && ( j < aStr2.Length-1 ) && ( aStr1[i+1] == aStr2[j+1] ) )
          {
            result += 10;
          }

          // Character two positions ahead matches (bounds-checked on both strings).
          if ( ( i < aStr1.Length-2 ) && ( j < aStr2.Length-2 ) && ( aStr1[i+2] == aStr2[j+2] ) )
          {
            result += 100;
          }
        }
      }

      return result / totalLength;
    }







    /// <summary>
    /// Two six-letter strings that differ only in case must score as identical.
    /// </summary>
    [TestMethod] public void Test_10_Compare1()
    {
      string lTestLine1     = "qwerty";
      string lTestLine2     = "QWERTY";

      // All letters are distinct, so only aligned positions match:
      // 6 single matches + 5 * 10 + 4 * 100 = 456, divided by the combined length 6 + 6.
      // Floating-point literals avoid integer division in the expected value.
      double lExpected      = 456.0/(6.0 + 6.0);

      double lResult        = lTestLine1.CompareSoundsLike( lTestLine2 );

      // Compare with a small tolerance rather than exact floating-point equality.
      Assert.AreEqual( lExpected, lResult, 1e-9 );
    }

    /// <summary>
    /// Two phrases that differ only in case, spacing and punctuation must score as identical.
    /// </summary>
    [TestMethod] public void Test_10_Compare2()
    {
      // Each literal is kept on one line; a regular string literal cannot span lines.
      // The characters at the former line breaks are whitespace, which CleanStr removes,
      // so they do not affect the score.
      string lTestLine1     = "The quick brown fox jumped over the !@#$ dog!";
      string lTestLine2     = "T H E  -  Q U I C K  -  B R O W N  -  F O X  -  J U M P E D  -  O V E R  -  T H E  -  D O G";

      // Both phrases clean to THEQUICKBROWNFOXJUMPEDOVERTHEDOG (32 characters):
      // 66 single matches + 39 * 10 + 34 * 100 = 3856, divided by the combined length 32 + 32.
      double lExpected      = 3856.0/(32.0 + 32.0);

      double lResult        = lTestLine1.CompareSoundsLike( lTestLine2 );

      // Compare with a small tolerance rather than exact floating-point equality.
      Assert.AreEqual( lExpected, lResult, 1e-9 );
    }




On Sat, Nov 29, 2014 at 9:46 AM, Greg Keogh <[email protected]> wrote:

> Folks, I was about to write some utility code to search through my
> 20,000 audio files looking for probable duplicates. I say "probable"
> because I found file names like these:
>
> Lovelock - Trumpet Concerto (SSO Concert).mp3
> Trumpet Concerto (William Lovelock).mp3
>
> There are many other duplicates with rearranged, abbreviated or misspelt
> words in the names. I was about to click "New Project" and start typing but
> I suddenly realised I had no idea what algorithm to use to find probable
> duplicates and rate them. Has anyone done this sort of thing before, or does
> anyone know where to find a description of a suitable algorithm?
>
> *Greg K*
>

Reply via email to