This is a useful little algorithm and I particularly like it for its simplicity and the fact that the heuristics used in the algorithm work well in most cases (one limitation of Soundex is that it falls short of covering words that sound the same but have a different first letter, e.g. "site" and "cite" produce different Soundex codes). Soundex is useful when writing search functionality where you want to account for misspellings in the users query. It's worth pointing out that SQL Server natively supports Soundex (see the Soundex function in T-SQL, for example).
My C# implementation is below - I opted to implement it in a static class that exposes one public method "For". You can download the Visual Studio solution from here (the zip contains a class library project with a corresponding test project). Note that the downloadable solution includes code comments.
public static class Soundex
{
public static string For(string word)
{
const int MaxSoundexCodeLength = 4;
var soundexCode = new StringBuilder();
var previousWasHOrW = false;
word = Regex.Replace(
word == null ? string.Empty : word.ToUpper(),
@"[^\w\s]",
string.Empty);
if (string.IsNullOrEmpty(word))
return string.Empty.PadRight(MaxSoundexCodeLength, '0');
soundexCode.Append(word.First());
for (var i = 1; i < word.Length; i++)
{
var numberCharForCurrentLetter =
GetCharNumberForLetter(word[i]);
if (i == 1 &&
numberCharForCurrentLetter ==
GetCharNumberForLetter(soundexCode[0]))
continue;
if (soundexCode.Length > 2 && previousWasHOrW &&
numberCharForCurrentLetter ==
soundexCode[soundexCode.Length - 2])
continue;
if (soundexCode.Length > 0 &&
numberCharForCurrentLetter ==
soundexCode[soundexCode.Length - 1])
continue;
soundexCode.Append(numberCharForCurrentLetter);
previousWasHOrW = "HW".Contains(word[i]);
}
return soundexCode
.Replace("0", string.Empty)
.ToString()
.PadRight(MaxSoundexCodeLength, '0')
.Substring(0, MaxSoundexCodeLength);
}
private static char GetCharNumberForLetter(char letter)
{
if ("BFPV".Contains(letter)) return '1';
if ("CGJKQSXZ".Contains(letter)) return '2';
if ("DT".Contains(letter)) return '3';
if ('L' == letter) return '4';
if ("MN".Contains(letter)) return '5';
if ('R' == letter) return '6';
return '0';
}
}
Example: {
public static string For(string word)
{
const int MaxSoundexCodeLength = 4;
var soundexCode = new StringBuilder();
var previousWasHOrW = false;
word = Regex.Replace(
word == null ? string.Empty : word.ToUpper(),
@"[^\w\s]",
string.Empty);
if (string.IsNullOrEmpty(word))
return string.Empty.PadRight(MaxSoundexCodeLength, '0');
soundexCode.Append(word.First());
for (var i = 1; i < word.Length; i++)
{
var numberCharForCurrentLetter =
GetCharNumberForLetter(word[i]);
if (i == 1 &&
numberCharForCurrentLetter ==
GetCharNumberForLetter(soundexCode[0]))
continue;
if (soundexCode.Length > 2 && previousWasHOrW &&
numberCharForCurrentLetter ==
soundexCode[soundexCode.Length - 2])
continue;
if (soundexCode.Length > 0 &&
numberCharForCurrentLetter ==
soundexCode[soundexCode.Length - 1])
continue;
soundexCode.Append(numberCharForCurrentLetter);
previousWasHOrW = "HW".Contains(word[i]);
}
return soundexCode
.Replace("0", string.Empty)
.ToString()
.PadRight(MaxSoundexCodeLength, '0')
.Substring(0, MaxSoundexCodeLength);
}
private static char GetCharNumberForLetter(char letter)
{
if ("BFPV".Contains(letter)) return '1';
if ("CGJKQSXZ".Contains(letter)) return '2';
if ("DT".Contains(letter)) return '3';
if ('L' == letter) return '4';
if ("MN".Contains(letter)) return '5';
if ('R' == letter) return '6';
return '0';
}
}
// Both lines below output R100
Console.WriteLine(Soundex.For("Ravi"));
Console.WriteLine(Soundex.For("Ravee"));
Console.WriteLine(Soundex.For("Ravi"));
Console.WriteLine(Soundex.For("Ravee"));
Không có nhận xét nào:
Đăng nhận xét