Soundex算法(函数)在不同编程语言中的实现

本文概述

  • C
  • C#
  • D
  • F#
  • Go
  • Java
  • JavaScript
  • Objective-C
  • PHP
  • python
  • Ruby
  • Scala
  • Swift
  • VBScript
【Soundex算法(函数)在不同编程语言中的实现】Soundex是一种语音算法, 用于按(英语)发音为名称建立索引。对不同字符串分别计算Soundex代码并进行比较, 即可判断这些字符串读起来有多相似。
代码的第一个字符是表达式的第一个字符, 并转换为大写。代码的第二到第四个字符是数字, 代表表达式中的字母。字母A, E, I, O, U, H, W和Y会被忽略, 除非它们是字符串的第一个字母。所有A-Z范围以外的国际字母字符都被视为元音。因此, 听起来几乎相同的两个字符串应该具有相同的Soundex代码。例如, 单词"text"和"tixt"都会生成代码"T230"。
让我们开始吧 !
C
#include <ctype.h>
#include <stdio.h>

/*
 * Soundex class of every 7-bit ASCII character, filled in by init():
 *   -1    vowel (A, E, I, O, U) - separates equal consonant codes
 *    0    not coded (H, W, Y and every non-letter) - skipped, no separation
 *   1..6  consonant class
 * Declared signed char so that -1 survives on platforms where plain char is
 * unsigned.
 */
static signed char code[128] = { 0 };

/*
 * Return the Soundex class of one input byte. Bytes outside 7-bit ASCII are
 * reported as vowels, which also keeps them from indexing outside code[].
 */
static int code_of(char ch)
{
    unsigned char u = (unsigned char)ch;

    return u < 128 ? code[u] : -1;
}

/*
 * Compute the four-character Soundex code of s.
 *
 * init() must have been called once beforehand. The result lives in a static
 * buffer that is overwritten by the next call. A NULL or empty input yields
 * the empty string.
 */
const char* soundex(const char *s)
{
    static char out[5];
    int c, prev, i;

    out[0] = out[4] = 0;
    if (!s || !*s) return out;

    /* The first letter is emitted as-is (upper-cased), not as a digit. */
    out[0] = (char)toupper((unsigned char)*s++);

    /* first letter, though not coded, can still affect next letter: Pfister */
    prev = code_of(out[0]);
    for (i = 1; *s && i < 4; s++) {
        if ((c = code_of(*s)) == prev) continue;

        if (c == -1) {
            prev = 0;                   /* vowel as separator */
        } else if (c > 0) {
            out[i++] = (char)(c + '0');
            prev = c;
        }
        /* c == 0 (H, W, Y, non-letters): ignored, prev is left untouched */
    }
    while (i < 4) out[i++] = '0';       /* pad short codes with zeros */

    return out;
}

/* Assign class c to every letter of s, in both upper and lower case. */
void add_code(const char *s, int c)
{
    while (*s) {
        unsigned char u = (unsigned char)*s;

        if (u < 128)
            code[u] = code[0x20 ^ u] = (signed char)c;  /* 0x20 flips ASCII case */
        s++;
    }
}

/* Populate the lookup table; call once before the first soundex() call. */
void init()
{
    /* Index i maps to class i - 1: vowels -> -1, "" -> 0, "BFPV" -> 1, ... */
    static const char *cls[] =
        { "AEIOU", "", "BFPV", "CGJKQSXZ", "DT", "L", "MN", "R", 0 };
    int i;

    for (i = 0; cls[i]; i++)
        add_code(cls[i], i - 1);
}

用法
/* Demo: build the lookup table, then print the code for "Javascript". */
int main(void)
{
    init();
    /* Prints J126. An explicit "%s" format is used so that the computed
     * string is never interpreted as a format string. */
    printf("%s\n", soundex("Javascript"));
    return 0;
}

C#
using System.Text;
using System.Text.RegularExpressions;

/// <summary>
/// Computes four-character Soundex codes for words.
/// </summary>
public static class Soundex
{
    /// <summary>
    /// Returns the Soundex code of <paramref name="word"/>.
    /// </summary>
    /// <param name="word">Text to encode; may be null or empty.</param>
    /// <returns>
    /// A four-character code, or "0000" when nothing is left after
    /// punctuation has been stripped (including null/empty input).
    /// </returns>
    public static string For(string word)
    {
        const int MaxSoundexCodeLength = 4;

        var soundexCode = new StringBuilder();
        var previousWasHOrW = false;

        // Upper-case the input and strip everything that is neither a word
        // character nor whitespace.
        word = Regex.Replace(
            word == null ? string.Empty : word.ToUpper(),
            @"[^\w\s]",
            string.Empty);

        if (string.IsNullOrEmpty(word))
            return string.Empty.PadRight(MaxSoundexCodeLength, '0');

        // The first character is kept verbatim.
        soundexCode.Append(word[0]);

        for (var i = 1; i < word.Length; i++)
        {
            var numberCharForCurrentLetter = GetCharNumberForLetter(word[i]);

            // Second character shares the first letter's code: skip it.
            if (i == 1 &&
                numberCharForCurrentLetter == GetCharNumberForLetter(soundexCode[0]))
                continue;

            // Same code on both sides of an H or W: skip it.
            if (soundexCode.Length > 2 && previousWasHOrW &&
                numberCharForCurrentLetter == soundexCode[soundexCode.Length - 2])
                continue;

            // Same code as the entry just appended: skip it.
            if (soundexCode.Length > 0 &&
                numberCharForCurrentLetter == soundexCode[soundexCode.Length - 1])
                continue;

            // '0' placeholders are appended too; they act as separators and
            // are removed again below.
            soundexCode.Append(numberCharForCurrentLetter);
            previousWasHOrW = "HW".IndexOf(word[i]) >= 0;
        }

        return soundexCode
            .Replace("0", string.Empty)
            .ToString()
            .PadRight(MaxSoundexCodeLength, '0')
            .Substring(0, MaxSoundexCodeLength);
    }

    /// <summary>
    /// Maps an upper-case letter to its Soundex digit; every character
    /// without a digit maps to '0'.
    /// </summary>
    private static char GetCharNumberForLetter(char letter)
    {
        if ("BFPV".IndexOf(letter) >= 0) return '1';
        if ("CGJKQSXZ".IndexOf(letter) >= 0) return '2';
        if ("DT".IndexOf(letter) >= 0) return '3';
        if ('L' == letter) return '4';
        if ("MN".IndexOf(letter) >= 0) return '5';
        if ('R' == letter) return '6';

        return '0';
    }
}

用法
// A bare comparison is not a valid C# statement, so the result is stored.
// soundsAlike is true: both phrases encode to C614.
bool soundsAlike = Soundex.For("CSharp Language") == Soundex.For("CSherp Language");

D
D标准库(Phobos)已包含soundex函数。
import std.stdio : writeln;
import std.string : soundex;

/// Checks Phobos' soundex against a table of known word/code pairs.
void main()
{
    // Each row is [input word, expected Soundex code].
    immutable string[2][] expectations = [
        ["soundex",   "S532"],
        ["example",   "E251"],
        ["ciondecks", "C532"],
        ["ekzampul",  "E251"],
        ["Robert",    "R163"],
        ["Rupert",    "R163"],
        ["Rubin",     "R150"],
        ["Ashcraft",  "A261"],
        ["Ashcroft",  "A261"],
        ["Tymczak",   "T522"],
    ];

    foreach (row; expectations)
        assert(soundex(row[0]) == row[1]);
}

F#
/// Computes the four-character American Soundex code of `x`.
/// Returns "0000" for a null or empty string.
let americanSoundex (x : string) =
    // Soundex digit of an upper-case letter; '0' marks every character that
    // carries no digit (vowels, Y, H, W and anything outside A-Z).
    let codeOf ch =
        match ch with
        | 'B' | 'F' | 'P' | 'V' -> '1'
        | 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' -> '2'
        | 'D' | 'T' -> '3'
        | 'L' -> '4'
        | 'M' | 'N' -> '5'
        | 'R' -> '6'
        | _ -> '0'

    // Collapse every run of equal neighbours to a single element, whatever
    // the length of the run.
    let rec collapse xs =
        match xs with
        | h0 :: h1 :: t when h0 = h1 -> collapse (h0 :: t)
        | h :: t -> h :: collapse t
        | [] -> []

    // The guard comes first so the pipeline below never sees an empty list.
    if System.String.IsNullOrEmpty x then "0000"
    else
        let letters = x.ToUpper().ToCharArray() |> Array.toList
        let first = List.head letters

        // H and W after the first letter are dropped before coding so that
        // they do not separate equal codes.
        let rest = List.tail letters |> List.filter (fun ch -> ch <> 'H' && ch <> 'W')

        let digits =
            (first :: rest)
            |> List.map codeOf
            |> collapse
            |> List.tail                          // the first letter is emitted as a letter
            |> List.filter (fun d -> d <> '0')    // drop the separators

        // Pad with zeros, then cut to exactly four characters.
        let padded = Seq.append (first :: digits) [ '0'; '0'; '0' ] |> Seq.truncate 4
        System.String(padded |> Seq.toArray)

[ "Robert"; "Rupert"; "Robbert"; "Rubin"; "Beer"; "Bear"; "Bearer"; "Smith"; "Smyth"
  "Ashcraft"; "Ashcroft"; "Tymczak"; "Pfister" ]
|> List.map (fun x -> (x, americanSoundex x))
|> List.iter (fun (x, y) -> printfn "%-8s = %s" x y)

(*
Robert   = R163
Rupert   = R163
Robbert  = R163
Rubin    = R150
Beer     = B600
Bear     = B600
Bearer   = B660
Smith    = S530
Smyth    = S530
Ashcraft = A261
Ashcroft = A261
Tymczak  = T522
Pfister  = P236
*)

Go
package myPackageName

import (
	"bytes"
	"fmt" // used by the usage example (main)
	"strings"
)

// codeLen is the length of every Soundex code.
const codeLen = 4

// codes maps a lower-case letter to its Soundex digit. Letters that carry no
// digit map to the empty string.
var codes = map[string]string{
	"a": "", "b": "1", "c": "2", "d": "3",
	"e": "", "f": "1", "g": "2", "h": "",
	"i": "", "j": "2", "k": "2", "l": "4",
	"m": "5", "n": "5", "o": "", "p": "1",
	"q": "2", "r": "6", "s": "2", "t": "3",
	"u": "", "v": "1", "w": "", "x": "2",
	"y": "", "z": "2",
}

// Soundex returns the four-character Soundex code of s, upper-cased.
//
// The input is processed byte by byte. An empty string yields "0000".
func Soundex(s string) string {
	if len(s) == 0 {
		return strings.Repeat("0", codeLen)
	}

	lower := strings.ToLower(s)

	var encoded bytes.Buffer
	encoded.WriteByte(lower[0])

	// The first letter is not emitted as a digit, but its code still
	// suppresses an equal code that follows it (e.g. "Pfister").
	previous := codes[string(lower[0])]

	for i := 1; i < len(lower) && encoded.Len() < codeLen; i++ {
		current := string(lower[i])

		// h and w are transparent: they do not separate equal codes.
		if current == "h" || current == "w" {
			continue
		}

		// Vowels and unknown bytes have no digit; remembering the empty
		// code makes them act as separators.
		code := codes[current]
		if code != "" && code != previous {
			encoded.WriteByte(code[0])
		}
		previous = code
	}

	if encoded.Len() < codeLen {
		padding := strings.Repeat("0", codeLen-encoded.Len())
		encoded.WriteString(padding)
	}

	return strings.ToUpper(encoded.String())
}

用法
// main prints the Soundex code of a sample word.
func main() {
	sample := Soundex("Javascript")
	fmt.Println(sample) // J126
}

Java
private static String getCode(char c){switch(c){case 'B': case 'F': case 'P': case 'V':return "1"; case 'C': case 'G': case 'J': case 'K':case 'Q': case 'S': case 'X': case 'Z':return "2"; case 'D': case 'T':return "3"; case 'L':return "4"; case 'M': case 'N':return "5"; case 'R':return "6"; default:return ""; }} public static String soundex(String s){String code, previous, soundex; code = s.toUpperCase().charAt(0) + ""; previous = "7"; for(int i = 1; i < s.length(); i++){String current = getCode(s.toUpperCase().charAt(i)); if(current.length() > 0 & & !current.equals(previous)){code = code + current; }previous = current; }soundex = (code + "0000").substring(0, 4); return soundex; }

用法
public static void main(String[] args){System.out.println(soundex("Soundex")); //S532System.out.println(soundex("Example")); //E251System.out.println(soundex("Sownteks")); //S532System.out.println(soundex("Ekzampul")); //E251}

JavaScript
/**
 * Compute the four-character Soundex code of a string.
 *
 * @param {string} s - word to encode
 * @returns {string} upper-case code such as "J126"; "0000" for an empty string
 */
var soundex = function (s) {
    // Digit for each consonant; vowels map to '' and every other character
    // (including y) is absent from the table.
    var codes = {
        a: '', e: '', i: '', o: '', u: '',
        b: 1, f: 1, p: 1, v: 1,
        c: 2, g: 2, j: 2, k: 2, q: 2, s: 2, x: 2, z: 2,
        d: 3, t: 3,
        l: 4,
        m: 5, n: 5,
        r: 6
    };

    var letters = s.toLowerCase().split('');
    if (letters.length === 0) {
        return '0000';
    }

    var first = letters.shift();
    var result = first;

    // The first letter is not emitted as a digit, but its code still
    // suppresses an equal code that follows it (e.g. "Pfister").
    var previous = codes[first];

    for (var i = 0; i < letters.length; i++) {
        var ch = letters[i];

        // h and w are transparent: they do not separate equal codes.
        if (ch === 'h' || ch === 'w') {
            continue;
        }

        // '' (vowel) and undefined (unknown) are both falsy: nothing is
        // emitted, but previous is reset so they act as separators.
        var code = codes[ch];
        if (code && code !== previous) {
            result += code;
        }
        previous = code;
    }

    return (result + '000').slice(0, 4).toUpperCase();
};

用法
// Evaluates to true: both spellings encode to J126.
soundex("Javascript") == soundex("Jabascript");

Objective-C
你可以在Darkseed编写的GitHub Gist中找到Soundex算法的Objective-C实现。
PHP
PHP已经内置了soundex函数, 用于计算字符串的soundex键。
用法
// Evaluates to true: both strings produce the same soundex key.
// NOTE(review): the article quoted the key as P100; PHP's built-in soundex()
// appears to return P261 for both strings - confirm before relying on it.
soundex("PHP Server Language") == soundex("PHP Serber language");

Python
函数
def get_soundex(name):
    """Get the soundex code for the string.

    Args:
        name: The word to encode.

    Returns:
        A four-character code such as "R163". An empty string yields "0000".
    """
    # Letter groups and the digit each group is encoded as.
    groups = {"BFPV": "1", "CGJKQSXZ": "2", "DT": "3", "L": "4", "MN": "5", "R": "6"}
    digit_of = {letter: digit for letters, digit in groups.items() for letter in letters}

    name = name.upper()
    if not name:
        return "0000"

    soundex = name[0]
    # The first letter is not emitted as a digit, but its code still
    # suppresses an equal code that follows it (e.g. "Pfister").
    previous = digit_of.get(name[0], "")

    for char in name[1:]:
        # H and W are transparent: they do not separate equal codes.
        if char in "HW":
            continue
        # Vowels and other characters have no digit; remembering the empty
        # code makes them act as separators.
        code = digit_of.get(char, "")
        if code and code != previous:
            soundex += code
        previous = code

    return soundex[:4].ljust(4, "0")

用法
# Sample names; the variable is not called `list`, so the builtin stays usable.
names = ["Smith", "Smythe", "Robert", "Rupert", "Schultz", "Shultz"]

# Print a two-column table: name, then its soundex code.
print("NAME\t\tSOUNDEX")
for name in names:
    print("%s\t\t%s" % (name, get_soundex(name)))

库
如果你更喜欢使用现成的库, 可以使用Fuzzy包(它借助通过Pyrex编写的C扩展来提高速度)。
Ruby
class String
  # Coded consonants and, position by position, the digit each one maps to.
  SoundexChars = 'BFPVCGJKQSXZDTLMNR'
  SoundexNums  = '111122222222334556'
  SoundexCharsEx = '^' + SoundexChars
  SoundexCharsDel = '^A-Z'

  # desc: http://en.wikipedia.org/wiki/Soundex
  #
  # Returns the Soundex code of the string. With census = true (the default)
  # the code is cut to one letter plus three digits; with census = false all
  # digits are kept. Returns '' when the string contains no A-Z letter.
  def soundex(census = true)
    str = upcase.delete(SoundexCharsDel)
    return '' if str.empty?

    first = str[0, 1]
    # H and W after the first letter are removed before coding so that they
    # do not separate equal codes.
    rest = str[1..-1].delete('HW')

    # Code the whole word, first letter included, so that a second letter
    # with the same code is merged into it (e.g. "Pfister"). Letters without
    # a digit become '0' and act as separators while runs are squeezed.
    digits = (first + rest).tr(SoundexChars, SoundexNums).tr('^1-6', '0').squeeze
    digits = digits[1..-1].delete('0')
    digits = digits[0..2] if census

    first + digits.ljust(3, '0')
  end

  # True when both strings have the same Soundex code.
  def sounds_like(other)
    soundex == other.soundex
  end
end

用法
# Encode the words pairwise and report whether each pair sounds alike.
%w(Soundex Sownteks Example Ekzampul foo bar).each_slice(2) do |word1, word2|
  [word1, word2].each { |word| puts '%-8s -> %s' % [word, word.soundex] }

  print "'#{word1}' "
  print word1.sounds_like(word2) ? "sounds" : "does not sound"
  print " like '#{word2}'\n"
end

# Expected output:
# Soundex  -> S532
# Sownteks -> S532
# 'Soundex' sounds like 'Sownteks'
# Example  -> E251
# Ekzampul -> E251
# 'Example' sounds like 'Ekzampul'
# foo      -> F000
# bar      -> B600
# 'foo' does not sound like 'bar'

Scala
/** Computes the four-character Soundex code of `s`.
  *
  * In this variant every letter without a digit, H and W included, separates
  * equal codes, so "Ashcraft" encodes to "A226". An empty string yields
  * "0000".
  */
def soundex(s: String) = {
  if (s.isEmpty) "0000"
  else {
    var code = s.head.toUpper.toString
    // The first letter's own code suppresses an equal code right after it.
    var previous = getCode(code.head)

    for (ch <- s.drop(1); current = getCode(ch.toUpper)) {
      if (!current.isEmpty && current != previous)
        code += current
      previous = current
    }

    // Pad with zeros, then cut to exactly four characters.
    code += "0000"
    code.slice(0, 4)
  }
}

/** Maps an upper-case letter to its Soundex digit, or "" when it has none. */
def getCode(c: Char) = c match {
  case 'B' | 'F' | 'P' | 'V' => "1"
  case 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => "2"
  case 'D' | 'T' => "3"
  case 'L' => "4"
  case 'M' | 'N' => "5"
  case 'R' => "6"
  case _ => ""
}

用法
/** Runs soundex over a table of names and prints expected vs. actual codes. */
def main(args: Array[String]): Unit = {
  // name -> expected code
  val tests = Map(
    "Soundex"     -> "S532",
    "Euler"       -> "E460",
    "Gauss"       -> "G200",
    "Hilbert"     -> "H416",
    "Knuth"       -> "K530",
    "Lloyd"       -> "L300",
    "Lukasiewicz" -> "L222",
    "Ellery"      -> "E460",
    "Ghosh"       -> "G200",
    "Heilbronn"   -> "H416",
    "Kant"        -> "K530",
    "Ladd"        -> "L300",
    "Lissajous"   -> "L222",
    "Wheaton"     -> "W350",
    "Ashcraft"    -> "A226",
    "Burroughs"   -> "B622",
    "Burrows"     -> "B620",
    "O'Hara"      -> "O600"
  )

  tests.foreach { v =>
    val code = soundex(v._1)
    val status = if (code == v._2) "OK" else "ERROR"
    // Fields are separated by spaces so adjacent values do not run together.
    printf("Name: %-20s Code: %s Found: %s - %s\n", v._1, v._2, code, status)
  }
}

Swift
在这个GitHub仓库中, cafford编写的类是原始Soundex算法的Swift实现。
//
//  Soundex.swift
//  speller
//
//  Created by Clifford Helsel on 4/28/16.
//
//  Based on standard Soundex algorithm and loosely ported from Apache Commons
//  https://commons.apache.org/proper/commons-codec/apidocs/src-html/org/apache/commons/codec/language/Soundex.html

/// Encodes words as four-character Soundex codes.
///
/// Uses `String` as a collection of `Character`, which requires Swift 4 or
/// later.
public class Soundex {

    // Position i of the mapping string is the Soundex digit of letter i of
    // the alphabet; "0" means the letter carries no digit.
    private static let en_mapping_string = Array("01230120022455012623010202")
    private static let en_alphabet = Array("ABCDEFGHIJKLMNOPQRSTUVWXYZ")

    private let mapping: [Character: Character] =
        Soundex.buildMapping(codes: Soundex.en_alphabet, alphabet: Soundex.en_mapping_string)

    /// Pairs every element of `codes` (used as key) with the element of
    /// `alphabet` at the same index (used as value).
    private static func buildMapping(codes: Array<Character>, alphabet: Array<Character>) -> [Character: Character] {
        var retval: [Character: Character] = [:]
        for (index, code) in codes.enumerated() {
            retval[code] = alphabet[index]
        }
        return retval
    }

    /// Returns the digit of the character at `index`, or `nil` when the
    /// character must be skipped without affecting the previous code.
    ///
    /// A coded character that directly follows an H or W is skipped when the
    /// character before that H/W has the same digit, or is itself H or W.
    private func getMappingCode(chars: [Character], index: Int) -> Character? {
        let mappedChar = mapChar(c: chars[index])
        if index > 1 && mappedChar != "0" {
            let hwChar = chars[index - 1]
            if hwChar == "H" || hwChar == "W" {
                let prehwChar = chars[index - 2]
                let firstCode = mapChar(c: prehwChar)
                if firstCode == mappedChar || prehwChar == "H" || prehwChar == "W" {
                    return nil
                }
            }
        }
        return mappedChar
    }

    /// Digit of a single upper-case character.
    private func mapChar(c: Character) -> Character {
        if let val = mapping[c] {
            return val
        }
        return "0" // not specified in original Soundex specification, if character is not found, code is 0
    }

    /// Computes the Soundex code of `of`.
    ///
    /// - Parameter of: the word to encode.
    /// - Returns: a four-character code padded with "0", e.g. "R163" for
    ///   "Robert"; the empty string when `of` is empty.
    public func soundex(of: String) -> String {
        guard !of.isEmpty else {
            return ""
        }

        // Convert once to an array so each lookup is O(1).
        let chars = Array(of.uppercased())

        var out: [Character] = Array(repeating: "0", count: 4)
        out[0] = chars[0]

        // The first letter's digit still suppresses an equal digit after it.
        var last = mapChar(c: chars[0])
        var incount = 1
        var count = 1

        while incount < chars.count && count < out.count {
            let mapped = getMappingCode(chars: chars, index: incount)
            incount += 1

            // nil: skipped by the H/W rule, `last` stays as it is.
            guard let code = mapped else {
                continue
            }
            if code != "0" && code != last {
                out[count] = code
                count += 1
            }
            // Remember every processed digit, "0" included, so that only
            // adjacent equal digits are merged.
            last = code
        }
        return String(out)
    }
}

用法
let c = Soundex()
// NOTE(review): the article listed C631 as the result; standard American
// Soundex for "Christopher" is C623 - confirm against the class in use.
c.soundex(of:"Christopher")

VBScript
' Maps an upper-case letter to its Soundex digit.
' Returns "" for every character that carries no digit.
Function getCode(c)
    Select Case c
        Case "B", "F", "P", "V"
            getCode = "1"
        Case "C", "G", "J", "K", "Q", "S", "X", "Z"
            getCode = "2"
        Case "D", "T"
            getCode = "3"
        Case "L"
            getCode = "4"
        Case "M", "N"
            getCode = "5"
        Case "R"
            getCode = "6"
        Case Else
            getCode = ""
    End Select
End Function

' Computes the four-character Soundex code of s, padded with "0".
Function soundex(s)
    ' All locals are declared so the function also runs under Option Explicit.
    Dim code, previous, current, letter, i

    code = UCase(Mid(s, 1, 1))
    ' The first letter is not emitted as a digit, but its code still
    ' suppresses an equal code that follows it (e.g. "Pfister").
    previous = getCode(code)

    For i = 2 To Len(s)
        letter = UCase(Mid(s, i, 1))
        ' H and W are transparent: they do not separate equal codes.
        If letter <> "H" And letter <> "W" Then
            current = getCode(letter)
            If Len(current) > 0 And current <> previous Then
                code = code & current
            End If
            ' Characters without a digit reset previous, so they separate.
            previous = current
        End If
    Next

    soundex = Mid(code, 1, 4)
    If Len(code) < 4 Then
        soundex = soundex & String(4 - Len(code), "0")
    End If
End Function

最后, 如果你知道Soundex算法在另一种语言中的实现(或者你对现有语言有更好的摘录), 请不要害羞, 并在评论框中与我们分享, 祝你玩得开心!

    推荐阅读