# 50行Python代码写一个语言检测器

jopen 10年前

La femme boit du lait. (译者注： 法语：女性喝牛奶。)

• 单个字符的重复性
• 字符串的重复性

class NGram(object):
    """Table of character n-gram counts for a piece of text.

    ``table`` maps every run of ``n`` consecutive characters found in the
    whitespace-normalized text to the number of times it occurs.
    ``length`` is only a placeholder here and stays ``None``.
    """

    def __init__(self, text, n=3):
        self.table = {}
        self.n = n
        self.length = None
        self.parse_text(text)

    def parse_text(self, text):
        """Slide an n-character window over ``text`` and count each window."""
        # Collapse every whitespace run to one space and terminate the text
        # with a space so the last word also yields a word-boundary n-gram.
        normalized = " ".join(text.split()) + " "

        counts = self.table
        window = ' ' * self.n  # start from n spaces: "beginning of text"
        for symbol in normalized:
            # Drop the oldest character, take in the newest one.
            window = window[1:] + symbol
            if window in counts:
                counts[window] += 1
            else:
                counts[window] = 1

`{    '  S': 1,    ' Sn': 1,    'Sna': 1,    'nai': 1,    'ail': 2,    'il ': 1,    'l M': 1,    ' Ma': 1,    'Mai': 1,    'il.': 1,    'l. ': 1  }`

class NGram(object):
    """Character n-gram frequency profile of a text.

    The text is whitespace-normalized and scanned with a sliding window of
    ``n`` characters; ``table`` maps each window to its occurrence count.
    Treating ``table`` as a sparse vector, two profiles built with the same
    ``n`` can be compared with ``a - b``, which yields their cosine distance.

    Attributes:
        n: Window size used to build the table.
        table: Dict mapping each n-gram string to its count.
        length: Euclidean magnitude of the count vector, as computed by the
            last call to ``calculate_length``.
    """

    def __init__(self, text, n=3):
        self.length = None
        self.n = n
        self.table = {}
        self.parse_text(text)
        self.calculate_length()

    def parse_text(self, text):
        """Add the n-grams of ``text`` to ``self.table``.

        Whitespace runs are collapsed to a single space and one trailing
        space is appended, so even an empty text contributes one n-gram.
        ``self.length`` is not refreshed here; call ``calculate_length``
        afterwards if the table changed.
        """
        chars = ' ' * self.n  # initial sequence of spaces with length n

        for letter in " ".join(text.split()) + " ":
            chars = chars[1:] + letter  # shift the window by one character
            self.table[chars] = self.table.get(chars, 0) + 1  # increment count

    def calculate_length(self):
        """Treat the n-gram table as a vector and return its magnitude.

        The value is also stored in ``self.length`` for use by ``__sub__``.
        """
        self.length = sum(x * x for x in self.table.values()) ** 0.5
        return self.length

    def __sub__(self, other):
        """Return the cosine distance between two NGram objects.

        The result is a float between 0 and 1, where 0 means both tables
        point in the same direction and 1 means they share no n-gram.

        Raises:
            TypeError: If ``other`` is not an NGram, or if the two objects
                were built with different values of ``n``.
        """
        if not isinstance(other, NGram):
            raise TypeError("Can't compare NGram with non-NGram object.")

        if self.n != other.n:
            raise TypeError("Can't compare NGram objects of different size.")

        # Dot product of the two sparse count vectors.
        total = 0
        for gram, count in self.table.items():
            total += count * other.table.get(gram, 0)

        cosine = float(total) / (float(self.length) * float(other.length))

        # sqrt(s) * sqrt(s) may differ from s by one ulp, which could push
        # the distance marginally outside [0, 1]; clamp it back.
        return min(1.0, max(0.0, 1.0 - cosine))

    def find_match(self, languages):
        """Return the entry of ``languages`` closest to this object.

        ``languages`` is an iterable of NGram objects, each representing one
        language. ``min`` raises ValueError when it is empty.
        """
        return min(languages, key=lambda candidate: self - candidate)

    # Alias: lets callers use either name for the same lookup.
    best_match = find_match

# Build a trigram profile for a language from representative training text.
english = NGram(training_text, n=3)  # trigram

# Subtracting two profiles gives their distance: despite the variable name,
# 0 means the profiles are identical and larger values mean less alike.
similarity = english - NGram(text, n=3)

# Given one profile per candidate language, pick the one closest to the
# unknown text. The lookup method defined on NGram is find_match.
languages = [english, spanish, french]
NGram(text, n=3).find_match(languages)

N-Grams的概念可以在不同的领域应用。比如：

• 语法拼写建议（建议改正非正确语法词汇）
• 鉴定DNA序列
• 提高压缩算法的有效性
• 改进搜索引擎
• 改进语音识别系统及其功能：利用某个特定词语出现在另一个词语之后的概率