Java 敏感词过滤算法

jopen 8年前

1.DFA算法

DFA算法的原理可以参考 这里 ,简单来说就是通过Map构造出一棵敏感词树,树中每一条由根节点到“结束标记”节点(通常是叶子节点)的路径构成一个敏感词,例如下图:

代码简单实现如下:

/**
 * Sensitive-word filter based on a DFA, implemented as a character trie.
 *
 * <p>Every keyword read from the word-list resource becomes a path from the trie root;
 * the node reached by the last character of a keyword is flagged as a word end. Checking
 * a text walks the trie from every start position and reports each keyword met on the way.
 *
 * <p>The trie is built lazily on the first check. No synchronization is performed.
 */
public class TextFilterUtil {

    // Logger
    private static final Logger LOG = LoggerFactory.getLogger(TextFilterUtil.class);

    // Charset used to decode the word-list resource
    private static final String ENCODING = "gbk";

    // Classpath location of the word list (one keyword per line)
    private static final String KEY_WORDS_RESOURCE = "sensitive/keyWords.txt";

    // Root of the keyword trie; null until init() has run
    private static Node sensitiveWordRoot = null;

    /** One trie node: outgoing edges keyed by character, plus a word-end flag. */
    private static final class Node {
        private final Map<Character, Node> children = new HashMap<>();
        private boolean end;
    }

    /**
     * Builds the keyword trie from the word-list resource.
     *
     * <p>The trie is assembled in a local variable and stored in the static field only
     * once it is complete.
     */
    private static void init() {
        Set<String> keyWords = readSensitiveWords();
        Node root = new Node();
        for (String keyWord : keyWords) {
            createKeyWord(root, keyWord);
        }
        sensitiveWordRoot = root;
    }

    /**
     * Inserts one keyword into the trie, creating missing nodes along its path.
     *
     * @param root    root of the trie being built
     * @param keyWord keyword to insert
     */
    private static void createKeyWord(Node root, String keyWord) {
        Node current = root;
        for (char c : keyWord.toCharArray()) {
            Node next = current.children.get(c);
            if (next == null) {
                next = new Node();
                current.children.put(c, next);
            }
            current = next;
        }
        current.end = true;
    }

    /**
     * Reads the keyword list from the classpath resource.
     *
     * <p>Lines are trimmed and blank lines are skipped. Failures are logged and whatever
     * was read up to that point is returned.
     *
     * @return the keywords read; empty when the resource is missing or unreadable
     */
    private static Set<String> readSensitiveWords() {
        Set<String> keyWords = new HashSet<>();
        // The stream is opened here rather than kept in a static field: a stream can be
        // consumed only once, and getResourceAsStream returns null for a missing resource.
        try (InputStream in =
                TextFilterUtil.class.getClassLoader().getResourceAsStream(KEY_WORDS_RESOURCE)) {
            if (in == null) {
                LOG.error("敏感词库文件不存在!");
                return keyWords;
            }
            // Closing 'in' (done by try-with-resources) is sufficient; the reader only wraps it.
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, ENCODING));
            String line;
            while ((line = reader.readLine()) != null) {
                String keyWord = line.trim();
                if (!keyWord.isEmpty()) {
                    keyWords.add(keyWord);
                }
            }
        } catch (UnsupportedEncodingException e) {
            LOG.error("敏感词库文件转码失败!", e);
        } catch (IOException e) {
            LOG.error("敏感词库文件读取失败!", e);
        }
        return keyWords;
    }

    /**
     * Finds every keyword occurrence in the given text.
     *
     * <p>All matches are reported, including overlapping ones and keywords that are
     * prefixes of longer keywords, in order of start position and then length.
     *
     * @param text text to scan; {@code null} or empty yields an empty list
     * @return the matched keywords, one entry per occurrence
     */
    private static List<String> checkSensitiveWord(String text) {
        List<String> sensitiveWords = new ArrayList<>();
        if (text == null || text.isEmpty()) {
            return sensitiveWords;
        }
        if (sensitiveWordRoot == null) {
            init();
        }
        int length = text.length();
        for (int i = 0; i < length; i++) {
            Node node = sensitiveWordRoot.children.get(text.charAt(i));
            // Invariant: 'node' is the trie node reached after consuming text[i, j).
            int j = i + 1;
            while (node != null) {
                // Test the end flag before looking at the next character, so that a
                // keyword finishing on the last character of the text is still reported.
                if (node.end) {
                    sensitiveWords.add(text.substring(i, j));
                }
                if (j >= length) {
                    break;
                }
                node = node.children.get(text.charAt(j));
                j++;
            }
        }
        return sensitiveWords;
    }
}

2.TTMP算法

TTMP算法由网友原创,关于它的起源可以查看 这里 ,TTMP算法的原理是将敏感词拆分成“脏字”的序列,只有待比对字符串完全由“脏字”组成时,才去判断它是否为敏感词,减少了比对次数。这个算法的简单实现如下:

public class TextFilterUtil {   //日志   private static final Logger LOG = LoggerFactory.getLogger(TextFilterUtil.class);   //默认编码格式   private static final String ENCODING = "gbk";   //敏感词库的路径   private static final InputStream in = TextFilterUtil.class.getClassLoader().getResourceAsStream("sensitive/keyWords.txt");   //脏字库   private static Set<Character> sensitiveCharSet = null;   //敏感词库   private static Set<String> sensitiveWordSet = null;   /**    * 初始化敏感词库    */   private static void init() {    //初始化容器    sensitiveCharSet = new HashSet<>();    sensitiveWordSet = new HashSet<>();    //读取文件 创建敏感词库    readSensitiveWords();   }   /**    * 读取本地的敏感词文件    *    * @return    */   private static void readSensitiveWords() {    BufferedReader reader = null;    try {     reader = new BufferedReader(new InputStreamReader(in, ENCODING));     String line;     while ((line = reader.readLine()) != null) {      String word = line.trim();      sensitiveWordSet.add(word);      for (Character c : word.toCharArray()) {       sensitiveCharSet.add(c);      }     }    } catch (UnsupportedEncodingException e) {     LOG.error("敏感词库文件转码失败!");    } catch (FileNotFoundException e) {     LOG.error("敏感词库文件不存在!");    } catch (IOException e) {     LOG.error("敏感词库文件读取失败!");    } finally {     if (reader != null) {      try {       reader.close();      } catch (IOException e) {       e.printStackTrace();      }      reader = null;     }    }    return;   }   /**    * 检查敏感词    *    * @return    */   private static List<String> checkSensitiveWord(String text) {    if (sensitiveWordSet == null || sensitiveCharSet == null) {     init();    }    List<String> sensitiveWords = new ArrayList<>();    for (int i = 0; i < text.length(); i++) {     Character word = text.charAt(i);     if (!sensitiveCharSet.contains(word)) {      continue;     }     int j = i;     while (j < text.length()) {      if (!sensitiveCharSet.contains(word)) {       break;      }      String key = text.substring(i, j + 1);      if 
(sensitiveWordSet.contains(key)) {       sensitiveWords.add(key);      }      j++;     }    }    return sensitiveWords;   }  }

注:以上代码实现仅用于展示思路,在实际使用中还有很多地方可以优化。