算法:
* 0、构建Trie树
* 1、遍历目标字符串中的每一个字符,如果字符不存在,继续下一个。
* 2、如果字符存在,从该字符下一个字符开始遍历,匹配成功后直接返回匹配字符长度,然后跳过这个长度并不是+1, 算是优化了【匹配到一个词后直接返回,没有往下找最长】,否则回到第1步
* 3、重复1-2
/* * 敏感词返回 * * 例: 敏感词: [白粉,白粉人] 吸白粉的白粉人 => [白粉] [白粉] */ class TreeMap { // 节点字符 public $data; // 存放子节点引用(因为有任意个子节点,所以靠数组来存储) public $children = []; // 是否是字符串结束字符 public $isEndingChar = false; public function __construct($data) { $this->data = $data; } } class TrieTree { /** * 敏感词数组 * * @var array * @author qpf */ public $trieTreeMap = []; public function __construct() { $this->trieTreeMap = new TreeMap('/'); } /** * 获取敏感词Map * * @return array * @author qpf */ public function getTreeMap() { return $this->trieTreeMap; } /** * 添加敏感词 * * @param array $txtWords * @author qpf */ public function addWords(array $words) { foreach ($words as $word) { $trieTreeMap = $this->trieTreeMap; $len = mb_strlen($word); for ($i = 0; $i < $len; $i++) { $char = mb_substr($word, $i, 1); if(!isset($trieTreeMap->children[$char])){ $newNode = new TreeMap($char); $trieTreeMap->children[$char] = $newNode; } $trieTreeMap = $trieTreeMap->children[$char]; } $trieTreeMap->isEndingChar = true; } // var_dump($this->trieTreeMap); } /** * 查找对应敏感词 * * @param string $txt * @return array * @author qpf */ public function search($txt) { $wordsList = []; $txtLength = mb_strlen($txt); for ($i = 0; $i < $txtLength; $i++) { $wordLength = $this->checkWord($txt, $i, $txtLength); if ($wordLength > 0) { $words = mb_substr($txt, $i, $wordLength); $wordsList[] = $words; // 匹配到一个词后,跳过整个词 $i += $wordLength - 1; } } return $wordsList; } /** * 敏感词检测 * * @param $txt * @param $beginIndex * @param $length * @return int */ private function checkWord($txt, $beginIndex, $length) { $flag = false; $wordLength = 0; // 获取敏感词树 $trieTree = $this->trieTreeMap; for ($i = $beginIndex; $i < $length; $i++) { // 检验单个字 $word = mb_substr($txt, $i, 1); // 如果树中不存在,结束 if (!isset($trieTree->children[$word])) { break; } // 如果存在 $wordLength++; $trieTree = $trieTree->children[$word]; if ($trieTree->isEndingChar === true) { // 直接退出了,没有匹配最长的敏感词 $flag = true; break; } } if ($beginIndex > 0) { // 如果$flag == false 赋值$wordLenth为0 $flag || $wordLength = 0; } return $wordLength; } } // 有bug,如果是返回有共同前缀的字符串,就会跳过。比如这句话,之后匹配到[白粉] [白粉人匹配不到] // $data = ['白粉', '白粉人']; $data = ['白粉', '白粉人', '白粉人嫩','不该大']; $wordObj = new TrieTree(); $wordObj->addWords($data); $txt = '中国人民白粉不应该吸收白粉人'; $words = $wordObj->search($txt); var_dump($words);die;