//Monkey's 二元分词
function sp_str($str) {
//所有汉字后添加ASCII的0字符,此法是为了排除特殊中文拆分错误的问题
$str=preg_replace("/[\x80-\xff]{2}/","\\0".chr(0x00),$str);
//拆分的分割符
$search = array(",", "/", "\\", ".", ";", ":", "\"", "!", "~", "`", "^", "(", ")", "?", "-", "\t", "\n", "'", "<", ">", "\r", "\r\n", "$", "&", "%", "#", "@", "+", "=", "{", "}", "[", "]", ":", ")", "(", ".", "。", ",", "!", ";", "“", "”", "‘", "’", "[", "]", "、", "—", " ", "《", "》", "-", "…", "【", "】",);
www.phperz.com //替换所有的分割符为空格
$str = str_replace($search,' ',$str);
//用正则匹配半角单个字符或者全角单个字符,存入数组$ar
preg_match_all("/[\x80-\xff]?./",$str,$ar);$ar=$ar[0];
//去掉$ar中ASCII为0字符的项目
for ($i=0;$i<count($ar);$i++) if ($ar[$i]!=chr(0x00)) $ar_new[]=$ar[$i];
$ar=$ar_new;unset($ar_new);$oldsw=0;
//把连续的半角存成一个数组下标,或者全角的每2个字符存成一个数组的下标
for ($ar_str='',$i=0;$i<count($ar);$i++) {
$sw=strlen($ar[$i]);
if ($i>0 and $sw!=$oldsw) $ar_str.=" ";
if ($sw==1) $ar_str.=$ar[$i];
else
if (strlen($ar[$i+1])==2) $ar_str.=$ar[$i].$ar[$i+1].' ';
elseif ($oldsw==1 or $oldsw==0) $ar_str.=$ar[$i];
$oldsw=$sw;
}
//去掉连续的空格
$ar_str=trim(preg_replace("# {1,}#i"," ",$ar_str));//$ar_str = "Monkey s 二元 元分 分词"
phperz~com
//返回拆分后的结果
return explode(' ',$ar_str);
}
接下来,就该考虑如何把分好的词转换成单字节的,可以使用base64,sha1,md5。但有个问题就是转换后的字符有点长,那如何才能缩短字符呢,对了,就是使用区位码,因为区位码短啊,一个中文只占四个字节。$query = "SELECT title, MATCH( title_ft ) AGAINST( '$title_ft' IN BOOLEAN MODE ) AS score
FROM info
WHERE MATCH( title_ft ) AGAINST( '$title_ft' IN BOOLEAN MODE )
ORDER BY score DESC ";
其中 $title_ft是经过两个函数处理后的字符串,用它去匹配title_ft。