require_once 'Zend/Search/Lucene.php';
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
// Demo: build a small Lucene index with three Chinese documents, split a
// search phrase into tokens with the custom analyzer, and print every
// document that matches any of the tokens.

// The analyzer must be registered BEFORE documents are added: Zend_Search_Lucene
// tokenizes field text with the default analyzer at indexing time, and the
// query parser uses the same default analyzer at search time. Registering it
// only after indexing would leave the index built with a different tokenizer
// than the one used for the queries.
// NOTE(review): Analyzer.php is expected to define a class named `Analyzer`
// that exposes setCnStopWords(); it is not visible in this file - confirm.
require_once('Analyzer.php');
Zend_Search_Lucene_Analysis_Analyzer::setDefault(new Analyzer());
$analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
$cnStopWords = array('的');
$analyzer->setCnStopWords($cnStopWords);

// (Re)create the index and add the sample documents.
$index = new Zend_Search_Lucene('myindex', true);

$doc = new Zend_Search_Lucene_Document();
$doc->addField(Zend_Search_Lucene_Field::Text('content', '这是', 'utf-8'));

$doc1 = new Zend_Search_Lucene_Document();
$doc1->addField(Zend_Search_Lucene_Field::Text('content', '本身没有提供中文分词算法,具体应用中要自己写。我这里使用简单的二元分词算法', 'utf-8'));

$doc2 = new Zend_Search_Lucene_Document();
$doc2->addField(Zend_Search_Lucene_Field::Text('content', '徐华梁比如上面的输出就是', 'utf-8'));

$index->addDocument($doc);
$index->addDocument($doc1);
$index->addDocument($doc2);
$index->commit();

// Re-open the freshly written index for searching.
$index = new Zend_Search_Lucene('myindex');

// Tokenize the search phrase. All tokens are collected up front because the
// query parser below feeds its own input to the same default analyzer
// instance, which would disturb a token stream that is still being read.
$value = '徐华梁';
$analyzer->setInput($value, 'utf-8');

$tokens = array();
while (($token = $analyzer->nextToken()) !== null) {
    $tokens[] = $token;
}

// Run one query per token and print the content of every hit.
// A document matching several tokens is printed once per matching token.
$count = 0;
foreach ($tokens as $tokenObject) {
    $keyword = $tokenObject->getTermText();
    $query = Zend_Search_Lucene_Search_QueryParser::parse($keyword, 'utf-8');
    $hits = $index->find($query);
    foreach ($hits as $hit) {
        $count++;
        echo $hit->content.'<br>';
    }
}

if ($count == 0) {
    echo "没有找到任何匹配的记录,唉……<br />";
}
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php';
/**
 * Analyzer that splits runs of multi-byte (non-ASCII) characters into
 * overlapping two-character tokens (bigrams) and runs of ASCII
 * alphanumerics into single word tokens.
 *
 * Behaviour worth knowing:
 *  - The input is treated as a UTF-8 byte string; token offsets are byte
 *    offsets into the input AFTER reset() has stripped delimiters.
 *  - A non-ASCII run consisting of a single character yields no token.
 *  - reset() deletes the listed delimiter characters outright (it does not
 *    replace them with a separator), so text on both sides of a delimiter
 *    becomes adjacent. Stop words, in contrast, are replaced by a space.
 */
class Helper_Analyzer extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Current byte offset of the scanner inside $this->_input.
     *
     * @var integer
     */
    private $_position;

    /**
     * Stop words that reset() replaces with a single space.
     *
     * @var array
     */
    private $_cnStopWords = array();

    /**
     * Set the list of stop words removed from the input on reset().
     *
     * @param array $cnStopWords list of stop-word strings
     * @return void
     */
    public function setCnStopWords( $cnStopWords )
    {
        $this->_cnStopWords = $cnStopWords;
    }

    /**
     * Reset token stream
     *
     * Rewinds the scanner to the start of the input, deletes the delimiter
     * characters listed below and replaces every configured stop word with
     * a space.
     *
     * @return void
     */
    public function reset()
    {
        $this->_position = 0;

        // Nothing to clean up when no input has been set; nextToken()
        // returns null for a null input.
        if ($this->_input === null) {
            return;
        }

        $search = array(",", "/", "、", ".", ";", ":", "\"", "!", "~", "`", "^", "(", ")", "?", "-", "'", "<", ">", "$", "&", "%", "#", "@", "+", "=", "{", "}", "[", "]", ":", ")", "(", ".", "。", ",", "!", ";", "“", "”", "‘", "’", "[", "]", "、", "—", " ", "《", "》", "-", "…", "【", "】", "?", "¥" );
        $this->_input = str_replace( $search, '', $this->_input );
        $this->_input = str_replace( $this->_cnStopWords, ' ', $this->_input );
    }

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * Non-ASCII runs are emitted as overlapping bigrams: after a
     * two-character token is produced the scanner steps back by the length
     * of its second character, so that character also starts the next token.
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        $len = strlen($this->_input);

        while ($this->_position < $len) {
            // Skip leading spaces.
            while ($this->_position < $len && $this->_input[$this->_position] == ' ') {
                $this->_position++;
            }

            // Only spaces were left: stop instead of reading past the end
            // of the string.
            if ($this->_position >= $len) {
                break;
            }

            $termStartPosition = $this->_position;
            $isCnWord = false;
            $lastCharLength = 0;

            if (ord($this->_input[$this->_position]) > 127) {
                // Multi-byte run: consume up to two characters, using each
                // character's real byte length rather than assuming 3 bytes.
                $charCount = 0;
                while ($this->_position < $len && ord($this->_input[$this->_position]) > 127) {
                    $lastCharLength = $this->_charLengthAt($this->_position, $len);
                    $this->_position += $lastCharLength;
                    $charCount++;
                    if ($charCount == 2) {
                        $isCnWord = true;
                        break;
                    }
                }

                // A lone multi-byte character does not form a bigram; skip it.
                if ($charCount == 1) {
                    continue;
                }
            } else {
                // ASCII run: consume consecutive alphanumeric bytes.
                while ($this->_position < $len && ctype_alnum($this->_input[$this->_position])) {
                    $this->_position++;
                }
            }

            // Nothing consumed (non-alphanumeric ASCII byte): step over it.
            if ($this->_position == $termStartPosition) {
                $this->_position++;
                continue;
            }

            $termText = substr($this->_input, $termStartPosition, $this->_position - $termStartPosition);
            $token = new Zend_Search_Lucene_Analysis_Token($termText, $termStartPosition, $this->_position);
            $token = $this->normalize($token);

            // Step back over the second character of the bigram so the next
            // token overlaps with this one.
            if ($isCnWord) {
                $this->_position -= $lastCharLength;
            }

            // normalize() may drop the token (null); keep scanning then.
            if ($token !== null) {
                return $token;
            }
        }

        return null;
    }

    /**
     * Byte length of the UTF-8 character starting at the given offset.
     *
     * The expected length is derived from the lead byte; only continuation
     * bytes (10xxxxxx) actually present are counted, so a truncated or
     * malformed sequence never swallows following characters or runs past
     * the end of the input. Stray continuation bytes count as length 1.
     *
     * @param integer $offset byte offset of the lead byte in $this->_input
     * @param integer $len    byte length of $this->_input
     * @return integer number of bytes (1 to 4)
     */
    private function _charLengthAt($offset, $len)
    {
        $lead = ord($this->_input[$offset]);

        if ($lead >= 0xF0) {
            $expected = 4;
        } elseif ($lead >= 0xE0) {
            $expected = 3;
        } elseif ($lead >= 0xC0) {
            $expected = 2;
        } else {
            $expected = 1;
        }

        $length = 1;
        while ($length < $expected
            && $offset + $length < $len
            && (ord($this->_input[$offset + $length]) & 0xC0) == 0x80
        ) {
            $length++;
        }

        return $length;
    }
}