//TSESearch.cpp 中:
CHzSeg iHzSeg; // declared in ChSeg/HzSeg.h

// Segment the raw query string into separator-delimited terms,
// e.g. "我/ 愛/ 妳們/ 的/ ".
iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery);

// Split the segmented query at its separators and store the terms,
// in order, one per vector element.
vector<string> vecTerm;
iQuery.ParseQuery(vecTerm);

// Build the set of results relevant to the terms.
set<string> setRelevantRst;
iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);

// Record the end-of-search timestamp.
gettimeofday(&end_tv,&tz);
// search end
看 CHzSeg 中的這個方法:
//ChSeg/HzSeg.h
/**
 * Pre-process a sentence and segment it.
 *
 * The input is consumed from the front, one run at a time, and classified by
 * the value of the run's first byte:
 *   - below 128: a run of single-byte characters, cut at LF/CR;
 *   - 128..175:  a run of two-byte non-Hanzi characters (punctuation etc.);
 *   - otherwise: a run of two-byte characters whose lead byte is >= 176,
 *                which is handed to SegmentHzStrMM().
 * The byte ranges suggest a GB2312-style double-byte encoding — TODO confirm.
 *
 * @param dict  dictionary forwarded to SegmentHzStrMM()
 * @param s1    sentence to segment (taken by value; consumed locally)
 * @return the segmented sentence, with SEPARATOR appended after each
 *         emitted single-byte or punctuation run
 */
string CHzSeg::SegmentSentenceMM (CDict &dict, string s1) const
{
    string s2="";
    unsigned int i,len;

    while (!s1.empty())
    {
        unsigned char ch=(unsigned char) s1[0];
        if(ch<128)
        { // single-byte (ASCII) run
            // NOTE(review): this branch was unreadable in this copy of the
            // file (markup stripping removed everything between '<' and '>').
            // It has been restored from the tokens that survived; verify it
            // against the project's own HzSeg.cpp.
            i=1;
            len = s1.size();
            while (i<len && ((unsigned char)s1[i]<128)
                && (s1[i]!=10) && (s1[i]!=13)) // stop at LF, CR
            {
                i++;
            }

            if ((ch!=32) && (ch!=10) && (ch!=13)) // run does not start with SP, LF, CR
            {
                s2 += s1.substr(0, i) + SEPARATOR;
            }
            else
            {
                if (ch==10 || ch==13) // keep line breaks, without a separator
                {
                    s2 += s1.substr(0, i);
                }
            }

            if (i <= s1.size()) // added by yhf
                s1=s1.substr(i);
            else break; // yhf
            continue;
        }
        else
        {
            if (ch<176)
            { // two-byte non-Hanzi characters such as Chinese punctuation
                i=0;
                len=s1.length();
                // Extend the run while the lead byte stays in 161..175 and the
                // character is not one of the excluded punctuation marks.
                // NOTE(review): in the last group only the 172/161 trail bytes
                // are tied to lead byte 163; trail bytes 168/169/186/187/191
                // end the run for ANY lead byte because && binds tighter than
                // ||. This looks unintended, but behaviour is left unchanged.
                while (i<len && ((unsigned char)s1[i]<176) && ((unsigned char)s1[i]>=161)
                    && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=162 && (unsigned char)s1[i+1]<=168)))
                    && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=171 && (unsigned char)s1[i+1]<=191)))
                    && (!((unsigned char)s1[i]==163 && ((unsigned char)s1[i+1]==172 || (unsigned char)s1[i+1]==161)
                    || (unsigned char)s1[i+1]==168 || (unsigned char)s1[i+1]==169 || (unsigned char)s1[i+1]==186
                    || (unsigned char)s1[i+1]==187 || (unsigned char)s1[i+1]==191)))
                {
                    i=i+2; // assumes there is never half of a two-byte character
                }

                // An excluded mark at the front is emitted on its own.
                if (i==0) i=i+2;

                // The full-width space (161,161) is dropped, not emitted.
                if (!(ch==161 && (unsigned char)s1[1]==161))
                {
                    if (i <= s1.size()) // yhf
                        // consecutive non-Hanzi two-byte characters are emitted together
                        s2 += s1.substr(0, i) + SEPARATOR;
                    else break; // yhf
                }

                if (i <= s1.size()) // yhf
                    s1=s1.substr(i);
                else break; // yhf
                continue;
            }
        }

        // From here on: a run of two-byte characters with lead byte >= 176.
        i = 2;
        len = s1.length();
        while (i<len && (unsigned char)s1[i]>=176)
        // while (i<len && (unsigned char)s1[i]>=128 && (unsigned char)s1[i]!=161)
            i+=2;

        s2+=SegmentHzStrMM(dict, s1.substr(0,i));

        if (i <= len) // yhf
            s1=s1.substr(i);
        else break; // yhf: a truncated trailing character ends processing
    }

    return s2;
}
//Query.cpp
<pre class="csharp" name="code">/**
* 程序翻譯說明
* 將以"/"劃分開的關鍵字壹壹順序放入壹個向量容器中
*
* @access public
* @param vector<string></string> 參數的漢字說明:向量容器
* @return void
*/
void CQuery::ParseQuery(vector<string></string> &vecTerm)
{
string::size_type idx;
while ( (idx = m_sSegQuery.find("/ ")) != string::npos ) {
vecTerm.push_back(m_sSegQuery.substr(0,idx));
m_sSegQuery = m_sSegQuery.substr(idx+3);
}
}
</pre>
<pre class="csharp" name="code"> </pre>
<pre class="csharp" name="code"><pre class="csharp" name="code">/**
* 程序翻譯說明
* 相關性分析查詢,構造結果集合setRelevantRst //瓶頸所在
*
* @access public
* @param vector<string></string> map set<string></string> 參數的漢字說明: 用戶提交關鍵字的分詞組,倒排索引映射,相關性結果集合
* @return string 0
*/
bool CQuery::GetRelevantRst
(
vector<string></string> &vecTerm,
map &mapBuckets,
set<string></string> &setRelevantRst
) const
{
set<string></string> setSRst;
bool bFirst=true;
vector<string></string>::iterator itTerm = vecTerm.begin();
for ( ; itTerm != vecTerm.end(); ++itTerm )
{
setSRst.clear();
copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));
map mapRstDoc;
string docid;
int doccnt;
map::iterator itBuckets = mapBuckets.find(*itTerm);
if (itBuckets != mapBuckets.end())
{
string strBucket = (*itBuckets).second;
string::size_type idx;
idx = strBucket.find_first_not_of(" ");
strBucket = strBucket.substr(idx);
while ( (idx = strBucket.find(" ")) != string::npos )
{
docid = strBucket.substr(0,idx);
doccnt = 0;
if (docid.empty()) continue;
map::iterator it = mapRstDoc.find(docid);
if ( it != mapRstDoc.end() )
{
doccnt = (*it).second + 1;
mapRstDoc.erase(it);
}
mapRstDoc.insert( pair(docid,doccnt) );
strBucket = strBucket.substr(idx+1);
}
// remember the last one
docid = strBucket;
doccnt = 0;
map::iterator it = mapRstDoc.find(docid);
if ( it != mapRstDoc.end() )
{
doccnt = (*it).second + 1;
mapRstDoc.erase(it);
}
mapRstDoc.insert( pair(docid,doccnt) );
}
// sort by term frequencty
multimap > newRstDoc;
map::iterator it0 = mapRstDoc.begin();
for ( ; it0 != mapRstDoc.end(); ++it0 ){
newRstDoc.insert( pair((*it0).second,(*it0).first) );
}
multimap::iterator itNewRstDoc = newRstDoc.begin();
setRelevantRst.clear();
for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){
string docid = (*itNewRstDoc).second;
if (bFirst==true) {
setRelevantRst.insert(docid);
continue;
}
if ( setSRst.find(docid) != setSRst.end() ){
setRelevantRst.insert(docid);
}
}
//cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<br>";
bFirst = false;
}
return true;
}</pre>
</pre>
接下來的就是顯示了