// Lucene index writer; created in the constructor and used by indexFile() to add documents.
private IndexWriter writer;
// Analyzer handed to the writer and also used directly by getTermText();
// the constructor sets it to a SmartChineseAnalyzer.
private Analyzer analyzer;
public Indexer(String indexDir) throws IOException {
Directory dir = FSDirectory.open(new File(indexDir));
analyzer = new SmartChineseAnalyzer(Version.LUCENE_35, true); // 中文分詞
writer = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
}
// 建立索引
/**
 * Logs the canonical path of {@code f} to standard output, builds a document
 * for it via {@link #getDocument(File)}, and adds that document to the index writer.
 *
 * @param f the file to index
 * @throws Exception if the path cannot be resolved, the document cannot be built,
 *                   or the writer rejects the document
 */
public void indexFile(File f) throws Exception {
    String canonicalPath = f.getCanonicalPath();
    System.out.println("Indexing " + canonicalPath);
    writer.addDocument(getDocument(f));
}
// 輸出關鍵詞文本內容
/**
 * Tokenizes {@code text} with this indexer's analyzer and prints every term,
 * each followed by a single space, to standard output.
 *
 * @param fieldName name of the field the text is analyzed for
 * @param text      the text to tokenize
 * @throws RuntimeException wrapping the underlying {@link IOException} if the
 *                          token stream cannot be created, read, or closed
 */
public void getTermText(String fieldName, String text) {
    try {
        // Reuse the analyzer's token stream for better performance.
        TokenStream stream = analyzer.reusableTokenStream(fieldName, new StringReader(text));
        try {
            // Attribute that exposes the text of the current token.
            CharTermAttribute charTerm = stream.addAttribute(CharTermAttribute.class);

            // TokenStream consumer workflow: reset() before the first incrementToken(),
            // end() after the last one, and close() when finished.
            stream.reset();
            while (stream.incrementToken()) {
                System.out.print(charTerm.toString() + " ");
            }
            stream.end();
        } finally {
            stream.close();
        }
    } catch (IOException e) {
        // The signature declares no checked exception, so rethrow unchecked and keep the cause.
        throw new RuntimeException("Failed to tokenize text for field '" + fieldName + "'", e);
    }
}
/**
 * Builds the Lucene document for a file with three fields:
 * <ul>
 *   <li>{@code contents} - the file body, supplied through a {@link FileReader}</li>
 *   <li>{@code filename} - the file name, stored and not analyzed</li>
 *   <li>{@code fullpath} - the canonical path, stored and not analyzed</li>
 * </ul>
 *
 * @param f the file to describe
 * @return a new document populated with the fields above
 * @throws Exception if the file cannot be opened or its canonical path cannot be resolved
 */
protected Document getDocument(File f) throws Exception {
    Document document = new Document();

    // NOTE(review): FileReader decodes with the platform default charset - confirm
    // that this matches the encoding of the files being indexed.
    Field contents = new Field("contents", new FileReader(f));
    document.add(contents);

    Field filename = new Field("filename", f.getName(),
            Field.Store.YES, Field.Index.NOT_ANALYZED);
    document.add(filename);

    Field fullPath = new Field("fullpath", f.getCanonicalPath(),
            Field.Store.YES, Field.Index.NOT_ANALYZED);
    document.add(fullPath);

    return document;
}
}
imdict-chinese-analyzer - imdict智能詞典所采用的智能中文分詞程序
http://code.google.com/p/imdict-chinese-analyzer/
這些應該對妳有用。