package player.kent.chen.temp.lucene.synonymon;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
/**
 * Lucene 3.x {@link TokenFilter} that injects synonyms into the token stream.
 * After emitting each real token it looks the term up in
 * {@link MySynonymRepository} and emits every alias as an extra token with a
 * position increment of 0, so synonyms overlap the original token's position.
 */
public class MySynonymFilter extends TokenFilter {
    // Term text of the current token.
    private final TermAttribute termAttr;
    // Position increment; set to 0 on synonym tokens so they stack on the original.
    private final PositionIncrementAttribute piAttr;
    // Aliases of the most recent real token, still waiting to be emitted.
    private final Queue<String> synonyms = new LinkedList<String>();
    // Attribute state captured when the real token was seen; restored for each synonym.
    private AttributeSource.State attrsState;

    protected MySynonymFilter(TokenStream input) {
        super(input);
        this.piAttr = addAttribute(PositionIncrementAttribute.class);
        this.termAttr = addAttribute(TermAttribute.class);
    }

    @Override
    public boolean incrementToken() throws IOException {
        String syn = synonyms.poll();
        if (syn == null) { // no synonyms left over from the previous incrementToken() call
            boolean hasToken = input.incrementToken(); // advance the wrapped stream normally
            if (!hasToken) {
                return false; // reached end of input
            }
            String term = termAttr.term(); // current term text
            List<String> synGroup = MySynonymRepository.getAliasGroup(term);
            // Queue the aliases; subsequent incrementToken() calls emit them as tokens.
            // NOTE(review): if the group contains the queried word itself, that word is
            // emitted twice at the same position — verify getAliasGroup() excludes it.
            synonyms.addAll(synGroup);
            // Remember the real token's attribute state so synonyms can copy it.
            attrsState = captureState();
            return true;
        } else {
            // Restore the attributes captured from the real token, then override two of them.
            restoreState(attrsState);
            termAttr.setTermBuffer(syn); // this token's term is the synonym text
            piAttr.setPositionIncrement(0); // overlap the original token's position
            return true;
        }
    }
}
package player.kent.chen.temp.lucene.synonymon;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
/**
 * Analyzer that decorates a {@link StandardAnalyzer}'s token stream with
 * synonym injection via {@link MySynonymFilter}.
 */
public class MySynonymAnalyzer extends Analyzer {
    /** Underlying analyzer whose token stream gets wrapped. */
    private final StandardAnalyzer delegate;

    public MySynonymAnalyzer(StandardAnalyzer standardAnalyzer) {
        this.delegate = standardAnalyzer;
    }

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream base = delegate.tokenStream(fieldName, reader);
        return new MySynonymFilter(base);
    }
}
package player.kent.chen.temp.lucene.synonymon;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/**
 * In-memory synonym dictionary. Words inside one group are mutual aliases
 * of each other (e.g. "hello" / "hi" / "aloha" / "nihao").
 */
public class MySynonymRepository {

    /** Groups of words that are mutual synonyms. */
    private static final List<String[]> wordGroups = new ArrayList<String[]>();

    static {
        wordGroups.add(new String[] { "hello", "hi", "aloha", "nihao" });
        wordGroups.add(new String[] { "goodbye", "bye", "farewell", "ciao" });
    }

    /**
     * Returns the aliases of {@code word}, excluding {@code word} itself.
     *
     * <p>Bug fix: the original returned the whole group including the queried
     * word, so a caller that emits the word plus its aliases (e.g.
     * MySynonymFilter) emitted the word twice at the same position, doubling
     * its term frequency in the index.
     *
     * @param word the word to look up (case-sensitive)
     * @return an unmodifiable list of aliases, empty when the word belongs to no group
     */
    public static List<String> getAliasGroup(String word) {
        for (String[] wordGroup : wordGroups) {
            List<String> group = Arrays.asList(wordGroup);
            if (group.contains(word)) {
                List<String> aliases = new ArrayList<String>(group);
                aliases.remove(word); // don't report the word as its own synonym
                return Collections.unmodifiableList(aliases);
            }
        }
        return Collections.emptyList();
    }
}
package player.kent.chen.temp.lucene.synonymon;
import java.io.File;
/**
 * Indexes every file under {rootDir}/content into a Lucene index at
 * {rootDir}/index, running the content through {@code MySynonymAnalyzer} so
 * synonyms are stored at the same positions as the original terms.
 */
public class MySynonymIndexer {

    public static void main(String[] args) throws Exception {
        String rootDir = "/home/kent/diskD/home-kent-dev/workspace/kent-temp/data/lucene";
        File contentDir = new File(rootDir, "content");
        File indexDir = new File(rootDir, "index");
        FileUtils.deleteDirectory(indexDir); // start from a fresh index every run
        indexDir.mkdirs();
        long begin = now();
        doIndex(contentDir, indexDir);
        // Fixed typo: was "miliseconds".
        System.out.println("Done in milliseconds of : " + (now() - begin));
    }

    /**
     * Indexes every regular file in {@code cd} into a newly created index at {@code id}.
     *
     * @param cd directory containing the content files to index
     * @param id directory to create the index in
     * @throws IOException if the content directory cannot be listed or indexing fails
     */
    private static void doIndex(File cd, File id) throws IOException {
        Directory indexDir = FSDirectory.open(id);
        IndexWriter writer = new IndexWriter(indexDir, new MySynonymAnalyzer(new StandardAnalyzer(
                Version.LUCENE_30)), true, IndexWriter.MaxFieldLength.UNLIMITED);
        try {
            File[] files = cd.listFiles();
            if (files == null) { // listFiles() returns null when cd is missing or unreadable
                throw new IOException("Cannot list content directory: " + cd.getAbsolutePath());
            }
            for (File file : files) {
                System.out.println("Indexing ... " + file.getAbsolutePath());
                Document doc = new Document();
                // NOTE(review): FileReader uses the platform default charset — confirm
                // the content files match it. (Field closes the Reader after indexing.)
                doc.add(new Field("contents", new FileReader(file)));
                doc.add(new Field("filepath", file.getAbsolutePath(), Field.Store.YES,
                        Field.Index.ANALYZED));
                writer.addDocument(doc);
            }
            // Was called with the result silently discarded; report it instead.
            System.out.println("Indexed docs: " + writer.numDocs());
        } finally {
            writer.close(); // release the index write lock even if indexing fails
        }
    }

    /** Current wall-clock time in milliseconds. */
    private static long now() {
        return System.currentTimeMillis();
    }
}
package player.kent.chen.temp.lucene.synonymon;
import java.io.File;
import java.text.MessageFormat;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Searches the synonym-enriched index built by {@code MySynonymIndexer}.
 * Because synonyms were injected at index time, a plain StandardAnalyzer
 * query for "ciao" also hits documents that only contain an alias such as "bye".
 */
public class MySynonymSearcher {

    public static void main(String[] args) throws Exception {
        String rootDir = "/home/kent/diskD/home-kent-dev/workspace/kent-temp/data/lucene";
        File id = new File(rootDir, "index");
        String keyword = "ciao";
        Directory indexDir = FSDirectory.open(id);
        IndexSearcher is = new IndexSearcher(indexDir);
        try {
            QueryParser qp = new QueryParser(Version.LUCENE_30, "contents", new StandardAnalyzer(
                    Version.LUCENE_30));
            Query query = qp.parse(keyword);
            long begin = now();
            TopDocs hits = is.search(query, 10);
            System.out.println(MessageFormat.format("Found {0} matches in {1} milliseconds",
                    hits.totalHits, now() - begin));
            System.out.println("They are:");
            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = is.doc(scoreDoc.doc);
                String file = doc.get("filepath");
                // e.g. a document containing only "bye" is still a hit for "ciao";
                // renamed from grepCmd — the command printed is `cat`, not `grep`.
                String catCmd = MessageFormat.format("cat {0}", file);
                System.out.println("Please do: " + catCmd);
            }
        } finally {
            is.close(); // release index resources even if parsing/searching throws
        }
    }

    /** Current wall-clock time in milliseconds. */
    private static long now() {
        return System.currentTimeMillis();
    }
}