例:Lucene 的同义词分析器

package player.kent.chen.temp.lucene.synonymon;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * Token filter that injects synonyms into the wrapped token stream.
 *
 * <p>Every token read from the input is passed through unchanged. If
 * {@link MySynonymRepository#getAliasGroup(String)} knows aliases for its term, each
 * alias is emitted afterwards as an extra token that carries the attributes of the
 * original token but has a position increment of 0, so that it overlaps the original.
 */
public class MySynonymFilter extends TokenFilter {

    private final TermAttribute              termAttr;
    private final PositionIncrementAttribute piAttr;

    /** Aliases of the most recently read input token that still have to be emitted. */
    private final Queue<String>              synonyms = new LinkedList<String>();
    /** Attribute snapshot of the input token whose aliases are currently being emitted. */
    private AttributeSource.State            attrsState;

    /**
     * @param input the token stream whose tokens are expanded with synonyms
     */
    protected MySynonymFilter(TokenStream input) {
        super(input);
        this.piAttr = addAttribute(PositionIncrementAttribute.class);
        this.termAttr = addAttribute(TermAttribute.class);
    }

    /**
     * Advances to the next token: a pending alias if one is queued, otherwise the next
     * token of the wrapped stream.
     *
     * @return {@code false} once the wrapped stream is exhausted and no alias is pending
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        String syn = synonyms.poll();
        if (syn != null) {
            // Start from the attributes of the original token, then override the two
            // that differ for an alias.
            restoreState(attrsState);
            termAttr.setTermBuffer(syn);
            piAttr.setPositionIncrement(0); // same position as the original token
            return true;
        }

        // No alias left over from the previous token: advance the wrapped stream.
        if (!input.incrementToken()) {
            return false; // end of input
        }

        String term = termAttr.term();
        for (String alias : MySynonymRepository.getAliasGroup(term)) {
            // The alias group contains the term itself; queueing it would emit the
            // original token a second time.
            if (!alias.equals(term)) {
                synonyms.add(alias);
            }
        }
        // A snapshot is only needed when aliases will be emitted on the following calls.
        attrsState = synonyms.isEmpty() ? null : captureState();
        return true;
    }

    /**
     * Resets the wrapped stream and drops any aliases still buffered, so that a reused
     * filter does not replay synonyms of a previous input.
     *
     * @throws IOException if resetting the wrapped stream fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        synonyms.clear();
        attrsState = null;
    }
}

package player.kent.chen.temp.lucene.synonymon;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

/**
 * Analyzer that lets a {@link StandardAnalyzer} produce the tokens and then wraps the
 * resulting stream in a {@link MySynonymFilter}.
 */
public class MySynonymAnalyzer extends Analyzer {

    /** Analyzer that produces the token stream handed to the synonym filter. */
    private final StandardAnalyzer delegate;

    /**
     * @param standardAnalyzer analyzer whose token stream is wrapped by the synonym filter
     */
    public MySynonymAnalyzer(StandardAnalyzer standardAnalyzer) {
        this.delegate = standardAnalyzer;
    }

    /**
     * Builds the token stream for a field.
     *
     * @param fieldName name of the field being analyzed
     * @param reader    source of the field's text
     * @return the delegate's token stream wrapped in a {@link MySynonymFilter}
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream baseTokens = delegate.tokenStream(fieldName, reader);
        return new MySynonymFilter(baseTokens);
    }

}

package player.kent.chen.temp.lucene.synonymon;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

/**
 * In-memory table of synonym groups. Words in the same group are aliases of each other.
 */
public class MySynonymRepository {

    /** All known synonym groups; each group is an unmodifiable list built once at class load. */
    private static final List<List<String>> WORD_GROUPS = new ArrayList<List<String>>();
    static {
        addGroup("hello", "hi", "aloha", "nihao");
        addGroup("goodbye", "bye", "farewell", "ciao");
    }

    /** Registers one synonym group as a read-only list. */
    private static void addGroup(String... words) {
        // Wrapping the list keeps callers from writing through to the shared table.
        WORD_GROUPS.add(Collections.unmodifiableList(Arrays.asList(words)));
    }

    /**
     * Looks up the synonym group a word belongs to.
     *
     * @param word the word to look up (matched exactly, case-sensitive)
     * @return the unmodifiable group containing {@code word}, including {@code word}
     *         itself, or an empty list if the word is in no group
     */
    public static List<String> getAliasGroup(String word) {
        for (List<String> group : WORD_GROUPS) {
            if (group.contains(word)) {
                return group;
            }
        }

        return Collections.emptyList();
    }

}

package player.kent.chen.temp.lucene.synonymon;

import java.io.File;

/**
 * Command-line demo that rebuilds a Lucene index from the files of a content directory,
 * analyzing the text with {@link MySynonymAnalyzer}.
 */
public class MySynonymIndexer {

    /** Data root used when no directory is passed on the command line. */
    private static final String DEFAULT_ROOT_DIR = "/home/kent/diskD/home-kent-dev/workspace/kent-temp/data/lucene";

    /**
     * Deletes and recreates {@code <root>/index}, then indexes the files found in
     * {@code <root>/content}.
     *
     * @param args optional; {@code args[0]} overrides the default data root directory
     * @throws Exception if the index directory cannot be prepared or indexing fails
     */
    public static void main(String[] args) throws Exception {
        String rootDir = args.length > 0 ? args[0] : DEFAULT_ROOT_DIR;

        File contentDir = new File(rootDir, "content");
        File indexDir = new File(rootDir, "index");

        // Start from an empty index directory on every run.
        FileUtils.deleteDirectory(indexDir);
        if (!indexDir.mkdirs() && !indexDir.isDirectory()) {
            throw new IOException("Cannot create index directory: " + indexDir.getAbsolutePath());
        }

        long begin = now();
        doIndex(contentDir, indexDir);
        System.out.println("Done in miliseconds of : " + (now() - begin));

    }

    /**
     * Adds one document per regular file in {@code cd} to a new index in {@code id}.
     *
     * @param cd directory whose files are indexed (sub-directories are skipped)
     * @param id directory the index is written to
     * @throws IOException if {@code cd} cannot be listed or reading/writing fails
     */
    private static void doIndex(File cd, File id) throws IOException {
        // listFiles() returns null when cd is not a directory or cannot be read.
        File[] files = cd.listFiles();
        if (files == null) {
            throw new IOException("Cannot list content directory: " + cd.getAbsolutePath());
        }

        Directory indexDir = FSDirectory.open(id);
        try {
            IndexWriter writer = new IndexWriter(indexDir, new MySynonymAnalyzer(new StandardAnalyzer(
                    Version.LUCENE_30)), true, IndexWriter.MaxFieldLength.UNLIMITED);
            try {
                for (File file : files) {
                    if (!file.isFile()) {
                        continue; // a FileReader cannot be opened on a directory
                    }
                    System.out.println("Indexing ... " + file.getAbsolutePath());
                    Document doc = new Document();
                    doc.add(new Field("contents", new FileReader(file)));
                    doc.add(new Field("filepath", file.getAbsolutePath(), Field.Store.YES,
                            Field.Index.ANALYZED));
                    writer.addDocument(doc);
                }
            } finally {
                // Release the writer even when a document fails to index.
                writer.close();
            }
        } finally {
            indexDir.close();
        }

    }

    /** @return the current time in milliseconds */
    private static long now() {
        return System.currentTimeMillis();
    }

}

package player.kent.chen.temp.lucene.synonymon;

import java.io.File;
import java.text.MessageFormat;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Command-line demo that searches the "contents" field of the index under
 * {@code <root>/index} and prints a {@code cat} command for every matching file.
 */
public class MySynonymSearcher {

    /** Data root used when no directory is passed on the command line. */
    private static final String DEFAULT_ROOT_DIR = "/home/kent/diskD/home-kent-dev/workspace/kent-temp/data/lucene";

    /** Keyword searched for when none is passed on the command line. */
    private static final String DEFAULT_KEYWORD  = "ciao";

    /**
     * Runs one query against the index and prints the top 10 hits.
     *
     * @param args optional; {@code args[0]} overrides the data root directory and
     *             {@code args[1]} overrides the search keyword
     * @throws Exception if the query cannot be parsed or the index cannot be read
     */
    public static void main(String[] args) throws Exception {
        String rootDir = args.length > 0 ? args[0] : DEFAULT_ROOT_DIR;
        String keyword = args.length > 1 ? args[1] : DEFAULT_KEYWORD;
        File id = new File(rootDir, "index");

        // Parse first so that a malformed query fails before any index resource is opened.
        QueryParser qp = new QueryParser(Version.LUCENE_30, "contents", new StandardAnalyzer(
                Version.LUCENE_30));
        Query query = qp.parse(keyword);

        Directory indexDir = FSDirectory.open(id);
        try {
            IndexSearcher is = new IndexSearcher(indexDir);
            try {
                long begin = now();
                TopDocs hits = is.search(query, 10);
                System.out.println(MessageFormat.format("Found {0} matches in {1} milliseconds",
                        hits.totalHits, now() - begin));

                System.out.println("They are:");

                for (ScoreDoc scoreDoc : hits.scoreDocs) {
                    Document doc = is.doc(scoreDoc.doc);
                    String file = doc.get("filepath");
                    // With the default keyword "ciao", documents containing "bye" can be hit too.
                    String catCmd = MessageFormat.format("cat {0}", file);
                    System.out.println("Please do: " + catCmd);
                }
            } finally {
                // Release the searcher even when searching or printing fails.
                is.close();
            }
        } finally {
            indexDir.close();
        }

    }

    /** @return the current time in milliseconds */
    private static long now() {
        return System.currentTimeMillis();
    }

}

Leave a Comment

Your email address will not be published.

This site uses Akismet to reduce spam. Learn how your comment data is processed.