Lucene代码示例:使用SpanQuery找到keyword在文档中第一次出现的地方

无干货,仅供复制

位置信息类


package player.kent.chen.temp.lucene.span;

import org.apache.commons.lang.builder.ToStringBuilder;

/**
 * Mutable holder pairing a file with a position in that file's token stream.
 * Instances are obtained through {@link #createInstance(String, int)}; the
 * constructor is private.
 */
public class KeywordLocation {
    /** The file value supplied by the caller. */
    private String file;
    /** Position in the token stream. */
    private int    position;

    private KeywordLocation(String file, int position) {
        this.file = file;
        this.position = position;
    }

    /**
     * Builds a location from the given file and token-stream position.
     *
     * @param file     the file value to store
     * @param position the position in the token stream
     * @return a new instance carrying both values
     */
    public static final KeywordLocation createInstance(String file, int position) {
        return new KeywordLocation(file, position);
    }

    /** @return the stored file value */
    public String getFile() {
        return this.file;
    }

    /** @param file the new file value */
    public void setFile(String file) {
        this.file = file;
    }

    /** @return the position in the token stream */
    public int getPosition() {
        return this.position;
    }

    /** @param position the new position in the token stream */
    public void setPosition(int position) {
        this.position = position;
    }

    /**
     * Renders the instance via {@code ToStringBuilder.reflectionToString} using
     * {@code ToStringStyle.SHORT_PREFIX_STYLE}.
     */
    @Override
    public String toString() {
        return ToStringBuilder.reflectionToString(this, ToStringStyle.SHORT_PREFIX_STYLE);
    }
}

搜索器


package player.kent.chen.temp.lucene.span;

import java.io.File;

/**
 * Example program: opens a Lucene index and prints the location of the first
 * span that matches a keyword in the "contents" field.
 */
public class FindFirstOccurenceSearcher {

    /**
     * Opens the index under the hard-coded root directory, looks up a sample
     * keyword and prints the keyword together with the location found (or
     * {@code null} when there is no match).
     *
     * @param args not used
     * @throws Exception if the index cannot be opened or read
     */
    public static void main(String[] args) throws Exception {
        String rootDir = "/home/kent/diskD/home-kent-dev/workspace/kent-temp/data/lucene-sanguo";
        File id = new File(rootDir, "index");

        Directory indexDir = FSDirectory.open(id);
        try {
            IndexSearcher searcher = new IndexSearcher(indexDir);
            try {
                String keyword = "Brotherhood";

                KeywordLocation kl = findFirstOccurence(searcher, keyword);
                System.out.println(MessageFormat.format("\"{0}\":{1}", keyword, kl));
            } finally {
                // Release the searcher even when the lookup throws.
                searcher.close();
            }
        } finally {
            indexDir.close();
        }
    }

    /**
     * Finds the first span matching {@code keyword} in the "contents" field.
     *
     * @param searcher searcher whose underlying reader is used for the lookup
     * @param keyword  the word to look for; it is lower-cased before matching
     * @return the "filepath" value of the matching document and the span start
     *         position, or {@code null} if the term does not occur
     * @throws IOException           if reading the index fails
     * @throws CorruptIndexException if the index is corrupt
     */
    private static KeywordLocation findFirstOccurence(IndexSearcher searcher, String keyword)
            throws IOException, CorruptIndexException {
        // Lower-case with a fixed locale so the result does not depend on the JVM
        // default (e.g. Turkish dotless i). Presumably the index was built with a
        // lower-casing analyzer -- confirm against the indexer.
        String termText = keyword.toLowerCase(java.util.Locale.ENGLISH);
        SpanTermQuery spanTermQuery = new SpanTermQuery(new Term("contents", termText));

        IndexReader indexReader = searcher.getIndexReader();
        Spans spans = spanTermQuery.getSpans(indexReader);

        // next() reports whether a span exists, so no separate search is needed
        // just to detect the empty case.
        if (!spans.next()) {
            return null;
        }

        Document doc = indexReader.document(spans.doc());
        String file = doc.get("filepath");
        int position = spans.start();

        return KeywordLocation.createInstance(file, position);
    }

}

另附索引器

package player.kent.chen.temp.lucene.span;

import java.io.File;

/**
 * Example program: rebuilds a Lucene index from the files found directly inside
 * a content directory.
 */
public class LearnSpanLuceneIndexer {

    /**
     * Deletes and recreates the index directory under the hard-coded root,
     * indexes the content directory and prints the elapsed time.
     *
     * @param args not used
     * @throws Exception if the index directory cannot be reset or indexing fails
     */
    public static void main(String[] args) throws Exception {
        String rootDir = "/home/kent/diskD/home-kent-dev/workspace/kent-temp/data/lucene-sanguo";

        File contentDir = new File(rootDir, "content");
        File indexDir = new File(rootDir, "index");

        FileUtils.deleteDirectory(indexDir);
        indexDir.mkdirs();

        long begin = now();
        doIndex(contentDir, indexDir);
        System.out.println("Done in miliseconds of : " + (now() - begin));

    }

    /**
     * Adds one document per regular file in {@code cd} to a new index at
     * {@code id}. Each document gets a "contents" field read from the file and a
     * stored "filepath" field holding the file's absolute path.
     *
     * @param cd directory whose direct children are indexed
     * @param id directory the index is written to
     * @throws IOException if {@code cd} cannot be listed or indexing fails
     */
    private static void doIndex(File cd, File id) throws IOException {
        // listFiles() returns null when cd is not a directory or cannot be read.
        File[] files = cd.listFiles();
        if (files == null) {
            throw new IOException("Cannot list content directory: " + cd.getAbsolutePath());
        }

        Directory indexDir = FSDirectory.open(id);
        try {
            IndexWriter writer = new IndexWriter(indexDir, new StandardAnalyzer(Version.LUCENE_30),
                    true, IndexWriter.MaxFieldLength.UNLIMITED);
            try {
                for (File file : files) {
                    // FileReader cannot open a directory, so skip anything that
                    // is not a regular file.
                    if (!file.isFile()) {
                        continue;
                    }
                    System.out.println("Indexing ... " + file.getAbsolutePath());
                    FileReader contents = new FileReader(file);
                    try {
                        Document doc = new Document();
                        doc.add(new Field("contents", contents));
                        doc.add(new Field("filepath", file.getAbsolutePath(), Field.Store.YES,
                                Field.Index.ANALYZED));
                        writer.addDocument(doc);
                    } finally {
                        // Release the file handle even if addDocument fails;
                        // closing an already-closed reader has no effect.
                        contents.close();
                    }
                }
            } finally {
                // Always close the writer so it is not left open on failure.
                writer.close();
            }
        } finally {
            indexDir.close();
        }
    }

    /** @return the current time in milliseconds */
    private static long now() {
        return System.currentTimeMillis();
    }

}

Leave a Comment

Your email address will not be published.

This site uses Akismet to reduce spam. Learn how your comment data is processed.