Monthly Archives: December 2012

lucene几种Query对象示例

package player.kent.chen.temp.lucene.miscquery;

import java.io.IOException;

/**
 * Demonstrates the common Lucene 3.0 Query types (Term, TermRange, Prefix,
 * Wildcard, Fuzzy) against a tiny in-memory index of three one-word documents.
 */
public class MyLuceneMiscQueryDemo {

    public static void main(String[] args) throws Exception {

        // Build an in-memory index holding three single-term documents.
        Directory indexDir = new RAMDirectory();
        IndexWriter indexWriter = new IndexWriter(indexDir,
                new StandardAnalyzer(Version.LUCENE_30), IndexWriter.MaxFieldLength.UNLIMITED);

        for (String word : new String[] { "adam", "brings", "cups" }) {
            Document doc = new Document();
            doc.add(new Field("content", word, Field.Store.YES, Field.Index.ANALYZED));
            indexWriter.addDocument(doc);
        }

        // Searcher reads directly from the writer (near-real-time reader).
        IndexSearcher indexSearcher = new IndexSearcher(indexWriter.getReader());

        // TermQuery: exact term match -> hits "adam"
        doSearch(indexSearcher, new TermQuery(new Term("content", "adam")));
        // TermRangeQuery over ["b","c"] inclusive: hits "brings"; "cups" sorts
        // after "c" so it is excluded. NOTE(review): "adam" sorts before "b",
        // so it should not match either — the original post claimed it did.
        doSearch(indexSearcher, new TermRangeQuery("content", "b", "c", true, true));
        // PrefixQuery: terms starting with "b" -> hits "brings"
        doSearch(indexSearcher, new PrefixQuery(new Term("content", "b")));
        // WildcardQuery: pattern "c*s" -> hits "cups"
        doSearch(indexSearcher, new WildcardQuery(new Term("content", "c*s")));
        // FuzzyQuery: "caps" is within edit distance of "cups" -> hit
        doSearch(indexSearcher, new FuzzyQuery(new Term("content", "caps")));
        // FuzzyQuery: "csp" is too dissimilar to any indexed term -> no hits
        doSearch(indexSearcher, new FuzzyQuery(new Term("content", "csp")));

        indexSearcher.close();
        indexWriter.close();
    }

    /** Runs the query (top 10) and prints a banner, the hit count, and each hit. */
    private static void doSearch(IndexSearcher indexSearcher, Query query) throws IOException {
        System.out.println("========================================");
        System.out.println("The query is : " + query);
        TopDocs topDocs = indexSearcher.search(query, 10);
        System.out.println(MessageFormat.format("Found {0} matches", topDocs.totalHits));
        System.out.println("They are:");
        printResult(indexSearcher, topDocs);

    }

    /** Prints the stored "content" field of every hit in the result. */
    private static void printResult(IndexSearcher indexSearcher, TopDocs result)
            throws CorruptIndexException, IOException {
        for (ScoreDoc hit : result.scoreDocs) {
            Document matched = indexSearcher.doc(hit.doc);
            System.out.println(matched.get("content"));
        }
    }
}


lucene near-real-time search代码示例

package player.kent.chen.temp.lucene.nrts;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

/**
 * Near-real-time search demo for Lucene 3.0: documents added via an
 * IndexWriter become searchable through {@code writer.getReader()} without
 * an explicit {@code commit()}, and later additions become visible after
 * {@code IndexReader.reopen()}.
 */
public class MyNearRealTimeSearch {

    public static void main(String[] args) throws Exception {

        // Writer over an in-memory index.
        Directory indexDir = new RAMDirectory();
        IndexWriter indexWriter = new IndexWriter(indexDir,
                new StandardAnalyzer(Version.LUCENE_30), IndexWriter.MaxFieldLength.UNLIMITED);

        // Index the first document WITHOUT calling commit().
        String text1 = "I have a dream";
        Document doc1 = new Document();
        doc1.add(new Field("content", text1, Field.Store.YES, Field.Index.ANALYZED));
        indexWriter.addDocument(doc1);

        // Search: the key line — the searcher's reader comes from the writer,
        // so the uncommitted document is already visible.
        IndexSearcher indexSearcher = new IndexSearcher(indexWriter.getReader());
        TopDocs result1 = indexSearcher.search(new TermQuery(new Term("content", "dream")), 1);
        System.out.println(result1.totalHits); // prints 1

        // Add a second document to the index.
        String text2 = " the sons of former slaves and the sons of former slave owners";
        Document doc2 = new Document();
        doc2.add(new Field("content", text2, Field.Store.YES, Field.Index.ANALYZED));
        indexWriter.addDocument(doc2);

        // To see it, refresh the searcher: reopen() returns a new reader
        // instance; the stale one must be closed by the caller.
        IndexReader freshReader = indexSearcher.getIndexReader().reopen();
        indexSearcher.getIndexReader().close();
        indexSearcher = new IndexSearcher(freshReader);

        // Search again — the second document is now visible.
        TopDocs result2 = indexSearcher.search(new TermQuery(new Term("content", "slaves")), 1);
        System.out.println(result2.totalHits); // prints 1

        indexWriter.close();
    }
}

[lucene] QueryParser中的default field是什么意思?

直接上例子

假设已有Index: 对文本文件进行索引,有两个Field, 分别是 文件名(fileName)和文件内容(content)

使用content作为default field:

        QueryParser qp = new QueryParser(Version.LUCENE_30, "content", new StandardAnalyzer(
                Version.LUCENE_30));     

   Query query = qp.parse("人");  //会搜出内容中含有“人”字样的文档
   Query query = qp.parse("fileName:人");  //会搜出文件名中含有“人”字样的文档   

可以看出:

1. 使用content作为default field构建的Parser,仍然可以对其他Field进行搜索

2. 如果在搜索的term里不指定field, 则parser会默认使用content作为目标Field

nginx 后接jetty/tomcat

不需要改jetty/tomcat的配置,只需要修改nginx.conf

引用

http{

      

        server{

                listen 80; 

                server_name www.xxx.com www2.xxx.com;

                location / {

                    proxy_pass              http://localhost:8080;

                    proxy_set_header        X-Real-IP $remote_addr;

                    proxy_set_header        X-Forwarded-For $proxy_add_x_forwarded_for;

                    proxy_set_header        Host $http_host;

                }

        }

}

另外,经过以上设置后,在servlet里拿request.getServerName()和request.getServerPort()会跟浏览器里输入的一致。

想象一下Lucene索引的逻辑结构

 

想象:

 

假设一个文本有以下几部分组成:

 

 

              title:   "Hadoop: The Definitive Guide"

        content:   "Hadoop got its start in Nutch"

unbreakable:    "united kingdom"   (先不要理会unbreakable的意义)

        ignored:    "Nonsense" (注释同上)

 

 

 

如果按下列语句来建索引,索引大概会是什么样?

 

 

            Document doc = new Document();
            doc.add(new Field("ignored", passage.ignored, Field.Store.YES, Field.Index.NO));
            doc.add(new Field("content", passage.content, Field.Store.NO, Field.Index.ANALYZED));
            doc.add(new Field("unbreakable", passage.unbreakable, Field.Store.YES, Index.NOT_ANALYZED));
            doc.add(new Field("title", passage.title, Field.Store.YES, Field.Index.ANALYZED));
            indexWriter.addDocument(doc);

 

 

 

我觉得,索引的逻辑结构可以想象为:

 

 

 

Token Field Stored Text 备注
Hadoop  content
got content
its content
start  content
in content
Nutch content
Hadoop title Hadoop: The Definitive Guide
The  title Hadoop: The Definitive Guide
Definitive  title Hadoop: The Definitive Guide
Guide title Hadoop: The Definitive Guide
united kingdom unbreakable united kingdom "united kingdom"整体作为一个token
Nonsense ignored Nonsense "Nonsense"这个Token未被收录

 

 

验证代码:

package player.kent.chen.temp.lucene.indexcomponent;

import static player.kent.chen.temp.lucene.indexcomponent.MiscIndexCreator.MyPassage.SAMPLE_PASSAGE;

import java.io.File;
import java.io.IOException;

import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds a one-document index exercising the four Store/Index option
 * combinations (stored-not-indexed, indexed-not-stored, indexed-not-analyzed,
 * stored-and-analyzed), to be inspected later by MiscIndexInspector.
 */
public class MiscIndexCreator {

    public static void main(String[] args) throws Exception {
        String rootDir = "/home/kent/diskD/home-kent-dev/workspace/kent-temp/data/lucene/indexOptions";
        File indexDir = new File(rootDir, "index");

        // Start from an empty index directory every run.
        FileUtils.deleteDirectory(indexDir);
        indexDir.mkdirs();

        doIndex(indexDir);

    }

    /** Indexes SAMPLE_PASSAGE into a fresh FSDirectory index at {@code id}. */
    private static void doIndex(File id) throws IOException {
        Directory indexDir = FSDirectory.open(id);
        IndexWriter writer = new IndexWriter(indexDir, new StandardAnalyzer(Version.LUCENE_30),
                true, IndexWriter.MaxFieldLength.UNLIMITED);

        System.out.println("Indexing ... " + SAMPLE_PASSAGE.title);
        Document doc = new Document();
        // ignored: stored but NOT indexed — retrievable, never searchable.
        doc.add(new Field("ignored", SAMPLE_PASSAGE.ignored, Field.Store.YES, Field.Index.NO));
        // content: indexed (analyzed) but NOT stored — searchable, not retrievable.
        doc.add(new Field("content", SAMPLE_PASSAGE.content, Field.Store.NO, Field.Index.ANALYZED));
        // unbreakable: indexed as ONE token (not analyzed) and stored.
        doc.add(new Field("unbreakable", SAMPLE_PASSAGE.unbreakable, Field.Store.YES,
                Index.NOT_ANALYZED));
        // title: analyzed into tokens AND stored in full.
        doc.add(new Field("title", SAMPLE_PASSAGE.title, Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);

        writer.close();

    }

    /** Simple value holder for the sample passage's four text sections. */
    public static final class MyPassage {

        String                        title;
        String                        content;
        String                        unbreakable;
        String                        ignored;

        public static final MyPassage SAMPLE_PASSAGE = MyPassage.newInstance(
                                                             "Hadoop: The Definitive Guide",
                                                             "Hadoop got its start in Nutch",
                                                             "united kingdom", "Nonsense");

        private MyPassage() {
        }

        /** Static factory; the only way to construct a MyPassage. */
        public static final MyPassage newInstance(String title, String content, String unbreakable,
                                                  String ignored) {
            MyPassage built = new MyPassage();
            built.title = title;
            built.content = content;
            built.unbreakable = unbreakable;
            built.ignored = ignored;
            return built;
        }

    }

}

 

 

 

 

 

 

 

 

package player.kent.chen.temp.lucene.indexcomponent;

import java.io.File;

/**
 * Queries the index built by MiscIndexCreator to demonstrate how the
 * Store/Index options chosen at indexing time affect searchability and
 * retrievability of each field.
 */
public class MiscIndexInspector {

    public static void main(String[] args) throws Exception {
        String rootDir = "/home/kent/diskD/home-kent-dev/workspace/kent-temp/data/lucene/indexOptions";
        Directory indexDir = FSDirectory.open(new File(rootDir, "index"));
        IndexSearcher indexSearcher = new IndexSearcher(indexDir);

        // Search the "ignored" field: 0 hits, because it was never indexed.
        TopDocs searchOnIgnored = doSearch(indexSearcher, "ignored", "nonsense");
        println("Num of result matching 'ignored:nonsense' is :" + searchOnIgnored.totalHits);

        // Search the "content" field: it matches, but its stored value is null
        // because the field was indexed without being stored.
        TopDocs searchOnContent = doSearch(indexSearcher, "content", "hadoop");
        println("Result matching 'content:hadoop' is :");
        Document docOfContentMatching = indexSearcher.doc(searchOnContent.scoreDocs[0].doc);
        println("   content : " + docOfContentMatching.get("content")); // null — not stored
        println("   ignored : " + docOfContentMatching.get("ignored")); // non-null — stored though unindexed

        // Search the "unbreakable" field with the full phrase: 1 hit, since the
        // whole string was indexed as a single token.
        TopDocs searchOnUnbreakable = doSearch(indexSearcher, "unbreakable", "united kingdom");
        println("Num of result matching 'unbreakable:united kingdom' is :"
                + searchOnUnbreakable.totalHits);

        // Searching just "kingdom": 0 hits — NOT_ANALYZED means "kingdom"
        // alone is not a token in the index.
        TopDocs searchOnUnbreakable2 = doSearch(indexSearcher, "unbreakable", "kingdom");
        println("Num of result matching 'unbreakable:kingdom' is :"
                + searchOnUnbreakable2.totalHits);

        // Search the "title" field: matches, and the full original title is
        // retrievable because the field was stored.
        TopDocs searchOnTitle = doSearch(indexSearcher, "title", "hadoop");
        println("Result matching 'title:hadoop' is :");
        Document docOfTitleMatching = indexSearcher.doc(searchOnTitle.scoreDocs[0].doc);
        println("   title : " + docOfTitleMatching.get("title"));

        indexSearcher.close();

    }

    /** Shorthand for System.out.println. */
    private static void println(String o) {
        System.out.println(o);
    }

    /** Runs a single-term search on {@code field}, returning at most 2 hits. */
    private static TopDocs doSearch(IndexSearcher indexSearcher, String field, String keyword)
            throws IOException, ParseException {
        return indexSearcher.search(buildQp(field, keyword), 2);
    }

    /** Builds a TermQuery for the given field/keyword pair. */
    private static Query buildQp(String field, String keyword) throws ParseException {
        return new TermQuery(new Term(field, keyword));
    }

}

lucene indexer/searcher简单代码示例

仅供拷贝

<!--pom.xml-->

		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-core</artifactId>
			<version>3.0.0</version>
		</dependency>
 

package player.kent.chen.temp.lucene;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;

import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Indexes every file under {rootDir}/content into a fresh Lucene index at
 * {rootDir}/index, timing the run. Companion to MyLuceneSearcher.
 */
public class MyLuceneIndexer {

    public static void main(String[] args) throws Exception {
        String rootDir = "/home/kent/diskD/home-kent-dev/workspace/kent-temp/data/lucene";

        File contentDir = new File(rootDir, "content");
        File indexDir = new File(rootDir, "index");

        // Rebuild the index from scratch on every run.
        FileUtils.deleteDirectory(indexDir);
        indexDir.mkdirs();

        long begin = now();
        doIndex(contentDir, indexDir);
        // FIX: "miliseconds" -> "milliseconds" (consistent with MyLuceneSearcher).
        System.out.println("Done in milliseconds of : " + (now() - begin));

    }

    /**
     * Indexes each file in directory {@code cd} into a new index at {@code id}.
     *
     * @param cd directory containing the content files to index
     * @param id directory to create the index in
     * @throws IOException if {@code cd} cannot be listed or indexing fails
     */
    private static void doIndex(File cd, File id) throws IOException {
        Directory indexDir = FSDirectory.open(id);
        IndexWriter writer = new IndexWriter(indexDir, new StandardAnalyzer(Version.LUCENE_30),
                true, IndexWriter.MaxFieldLength.UNLIMITED);
        try {
            // listFiles() returns null (not an empty array) when cd is missing
            // or not a directory — fail with a clear message instead of an NPE.
            File[] files = cd.listFiles();
            if (files == null) {
                throw new IOException("Cannot list content directory: " + cd.getAbsolutePath());
            }
            for (File file : files) {
                System.out.println("Indexing ... " + file.getAbsolutePath());
                Document doc = new Document();
                // NOTE(review): FileReader uses the platform default charset —
                // confirm the content files are encoded accordingly.
                doc.add(new Field("contents", new FileReader(file)));
                doc.add(new Field("filepath", file.getAbsolutePath(), Field.Store.YES,
                        Field.Index.NOT_ANALYZED));
                writer.addDocument(doc);
            }
            // (removed a stray writer.numDocs() call whose result was discarded)
        } finally {
            writer.close(); // always release the index write lock
        }

    }

    /** Current wall-clock time in milliseconds. */
    private static long now() {
        return System.currentTimeMillis();
    }

}


package player.kent.chen.temp.lucene;

import java.io.File;
import java.text.MessageFormat;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Searches the index built by MyLuceneIndexer for a fixed keyword in the
 * "contents" field and prints a grep command for each matching file.
 */
public class MyLuceneSearcher {

    public static void main(String[] args) throws Exception {
        String rootDir = "/home/kent/diskD/home-kent-dev/workspace/kent-temp/data/lucene";
        File id = new File(rootDir, "index");

        String keyword = "搜索";

        Directory indexDir = FSDirectory.open(id);
        IndexSearcher is = new IndexSearcher(indexDir);

        // "contents" is the default field: an unqualified keyword searches it.
        QueryParser qp = new QueryParser(Version.LUCENE_30, "contents", new StandardAnalyzer(
                Version.LUCENE_30));
        Query query = qp.parse(keyword);

        // Time just the search call.
        long begin = now();
        TopDocs hits = is.search(query, 10);
        long elapsed = now() - begin;
        System.out.println(MessageFormat.format("Found {0} matches in {1} milliseconds",
                hits.totalHits, elapsed));

        System.out.println("They are:");

        // The index stores only "filepath", so print a shell command the user
        // can run to see matching context instead of the content itself.
        for (ScoreDoc hit : hits.scoreDocs) {
            Document matched = is.doc(hit.doc);
            String file = matched.get("filepath");
            String grepCmd = MessageFormat.format("cat {0} | grep -5 {1}", file, keyword);
            System.out.println("Please do: " + grepCmd);
        }

        is.close();

    }

    /** Current wall-clock time in milliseconds. */
    private static long now() {
        return System.currentTimeMillis();
    }

}