Imagine:
Suppose a passage consists of the following parts:
title: "Hadoop: The Definitive Guide"
content: "Hadoop got its start in Nutch"
unbreakable: "united kingdom" (ignore what "unbreakable" means for now)
ignored: "Nonsense" (same note as above)
If we build an index with the statements below, roughly what will the index look like?
Document doc = new Document();
doc.add(new Field("ignored", passage.ignored, Field.Store.YES, Field.Index.NO));
doc.add(new Field("content", passage.content, Field.Store.NO, Field.Index.ANALYZED));
doc.add(new Field("unbreakable", passage.unbreakable, Field.Store.YES, Index.NOT_ANALYZED));
doc.add(new Field("title", passage.title, Field.Store.YES, Field.Index.ANALYZED));
indexWriter.addDocument(doc);
I think the logical structure of the index can be pictured roughly as:

| Token | Field | Stored Text | Remarks |
|-------|-------|-------------|---------|
| Hadoop | content | | content is Store.NO, so nothing is stored |
| got | content | | |
| its | content | | |
| start | content | | |
| in | content | | |
| Nutch | content | | |
| Hadoop | title | Hadoop: The Definitive Guide | |
| The | title | Hadoop: The Definitive Guide | |
| Definitive | title | Hadoop: The Definitive Guide | |
| Guide | title | Hadoop: The Definitive Guide | |
| united kingdom | unbreakable | united kingdom | "united kingdom" is kept whole as a single token |
| (none) | ignored | Nonsense | "Nonsense" is stored, but produces no token because the field is not indexed |

(Strictly speaking, StandardAnalyzer also lowercases tokens and drops English stop words such as "in" and "The", so the actual terms differ a little; the table keeps the original words for readability.)
Verification code:
package player.kent.chen.temp.lucene.indexcomponent;

import static player.kent.chen.temp.lucene.indexcomponent.MiscIndexCreator.MyPassage.SAMPLE_PASSAGE;

import java.io.File;
import java.io.IOException;

import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class MiscIndexCreator {

    public static void main(String[] args) throws Exception {
        String rootDir = "/home/kent/diskD/home-kent-dev/workspace/kent-temp/data/lucene/indexOptions";
        File indexDir = new File(rootDir, "index");
        FileUtils.deleteDirectory(indexDir);
        indexDir.mkdirs();
        doIndex(indexDir);
    }

    private static void doIndex(File id) throws IOException {
        Directory indexDir = FSDirectory.open(id);
        IndexWriter writer = new IndexWriter(indexDir, new StandardAnalyzer(Version.LUCENE_30),
                true, IndexWriter.MaxFieldLength.UNLIMITED);

        System.out.println("Indexing ... " + SAMPLE_PASSAGE.title);
        Document doc = new Document();
        doc.add(new Field("ignored", SAMPLE_PASSAGE.ignored, Field.Store.YES, Field.Index.NO));
        doc.add(new Field("content", SAMPLE_PASSAGE.content, Field.Store.NO, Field.Index.ANALYZED));
        doc.add(new Field("unbreakable", SAMPLE_PASSAGE.unbreakable, Field.Store.YES,
                Index.NOT_ANALYZED));
        doc.add(new Field("title", SAMPLE_PASSAGE.title, Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);

        writer.close();
    }

    public static final class MyPassage {
        String title;
        String content;
        String unbreakable;
        String ignored;

        public static final MyPassage SAMPLE_PASSAGE = MyPassage.newInstance(
                "Hadoop: The Definitive Guide",
                "Hadoop got its start in Nutch",
                "united kingdom", "Nonsense");

        private MyPassage() {
        }

        public static final MyPassage newInstance(String title, String content, String unbreakable,
                String ignored) {
            MiscIndexCreator.MyPassage instance = new MiscIndexCreator.MyPassage();
            instance.title = title;
            instance.content = content;
            instance.unbreakable = unbreakable;
            instance.ignored = ignored;
            return instance;
        }
    }
}
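To double-check the table above against reality, we can also dump the term dictionary of the index that MiscIndexCreator just wrote. The following is only a rough sketch of my own (the class name MiscTermDumper is made up; it reuses the same index directory), using the plain IndexReader/TermEnum API of Lucene 3.0:

package player.kent.chen.temp.lucene.indexcomponent;

import java.io.File;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.store.FSDirectory;

public class MiscTermDumper {
    public static void main(String[] args) throws Exception {
        String rootDir = "/home/kent/diskD/home-kent-dev/workspace/kent-temp/data/lucene/indexOptions";
        IndexReader reader = IndexReader.open(FSDirectory.open(new File(rootDir, "index")), true); // read-only
        try {
            TermEnum terms = reader.terms(); // walks every <field, term> pair in the index
            while (terms.next()) {
                Term t = terms.term();
                System.out.println(t.field() + " -> " + t.text());
            }
            terms.close();
        } finally {
            reader.close();
        }
    }
}

If the table is right, the dump should show no terms at all for "ignored", exactly one term "united kingdom" for "unbreakable", and lowercase terms (minus stop words) for "content" and "title". The searches below probe the same points through IndexSearcher.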
package player.kent.chen.temp.lucene.indexcomponent;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class MiscIndexInspector {

    public static void main(String[] args) throws Exception {
        String rootDir = "/home/kent/diskD/home-kent-dev/workspace/kent-temp/data/lucene/indexOptions";
        Directory indexDir = FSDirectory.open(new File(rootDir, "index"));
        IndexSearcher indexSearcher = new IndexSearcher(indexDir);

        // Search on the "ignored" field
        TopDocs searchOnIgnored = doSearch(indexSearcher, "ignored", "nonsense");
        println("Num of result matching 'ignored:nonsense' is :" + searchOnIgnored.totalHits); // 0, because the "ignored" field was not indexed

        // Search on the "content" field
        TopDocs searchOnContent = doSearch(indexSearcher, "content", "hadoop");
        println("Result matching 'content:hadoop' is :");
        Document docOfContentMatching = indexSearcher.doc(searchOnContent.scoreDocs[0].doc);
        println(" content : " + docOfContentMatching.get("content")); // null, because the "content" field was not stored in the index
        println(" ignored : " + docOfContentMatching.get("ignored")); // not null, because "ignored" was stored, even though it was not indexed

        // Search on the "unbreakable" field
        TopDocs searchOnUnbreakable = doSearch(indexSearcher, "unbreakable", "united kingdom");
        println("Num of result matching 'unbreakable:united kingdom' is :"
                + searchOnUnbreakable.totalHits); // 1
        TopDocs searchOnUnbreakable2 = doSearch(indexSearcher, "unbreakable", "kingdom");
        println("Num of result matching 'unbreakable:kingdom' is :"
                + searchOnUnbreakable2.totalHits); // 0, because "united kingdom" was not analyzed, so "kingdom" alone is not a token

        // Search on the "title" field
        TopDocs searchOnTitle = doSearch(indexSearcher, "title", "hadoop");
        println("Result matching 'title:hadoop' is :");
        Document docOfTitleMatching = indexSearcher.doc(searchOnTitle.scoreDocs[0].doc);
        println(" title : " + docOfTitleMatching.get("title")); // the whole title "Hadoop: The Definitive Guide", because the title field was stored

        indexSearcher.close();
    }

    private static void println(String o) {
        System.out.println(o);
    }

    private static TopDocs doSearch(IndexSearcher indexSearcher, String field, String keyword)
            throws IOException {
        return indexSearcher.search(buildQp(field, keyword), 2);
    }

    private static Query buildQp(String field, String keyword) {
        Query query = new TermQuery(new Term(field, keyword));
        return query;
    }
}
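As a complement to the "unbreakable" checks above, the analyzed "title" field behaves the other way around: "hadoop" matches, but a TermQuery for the whole string "Hadoop: The Definitive Guide" should match nothing, because an ANALYZED field does not keep the original string as a single term. A small sketch of my own that could be appended to MiscIndexInspector's main method before indexSearcher.close():

// Exact-string TermQuery against the analyzed "title" field:
// expected to be 0, because StandardAnalyzer split the title into tokens,
// so "Hadoop: The Definitive Guide" as a whole is not a term in the index.
TopDocs searchOnWholeTitle = doSearch(indexSearcher, "title", "Hadoop: The Definitive Guide");
println("Num of result matching the whole title string is :" + searchOnWholeTitle.totalHits);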