【Lucene3.0 初窥】Lucene体系结构概述

Lucene 的基本原理与《全文检索的基本原理》是差不多的。

Lucene 的源码主要有 7 个子包，每个包完成特定的功能：

包名	功能描述
org.apache.lucene.analysis	语言分析器，主要用于切词（分词）；要支持中文，主要是扩展此包中的类
org.apache.lucene.document	索引存储时的文档结构管理，类似于关系型数据库的表结构
org.apache.lucene.index	索引管理，包括索引建立、删除等
org.apache.lucene.queryParser	查询分析器，实现查询关键词间的运算，如与、或、非等
org.apache.lucene.search	检索管理，根据查询条件，检索得到结果
org.apache.lucene.store	数据存储管理，主要包括一些底层的 I/O 操作
org.apache.lucene.util	一些公用类

另外：Lucene 3.0 还有一个 org.apache.lucene.messages 包，这个包提供了本地语言支持（NLS），用于软件系统的国际化。

从上面的图可以很明显地看出 Lucene 的两大主要功能：建立索引（红色箭头：Index）和检索索引（蓝色箭头：Search）。

analysis 模块主要负责词法分析及语言处理而形成 Term( 词 ) 。具体参见文章《 Lucene分析器—Analyzer 》
index 模块主要负责索引的创建，里面有 IndexWriter 。
store 模块主要负责索引的读写。
queryParser 主要负责语法分析。
search 模块主要负责对索引的搜索 ( 其中 similarity 就是相关性打分 ) 。

讲到这里基本上对全文检索工具包Lucene的原理和结构已经有了大致的了解了，下面给出Lucene3.0.1建立索引和检索索引的基本代码，关于Lucene的细节探讨将在后续文章中展开。

    import java.io.File;  
import java.io.FileReader;  
import java.io.IOException;  
  
import org.apache.lucene.analysis.standard.StandardAnalyzer;  
import org.apache.lucene.document.DateTools;  
import org.apache.lucene.document.Document;  
import org.apache.lucene.document.Field;  
import org.apache.lucene.index.IndexWriter;  
import org.apache.lucene.store.FSDirectory;  
import org.apache.lucene.util.Version;  

public class IndexFiles {
   // 主要代码 索引docDir文件夹下文档，索引文件在INDEX_DIR文件夹中  
   public static void main(String[] args) {  
		
	File indexDir=new File("e:\\实验\\index");
	File docDir = new File("e:\\实验\\content"); 
	    
	try {  
               //索引器
      	       IndexWriter standardWriter = new IndexWriter(FSDirectory.open(indexDir), new StandardAnalyzer(Version.LUCENE_CURRENT), true, IndexWriter.MaxFieldLength.LIMITED);            
               //不建立复合式索引文件，默认的情况下是复合式的索引文件
               standardWriter.setUseCompoundFile(false);
	       String[] files = docDir.list(); 
	       for (String fileStr : files) {  
	           File file = new File(docDir, fileStr);  
	           if (!file.isDirectory()) {         	
	              Document doc = new Document();  
	              //文件名称，可查询，不分词
	              String fileName=file.getName().substring(0,file.getName().indexOf("."));
	              doc.add(new Field("name",fileName, Field.Store.YES, Field.Index.NOT_ANALYZED));    	    
	              //文件路径，可查询，不分词
	              String filePath=file.getPath();
	              doc.add(new Field("path", filePath, Field.Store.YES, Field.Index.NOT_ANALYZED));   
	              //文件内容，需要检索
	              doc.add(new Field("content", new FileReader(file)));  
	              standardWriter.addDocument(doc);  
	           }  
	       }  
	       standardWriter.optimize();
               //关闭索引器
                standardWriter.close();  
	 } catch (IOException e) {  
	       System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());  
         }  
     }   
}

    import java.io.BufferedReader;  
import java.io.File;  
import java.io.IOException;  
import java.io.InputStreamReader;  
  
import org.apache.lucene.analysis.Analyzer;  
import org.apache.lucene.analysis.standard.StandardAnalyzer;  
import org.apache.lucene.document.Document;  
import org.apache.lucene.index.IndexReader;  
import org.apache.lucene.queryParser.QueryParser;  
import org.apache.lucene.search.IndexSearcher;  
import org.apache.lucene.search.Query;  
import org.apache.lucene.search.ScoreDoc;  
import org.apache.lucene.search.Searcher;  
import org.apache.lucene.search.TopScoreDocCollector;  
import org.apache.lucene.store.FSDirectory;  
import org.apache.lucene.util.Version;  
/**
 * Interactive command-line demo that searches the {@code content} field of an
 * existing Lucene index and prints the {@code path} field of each hit.
 */
public class SearchFiles {

    /**
     * Reads queries from standard input, one per line, and prints the best hits for
     * each. A blank line or end of input ends the session.
     *
     * @param args ignored; the index location is hard-coded
     * @throws Exception if the index cannot be opened or read, or if a query fails
     *         to parse
     */
    public static void main(String[] args) throws Exception {

        String index = "E:\\实验\\index";
        String field = "content";
        boolean raw = false;
        // Number of hits to display per query.
        int hitsPerPage = 10;

        // Searching only, so the reader is opened read-only (second argument true).
        IndexReader reader = IndexReader.open(FSDirectory.open(new File(index)), true);
        try {
            Searcher searcher = new IndexSearcher(reader);
            Analyzer standardAnalyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);

            BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
            QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, field, standardAnalyzer);
            while (true) {
                // Prompt the user.
                System.out.println("Enter query: ");

                String line = in.readLine();
                if (line == null) {
                    break; // end of input
                }

                line = line.trim();
                if (line.length() == 0) {
                    break; // blank line ends the session
                }

                Query query = parser.parse(line);
                System.out.println("Searching for: " + query.toString(field));

                doPagingSearch(in, searcher, query, hitsPerPage, raw, true);
            }
        } finally {
            // Release the index even when parsing or searching throws.
            reader.close();
        }
    }

    /**
     * Runs {@code query} and prints the total number of matches followed by the
     * {@code path} field of up to {@code hitsPerPage} top-scoring documents.
     *
     * @param in accepted for signature compatibility; not read here
     * @param searcher the searcher to run the query against
     * @param query the query to execute
     * @param hitsPerPage maximum number of hits to collect and print
     * @param raw accepted for signature compatibility; not used here
     * @param interactive accepted for signature compatibility; not used here
     * @throws IOException if searching or loading a document fails
     */
    public static void doPagingSearch(BufferedReader in, Searcher searcher, Query query, int hitsPerPage, boolean raw,
            boolean interactive) throws IOException {

        TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, false);
        searcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;

        int numTotalHits = collector.getTotalHits();
        System.out.println(numTotalHits + " total matching documents");

        int end = Math.min(hits.length, hitsPerPage);
        for (int i = 0; i < end; i++) {
            Document doc = searcher.doc(hits[i].doc);
            String path = doc.get("path");
            if (path != null) {
                System.out.println((i + 1) + ". " + path);
            } else {
                System.out.println((i + 1) + ". " + "No path for this document");
            }
        }
    }
}

【Lucene3.0 初窥】Lucene体系结构概述

更多文章、技术交流、商务合作、联系博主

微信扫码或搜索：z360901061

微信扫一扫加我为好友

QQ号联系： 360901061

您的支持是博主写作最大的动力，如果您喜欢我的文章，感觉我的文章对您有帮助，请用微信扫描下面二维码支持博主2元、5元、10元、20元等您想捐的金额吧，狠狠点击下面给点支持吧，站长非常感激您！手机微信长按不能支付解决办法：请将微信支付二维码保存到相册，切换到微信，然后点击微信右上角扫一扫功能，选择支付二维码完成支付。

【本文对您有帮助就好】元

2元

5元

10元

20元

自定义