Lucene 搜索实例解析

jopen 9年前

1、搜索的简单实现(TermQuery)

         1.1 创建 IndexReader

         1.2 创建 IndexSearcher

         1.3 创建Term和TermQuery

         1.4 通过IndexSearcher执行TermQuery获取TopDocs

         1.5 根据TopDocs获取ScoreDoc

         1.6 根据ScoreDoc获取相应的文档

    2、其他搜索Query

         2.1TermRangeQuery

         2.2NumericRangeQuery

         2.3PrefixQuery

         2.4WildcardQuery

         2.5BooleanQuery

         2.6PhraseQuery

         2.7FuzzyQuery

   3、使用QueryParser

这是search类:

    package com.dhb.search;

    import java.io.IOException;
    import java.text.ParseException;
    import java.text.SimpleDateFormat;
    import java.util.Date;
    import java.util.HashMap;
    import java.util.Map;

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.NumericField;
    import org.apache.lucene.index.CorruptIndexException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.BooleanClause.Occur;
    import org.apache.lucene.search.BooleanQuery;
    import org.apache.lucene.search.FuzzyQuery;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.NumericRangeQuery;
    import org.apache.lucene.search.PhraseQuery;
    import org.apache.lucene.search.PrefixQuery;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.search.TermRangeQuery;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.search.WildcardQuery;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.LockObtainFailedException;
    import org.apache.lucene.store.RAMDirectory;
    import org.apache.lucene.util.Version;

    /**
     * Demonstrates the basic Lucene 3.5 query types against a small in-memory index.
     *
     * <p>The constructor builds a {@link RAMDirectory} index of six sample documents
     * (id, email, content, name, attachment count, date). Each {@code searchByXxx}
     * method builds one kind of {@link Query}, runs it and prints the hits to
     * standard output.
     */
    public class SearcherUtil {
        private String[] ids = {"1", "2", "3", "4", "5", "6"};
        private String[] emails = {"aa@csdn.org", "bb@csdn.org", "cc@sina.org", "dd@sina.org",
                "ee@qq.com", "ff@qq.com"};
        private String[] contents = {"Welcome to my office ,I like surfing internet.",
                "hello boys like haha",
                "hello girls we like each other.",
                "I like football,you like too.",
                "I like basketball very much, how about you?",
                "bye-bye see you I don't like."};
        private int[] attachment = {2, 3, 1, 4, 5, 5};
        private String[] names = {"Victor", "Nancy", "Kitty", "Cindy", "Tom", "Tony"};
        /** Document boost per e-mail domain; domains not listed here get 0.5. */
        private Map<String, Float> scores = new HashMap<String, Float>();
        private Date[] dates = null;

        private Directory directory;
        /** Shared reader, opened lazily and refreshed by {@link #getSearcher()}. */
        private IndexReader reader;

        /** Prepares the sample data and builds the in-memory index. */
        public SearcherUtil() {
            setDates();
            scores.put("qq.com", 2.0f);
            scores.put("sina.org", 1.5f);
            directory = new RAMDirectory();
            index();
        }

        /**
         * Parses the built-in sample dates, one per document.
         *
         * @throws IllegalStateException if a sample date cannot be parsed; failing here
         *         avoids a later, harder to diagnose NullPointerException in {@link #index()}
         */
        private void setDates() {
            String[] sampleDates = {"2010-02-19", "2012-01-11", "2011-09-19",
                    "2010-12-22", "2012-01-01", "2011-05-19"};
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
            dates = new Date[ids.length];
            try {
                for (int i = 0; i < sampleDates.length; i++) {
                    dates[i] = sdf.parse(sampleDates[i]);
                }
            } catch (ParseException e) {
                throw new IllegalStateException("Invalid built-in sample date", e);
            }
        }

        /**
         * (Re)builds the index: deletes every existing document and adds the six
         * sample documents. I/O failures are printed and otherwise ignored.
         */
        public void index() {
            IndexWriter writer = null;
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_35,
                    new StandardAnalyzer(Version.LUCENE_35));
            try {
                writer = new IndexWriter(directory, iwc);
                // Clear the whole index before re-adding the samples.
                writer.deleteAll();
                for (int i = 0; i < ids.length; i++) {
                    Document doc = new Document();
                    doc.add(new Field("id", ids[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
                    doc.add(new Field("email", emails[i], Field.Store.YES, Field.Index.NOT_ANALYZED));
                    doc.add(new Field("content", contents[i], Field.Store.NO, Field.Index.ANALYZED));
                    doc.add(new Field("name", names[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));

                    // Boost the document according to its e-mail domain.
                    String et = emails[i].substring(emails[i].indexOf("@") + 1);
                    if (scores.containsKey(et)) {
                        doc.setBoost(scores.get(et));
                    } else {
                        doc.setBoost(0.5f);
                    }

                    // Numeric field: attachment count, stored and indexed as an int.
                    doc.add(new NumericField("attachment", Field.Store.YES, true).setIntValue(attachment[i]));
                    // Numeric field: date, stored and indexed as epoch milliseconds.
                    doc.add(new NumericField("dates", Field.Store.YES, true).setLongValue(dates[i].getTime()));
                    writer.addDocument(doc);
                }
            } catch (IOException e) {
                // Also covers CorruptIndexException and LockObtainFailedException,
                // which are IOException subclasses and were handled identically.
                e.printStackTrace();
            } finally {
                if (writer != null) {
                    try {
                        writer.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }

        /**
         * Returns a searcher over the current state of the index.
         *
         * <p>The underlying reader is opened on first use; on later calls it is
         * reopened (and the old one closed) only if the index has changed.
         *
         * @return a new searcher, or {@code null} if the reader could not be opened
         *         (the exception is printed)
         */
        public IndexSearcher getSearcher() {
            try {
                if (reader == null) {
                    reader = IndexReader.open(directory);
                } else {
                    IndexReader tr = IndexReader.openIfChanged(reader);
                    if (tr != null) {
                        reader.close();   // release the stale reader
                        reader = tr;
                    }
                }
                return new IndexSearcher(reader);
            } catch (IOException e) {
                e.printStackTrace();
            }
            return null;
        }

        /**
         * Exact-term query.
         *
         * @param fld field to search
         * @param txt exact term to match
         * @param num maximum number of hits to print
         */
        public void searchByTerm(String fld, String txt, int num) {
            executeAndPrint(new TermQuery(new Term(fld, txt)), num);
        }

        /**
         * Term (string) range query; both bounds are inclusive.
         *
         * @param fld   field to search
         * @param start lower bound term
         * @param end   upper bound term
         * @param num   maximum number of hits to print
         */
        public void searchByTermRange(String fld, String start, String end, int num) {
            executeAndPrint(new TermRangeQuery(fld, start, end, true, true), num);
        }

        /**
         * Integer range query; both bounds are inclusive.
         *
         * @param fld   numeric field to search
         * @param start lower bound
         * @param end   upper bound
         * @param num   maximum number of hits to print
         */
        public void searchByNumericRange(String fld, int start, int end, int num) {
            executeAndPrint(NumericRangeQuery.newIntRange(fld, start, end, true, true), num);
        }

        /**
         * Prefix query.
         *
         * @param fld   field to search
         * @param value prefix the term must start with
         * @param num   maximum number of hits to print
         */
        public void searchByPrefix(String fld, String value, int num) {
            executeAndPrint(new PrefixQuery(new Term(fld, value)), num);
        }

        /**
         * Wildcard query.
         *
         * @param fld   field to search
         * @param value wildcard pattern
         * @param num   maximum number of hits to print
         */
        public void searchByWildcard(String fld, String value, int num) {
            executeAndPrint(new WildcardQuery(new Term(fld, value)), num);
        }

        /**
         * Boolean query combining several sub-queries: name must be "Victor" AND
         * content must contain "like".
         *
         * @param num maximum number of hits to print
         */
        public void searchByBoolean(int num) {
            BooleanQuery query = new BooleanQuery();
            // Occur.MUST     - the clause must match
            // Occur.SHOULD   - the clause may match
            // Occur.MUST_NOT - the clause must not match
            query.add(new TermQuery(new Term("name", "Victor")), Occur.MUST);
            query.add(new TermQuery(new Term("content", "like")), Occur.MUST);
            executeAndPrint(query, num);
        }

        /**
         * Phrase query: "hello" followed by "like" in the content field, with a slop of 1.
         *
         * @param num maximum number of hits to print
         */
        public void searchByPhrase(int num) {
            PhraseQuery query = new PhraseQuery();
            query.setSlop(1);
            // Phrase terms must be lowercase here (the content field is analyzed).
            query.add(new Term("content", "hello"));   // first term
            query.add(new Term("content", "like"));    // second term, after the gap
            executeAndPrint(query, num);
        }

        /**
         * Fuzzy query on the name field for terms similar to "Tiny".
         *
         * @param num maximum number of hits to print
         */
        public void searchByFuzzy(int num) {
            executeAndPrint(new FuzzyQuery(new Term("name", "Tiny")), num);
        }

        /**
         * Runs an already-built query, typically one produced by a QueryParser.
         *
         * @param query query to execute
         * @param num   maximum number of hits to print
         */
        public void searchByQueryParse(Query query, int num) {
            executeAndPrint(query, num);
        }

        /**
         * Runs {@code query}, prints the total hit count and one line per returned
         * hit, and always closes the searcher afterwards.
         *
         * <p>The total printed is the number of matching documents and is independent
         * of {@code num}. NOTE: the printed boost comes from {@code doc.getBoost()}
         * on a document loaded from the index; per the Lucene 3.x API docs the
         * index-time boost is not stored, so this is the default value rather than
         * the boost set in {@link #index()}.
         */
        private void executeAndPrint(Query query, int num) {
            IndexSearcher searcher = getSearcher();
            if (searcher == null) {
                // getSearcher() has already printed the cause; nothing to search.
                return;
            }
            try {
                TopDocs tds = searcher.search(query, num);
                System.out.println("一共查询了:" + tds.totalHits);
                for (ScoreDoc sd : tds.scoreDocs) {
                    Document doc = searcher.doc(sd.doc);
                    System.out.println("(" + sd.doc + ") " + "--权值:" + doc.getBoost() + "--分数:" + sd.score
                            + doc.get("name") + "[" + doc.get("email") + "] " + doc.get("id") + ",附件:"
                            + doc.get("attachment") + ",日期:" + doc.get("dates"));
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try {
                    searcher.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

这是测试类:
    package com.dhb.test;                import org.apache.lucene.analysis.standard.StandardAnalyzer;        import org.apache.lucene.queryParser.ParseException;        import org.apache.lucene.queryParser.QueryParser;        import org.apache.lucene.queryParser.QueryParser.Operator;        import org.apache.lucene.search.Query;        import org.apache.lucene.util.Version;        import org.junit.Before;        import org.junit.Test;                import com.dhb.search.SearcherUtil;                public class TestSearch {            private SearcherUtil su;                    @Before            public void init() {                su = new SearcherUtil();            }                    @Test            public void searchByTerm() {                su.searchByTerm("content", "like", 3);                /**                * 一共查询了:6 (5) --权值:1.0--分数:0.634387Tony[ff@qq.com]                * 6,附件:5,日期:1305734400000 (3) --权值:1.0--分数:0.5981058Cindy[dd@sina.org]                * 4,附件:4,日期:1292947200000 (4) --权值:1.0--分数:0.5286558Tom[ee@qq.com]                * 5,附件:5,日期:1325347200000                */            }                    @Test            public void searchByTermRange() {                // su.searchByTermRange("id", "1", "3", 10);                // 查询name以a开头和s结尾的                // su.searchByTermRange("name", "A", "S", 10);                // 由于attachment是数字类型,使用TermRange无法查询                su.searchByTermRange("attachment", "2", "10", 10);            }                    @Test            public void searchByNumericRange() {                su.searchByNumericRange("attachment", 2, 10, 10);            }                    @Test            public void searchByPrefix() {                // su.searchByPrefix("name", "T", 10);                // 把content里面的单词,一个一个拆分                su.searchByPrefix("content", "b", 10);            }                    @Test            public void searchByWildcard() {                // *表示可以匹配任意多个字符,?表示可以匹配单个字符                // 
su.searchByWildcard("email", "*@sina.org", 10);                su.searchByWildcard("name", "T??", 10);            }                    @Test            public void searchByBoolean() {                su.searchByBoolean(10);            }                    @Test            public void searchByPhrase() {                su.searchByPhrase(10);            }                    @Test            public void searchByFuzzy() {                su.searchByFuzzy(10);            }                    @Test            public void searchByQueryParse() throws ParseException {                // 创建QueryParser对象,默认搜索域为content                QueryParser parser = new QueryParser(Version.LUCENE_35, "content",                        new StandardAnalyzer(Version.LUCENE_35));                        // 改变空格的默认操作符,以下可以改为AND                //parser.setDefaultOperator(Operator.AND);                        // 开启第一个字符的通配查询,默认是关闭的,太消耗性能                parser.setAllowLeadingWildcard(true);                        // 搜索包含content中包含like的                Query query = parser.parse("like");                        // 查询content中,basketball 和 basketball 的文档                query = parser.parse("basketball football");                        // 改变搜索域                query = parser.parse("name:Tom");                        // 用*或者? 
通配符匹配                query = parser.parse("name:V*");                        // 通配符默认不能放在首位                query = parser.parse("email:*@qq.com");                        // 查询 名字中没mike,content中like的。 +和- 必须要放在前面                query = parser.parse("- name:Tom + like");                        // 匹配一个区间,TO必须大写                query = parser.parse("id:[1 TO 6]");                        // 开区间                query = parser.parse("id:{1 TO 3}");                        /**                * 查询字符串中空格的,加上“” 完全匹配                */                query = parser.parse("\"I like football\"");                        // 查询I和football中一个字符距离的                query = parser.parse("\"I football\"~1");                        // 没办法匹配数字范围,必须要自定义                query = parser.parse("attach:[2 TO 10]");                // 模糊匹配                query = parser.parse("name:Tim~");                        su.searchByQueryParse(query, 10);            }        }