Lucene索引:加权操作、对日期和数字进行索引、IndexReader的设计

jopen 9年前

    package com.dhb.index;                import java.io.File;        import java.io.IOException;        import java.text.ParseException;        import java.text.SimpleDateFormat;        import java.util.Date;        import java.util.HashMap;        import java.util.Map;                import org.apache.lucene.analysis.standard.StandardAnalyzer;        import org.apache.lucene.document.Document;        import org.apache.lucene.document.Field;        import org.apache.lucene.document.NumericField;        import org.apache.lucene.index.CorruptIndexException;        import org.apache.lucene.index.IndexReader;        import org.apache.lucene.index.IndexWriter;        import org.apache.lucene.index.IndexWriterConfig;        import org.apache.lucene.index.IndexReader.FieldOption;        import org.apache.lucene.index.Term;        import org.apache.lucene.search.IndexSearcher;        import org.apache.lucene.search.ScoreDoc;        import org.apache.lucene.search.TermQuery;        import org.apache.lucene.search.TopDocs;        import org.apache.lucene.store.Directory;        import org.apache.lucene.store.FSDirectory;        import org.apache.lucene.store.LockObtainFailedException;        import org.apache.lucene.util.Version;        import org.junit.Before;        import org.junit.Test;                public class IndexUtil {            private String[] ids = {"1","2","3","4","5","6"};            private String[] emails = {"aa@csdn.org","bb@csdn.org","cc@sina.org","dd@sina.org",                    "ee@qq.com","ff@qq.com"};            private String[] contents = {"Welcome to my office ,I like surfing internet.",                                          "hello boys like haha",                                          "hello girls we like each other.",                                          "I like football,you like too.",                                          "I like basketball very much, how about you?",                                          "bye-bye see you I 
don't like."};            private int[] attachment ={2,3,1,4,5,5};            private String[] names = {"Victor","Nancy","Kitty","Cindy","Tom","Tony"};             private Map<String, Float> scores = new HashMap<String, Float>();            private Date[] dates = null;            private static IndexReader reader = null;                        private Directory directory = null;            @Before            public void IndexUtilBefore() {                try {                    setDates();                    scores.put("qq.com", 2.0f);                    scores.put("sina.org", 1.5f);                    directory = FSDirectory.open(new File("D:/luceneData/index02"));                                        reader = IndexReader.open(directory, false);                } catch (IOException e) {                    e.printStackTrace();                }            }            public IndexSearcher getSearcher() {                try {                    if(reader==null) {                        reader = IndexReader.open(directory);                    } else {                        IndexReader tr = IndexReader.openIfChanged(reader);                        if(tr!=null) {                            reader.close();           //关闭原来的reader                            reader = tr;                            }                    }                    return new IndexSearcher(reader);                } catch (CorruptIndexException e) {                    e.printStackTrace();                } catch (IOException e) {                    e.printStackTrace();                }                return null;            }            private void setDates() {                SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");                dates = new Date[ids.length];                try {                    dates[0] = sdf.parse("2010-02-19");                    dates[1] = sdf.parse("2012-01-11");                    dates[2] = sdf.parse("2011-09-19");                    dates[3] = 
sdf.parse("2010-12-22");                    dates[4] = sdf.parse("2012-01-01");                    dates[5] = sdf.parse("2011-05-19");                } catch (ParseException e) {                    e.printStackTrace();                }            }            @Test            public void index() {                IndexWriter writer = null;                IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_35,                         new StandardAnalyzer(Version.LUCENE_35));                try {                    writer = new IndexWriter(directory, iwc);                    //清空所有索引                    writer.deleteAll();                    Document doc = null;                    for (int i = 0;i < ids.length; i++) {                        /**                        * Field.Store.YES或者NO(存储域选项)                        * 1、设置为YES表示把这个域中的内容完全存储到文件中,方便进行文本的还原                        * 2、设置为NO表示把这个域中的内容不存储到文件中,但是可以被索引,此时内容无法还原(doc.get)                        */                        /**                        * 使用Field.Index.*来进行操作                        * Index.ANALYZED:进行分词和索引,适用于标题和内容                        * Index.NOT_ANALYZED:进行索引,但不进行分词,如身份证号码,姓名,ID等,适用于精确搜索                        * Index.ANALYZED_NO_NORMS进行分词但是不存储norms信息,这个norms中包含了创建索引的时间和权值等信息                        * Index.NOT_ANALYZED_NO_NORMS即不进行分词也不存储norms信息                        * Index.NO不进行索引                        */                        /**                        * NOT_ANALYZED_NO_NORMS         YES    标示符(主键,文件名),电话号码,身份证号,姓名,日期                        * ANALYZED                      YES    文档标题和摘要                        * ANALYZED                      NO     文档正文                        * NO                            YES    文档类型,数据库主键(不进行索引)                        * NOT_ANALYZED                  NO     隐藏关键字                        */                        doc = new Document();                        doc.add(new Field("id",ids[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));             
           doc.add(new Field("email",emails[i],Field.Store.YES,Field.Index.NOT_ANALYZED));                        doc.add(new Field("content",contents[i],Field.Store.NO,Field.Index.ANALYZED));                        doc.add(new Field("name",names[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));                        /**                        * 加权操作                        */                        String et = emails[i].substring(emails[i].indexOf("@")+1);                        System.out.println(et);                        if(scores.containsKey(et)) {                            doc.setBoost(scores.get(et));                        } else {                            doc.setBoost(0.5f);                        }                        //对数字的操作,存储数字                        doc.add(new NumericField("attachment", Field.Store.YES, true).setIntValue(attachment[i]));                        //对日期的操作,存储日期                        doc.add(new NumericField("dates", Field.Store.YES, true).setLongValue(dates[i].getTime()));                        writer.addDocument(doc);                                            }                } catch (CorruptIndexException e) {                    e.printStackTrace();                } catch (LockObtainFailedException e) {                    e.printStackTrace();                } catch (IOException e) {                    e.printStackTrace();                } finally {                    if(writer!=null)                        try {                            writer.close();                        } catch (CorruptIndexException e) {                            e.printStackTrace();                        } catch (IOException e) {                            e.printStackTrace();                        }                }            }            @Test            public void query() {                try {                    IndexReader reader = IndexReader.open(directory);                    //通过reader可以有效地获取文档的数量                    
System.out.println("numDocs:"+reader.numDocs());                    System.out.println("maxDocs:"+reader.maxDoc());                    //通过reader可以有效地获取删除的文档的数量                    System.out.println("numDeletedDocs:"+reader.numDeletedDocs());                    reader.close();                } catch (CorruptIndexException e) {                    e.printStackTrace();                } catch (IOException e) {                    e.printStackTrace();                }               }            @Test            /**            * 删除            */            public void delete() {                IndexWriter writer = null;                try {                    writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35,                             new StandardAnalyzer(Version.LUCENE_35)));                    //参数是一个选项,可以是一个query;也可以是一个term,term是一个精确查找的值                    //此时删除的文档并不会被完全删除,而是存储在一个回收站中的,是可以恢复的                    writer.deleteDocuments(new Term("id", "1"));                    writer.commit();                } catch (CorruptIndexException e) {                    e.printStackTrace();                } catch (LockObtainFailedException e) {                    e.printStackTrace();                } catch (IOException e) {                    e.printStackTrace();                } finally {                    /*if(writer!=null)                       try {                           writer.close();                       } catch (CorruptIndexException e) {                           e.printStackTrace();                       } catch (IOException e) {                           e.printStackTrace();                       } */                }            }            @Test            public void delete2() {                try {                    reader.deleteDocuments(new Term("id", "1"));                    reader.close();                } catch (CorruptIndexException e) {                    e.printStackTrace();                } catch 
(LockObtainFailedException e) {                    e.printStackTrace();                } catch (IOException e) {                    e.printStackTrace();                }            }            @Test            /**            * 强制删除            * 在lucene3.5之前都是使用optimize()进行处理,但是这个操作消耗资源,已经被弃用            */            public void forceDelete() {                //删除优化,删除回收站文件                IndexWriter writer = null;                try {                    writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35,                             new StandardAnalyzer(Version.LUCENE_35)));                    writer.forceMergeDeletes();                } catch (CorruptIndexException e) {                    e.printStackTrace();                } catch (LockObtainFailedException e) {                    e.printStackTrace();                } catch (IOException e) {                    e.printStackTrace();                } finally {                    if(writer!=null)                        try {                            writer.close();                        } catch (CorruptIndexException e) {                            e.printStackTrace();                        } catch (IOException e) {                            e.printStackTrace();                        }                 }            }            @Test            /**            * 恢复删除            */            public void unDelete() {                //使用indexReader进行恢复                try {                    IndexReader reader = IndexReader.open(directory, false);                    //恢复时必须把IndexReader的只读(readOnly)设置为false                    reader.undeleteAll();                    reader.close();                } catch (CorruptIndexException e) {                    e.printStackTrace();                } catch (IOException e) {                    e.printStackTrace();                }            }            @Test            public void search() {                try {                    IndexReader reader = 
IndexReader.open(directory);                    IndexSearcher searcher = new IndexSearcher(reader);                    TermQuery query = new TermQuery(new Term("content", "like"));                    TopDocs tds = searcher.search(query, 10);                    for(ScoreDoc sd : tds.scoreDocs) {                        Document d = searcher.doc(sd.doc);                        System.out.println("("+sd.doc+") "+"--权值:"+d.getBoost()+"--分数:"+sd.score+                                d.get("name")+"["+d.get("email")+"] "+d.get("id")+",附件:"                                +d.get("attachment")+",日期:"+d.get("dates"));                    }                    reader.close();                } catch (CorruptIndexException e) {                    e.printStackTrace();                } catch (IOException e) {                    e.printStackTrace();                }            }            @Test            public void search2() {                try {                    //IndexReader reader = IndexReader.open(directory);                    //IndexSearcher searcher = new IndexSearcher(reader);                    //方式二:                    IndexSearcher searcher = getSearcher();                                        TermQuery query = new TermQuery(new Term("content", "like"));                    TopDocs tds = searcher.search(query, 10);                    for(ScoreDoc sd : tds.scoreDocs) {                        Document d = searcher.doc(sd.doc);                        System.out.println("("+sd.doc+") "+"--权值:"+d.getBoost()+"--分数:"+sd.score+                                d.get("name")+"["+d.get("email")+"] "+d.get("id")+",附件:"                                +d.get("attachment")+",日期:"+d.get("dates"));                    }                    searcher.close();                } catch (CorruptIndexException e) {                    e.printStackTrace();                } catch (IOException e) {                    e.printStackTrace();                }            }            @Test            
public void search3() {                for (int i = 0; i < 5; i++) {                    search2();                    System.out.println("------------------");                    try {                        Thread.sleep(10000);                    } catch (InterruptedException e) {                        e.printStackTrace();                    }                }            }        }