
Lucene 3.0 study notes (3): integrating with Paoding

It turns out the latest Lucene release is incompatible with Paoding 2.0, so I was forced to fall back to Lucene 2.9 and continue from there.


Add the jars required for the integration to the project (they are included in the attachment).
Set the environment variable PAODING_DIC_HOME to E:\paoding-analysis\dic (the dic directory is where paoding-analysis-2.0.4-beta.zip was unpacked).


In paoding-dic-home.properties, add:
#paoding.dic.home=dic
paoding.dic.home=E:/paoding-analysis/dic



And in paoding-knives.properties:

paoding.knife.class.letterKnife=net.paoding.analysis.knife.LetterKnife
paoding.knife.class.numberKnife=net.paoding.analysis.knife.NumberKnife
paoding.knife.class.cjkKnife=net.paoding.analysis.knife.CJKKnife


Adding these two files to the project's src root directory is all that is needed.
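
To sanity-check the setup before building any index, here is a minimal smoke test (my own sketch, not part of the original example; it assumes the Paoding and Lucene 2.9 jars are on the classpath and paoding.dic.home points at a valid dictionary directory). It prints the tokens Paoding produces for a sample sentence:

import java.io.StringReader;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public class PaodingSmokeTest {
	public static void main(String[] args) throws Exception {
		// PaodingAnalyzer resolves paoding.dic.home from the properties file or environment
		PaodingAnalyzer analyzer = new PaodingAnalyzer();
		TokenStream ts = analyzer.tokenStream("content",
				new StringReader("中华人民共和国位于亚洲东部"));
		// Lucene 2.9 still supports the old Token-based API that Paoding implements
		Token token;
		while ((token = ts.next()) != null) {
			System.out.println(token.term());
		}
	}
}

If this prints a list of words instead of throwing a dictionary-related exception, the configuration is in place.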



A complete example follows:
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

@SuppressWarnings("deprecation")
public class LuceneChinese {
	// Directory containing the data files
	private static final String DATA_DIR = "E:\\test\\file";
	// Directory where the index is stored
	private static final String INDEX_DIR = "E:\\test\\index";
	// Name of the indexed content field
	private static final String FIELD_NAME = "content";
	public static void main(String[] args) throws Exception {
		createIndex();
		search("");
	}
	/**
	 * Create the index.
	 */
	public static void createIndex() {
		System.out.println("-------------------建立索引开始-----------------------");
		long timeStart = System.currentTimeMillis();
		try {
			// PaodingChineseAnalyzer extends PaodingAnalyzer, overriding tokenStream() to add stop-word and lowercase filtering
			Analyzer analyzer = new PaodingChineseAnalyzer(new File(
					"E:\\stopwords.txt"));
			IndexWriter writer = new IndexWriter(FSDirectory.open(new File(
					INDEX_DIR)), analyzer, true,
					IndexWriter.MaxFieldLength.LIMITED);
			// Index every file under the data directory
			indexDoc(writer, new File(DATA_DIR));
			// Optimize the index to speed up searches
			writer.optimize();
			writer.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
		long timeEnd = System.currentTimeMillis();
		System.out.println("-------------------建立索引耗时: "
				+ (timeEnd - timeStart) + " 毫秒-----------------------");
	}
	/**
	 * Search the index.
	 * 
	 * @param queryString the keyword; when empty, prompts on stdin
	 * @throws IOException
	 * @throws ParseException
	 */
	public static void search(String queryString) throws IOException,
			ParseException {
		// Prompt for a keyword when none was supplied (== "" compares references; use isEmpty() instead)
		if (queryString == null || queryString.trim().isEmpty()) {
			System.out.print("Search for:");
			InputStreamReader in = new InputStreamReader(System.in);
			BufferedReader reader = new BufferedReader(in);
			queryString = reader.readLine();
			if (queryString == null || queryString.trim().isEmpty()) {
				System.exit(0);
			}
		}
		long timeStart = System.currentTimeMillis();
		// Open the index directory
		Directory directory = FSDirectory.open(new File(INDEX_DIR));
		// Use the same analyzer as at index time
		Analyzer analyzer = new PaodingChineseAnalyzer();
		IndexReader reader = IndexReader.open(directory, true);
		QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
		Query query = parser.parse(queryString);
		// Create the searcher
		Searcher searcher = new IndexSearcher(directory);
		query = query.rewrite(reader);
		Hits hits = searcher.search(query);
		// Highlight tags; the default is <b></b>
//		BoldFormatter formatter = new BoldFormatter();
		SimpleHTMLFormatter shf = new SimpleHTMLFormatter(
				"<span style=\"color:red\">", "</span>");
		// Build the highlighter from the formatter and a query-based scorer
		Highlighter highlighter = new Highlighter(shf, new QueryScorer(
				query));
		// Break highlighted text into fragments of at most 50 characters
		highlighter.setTextFragmenter(new SimpleFragmenter(50));
		System.out.println("Found " + hits.length() + " result(s)");
		for (int i = 0; i < hits.length(); i++) {
			String text = hits.doc(i).get(FIELD_NAME);
			String path = hits.doc(i).get("path");
			int maxNumFragmentsRequired = 5;
			String fragmentSeparator = "...";
			TermPositionVector tpv = (TermPositionVector) reader
					.getTermFreqVector(hits.id(i), FIELD_NAME);
			TokenStream tokenStream = TokenSources.getTokenStream(tpv);
			String result = highlighter.getBestFragments(tokenStream, text,
					maxNumFragmentsRequired, fragmentSeparator);
			System.out.println("\n文件路径:" + path);
			System.out.println("\n" + result);
		}
		searcher.close();
		reader.close();
		long timeEnd = System.currentTimeMillis();
		System.out.println("-------------------Query took: " + (timeEnd - timeStart)
				+ " ms-----------------------");
	}
	/**
	 * Recursively index the given directory.
	 * 
	 * @param writer
	 *            the IndexWriter
	 * @param root
	 *            the directory (or file) to index
	 */
	private static void indexDoc(IndexWriter writer, File root) {
		// Skip files that cannot be read
		if (root.canRead()) {
			if (root.isDirectory()) {
				File[] files = root.listFiles();
				if (files != null) {
					for (int i = 0; i < files.length; i++) {
						// Recurse into subdirectories
						indexDoc(writer, files[i]);
					}
				}
			} else {
				try {
					// Read the whole file; a single read() call is not guaranteed to fill the buffer
					byte[] b = new byte[(int) root.length()];
					InputStream in = new FileInputStream(root);
					try {
						int off = 0, n;
						while (off < b.length && (n = in.read(b, off, b.length - off)) > 0) {
							off += n;
						}
					} finally {
						in.close();
					}
					// File contents are assumed to be GBK-encoded plain text
					String content = new String(b, "GBK");
					// Create a Lucene document
					Document d = new Document();
					// Index and store the file content, with term vectors for highlighting
					d.add(new Field(FIELD_NAME, content, Field.Store.YES,
							Field.Index.TOKENIZED,
							Field.TermVector.WITH_POSITIONS_OFFSETS));
					// Store the path too, indexed as a single untokenized term
					d.add(new Field("path", root.getAbsolutePath(),
							Field.Store.YES, Field.Index.NOT_ANALYZED));
					// Add the document to the index
					writer.addDocument(d);
					System.out.println("add file: " + root.getAbsolutePath());
				} catch (FileNotFoundException e) {
					System.out.println("file not found, ignored.");
					e.printStackTrace();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}
}
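
A caveat on the example above: Hits is deprecated in Lucene 2.9 and was removed in 3.0. If the code is ever migrated forward, the search loop maps onto TopDocs; a rough equivalent (my adaptation, not from the original example):

// Requires org.apache.lucene.search.TopDocs and org.apache.lucene.search.ScoreDoc
TopDocs topDocs = searcher.search(query, 100); // fetch the top 100 matches
for (ScoreDoc sd : topDocs.scoreDocs) {
	Document doc = searcher.doc(sd.doc);
	String text = doc.get(FIELD_NAME);
	String path = doc.get("path");
	// highlighting proceeds as before, using sd.doc where hits.id(i) was used
}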



The custom analyzer used above, PaodingChineseAnalyzer, which wraps PaodingAnalyzer with stop-word and lowercase filtering:
import java.io.File;
import java.io.Reader;
import java.util.Set;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import net.paoding.analysis.analyzer.PaodingTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WordlistLoader;

public class PaodingChineseAnalyzer extends PaodingAnalyzer {

	// A handful of common Chinese stop words (plus "www")
	private static String[] stopWords = {
	"www", "的", "和", "与", "时", "在",
	"是", "被", "所", "那", "这", "有",
	"将", "会", "为", "对", "了", "过",
	"去" };
	@SuppressWarnings("unchecked")
	private Set stopSet;
	public PaodingChineseAnalyzer() {
		stopSet = StopFilter.makeStopSet(stopWords);
	}
	public PaodingChineseAnalyzer(String[] stopWords) {
		stopSet = StopFilter.makeStopSet(stopWords);
	}
	// Load stop words from an external file
	public PaodingChineseAnalyzer(File stopwordsFile) {
		try {
			stopSet = WordlistLoader.getWordSet(stopwordsFile);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
	// Tokenize with Paoding, then filter the token stream
	@SuppressWarnings("deprecation")
	public final TokenStream tokenStream(String fieldName, Reader reader) {
		TokenStream result = new PaodingTokenizer(reader, getKnife(),
				createTokenCollector());
		// Lucene provides many other TokenFilters that could be chained here;
		// lowercase first so stop words such as "www" match case-insensitively
		result = new LowerCaseFilter(result);
		result = new StopFilter(result, stopSet);
		return result;
	}
}
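
For reference, the other constructors let the stop-word list be supplied inline instead of from a file; a hypothetical usage sketch combining the two classes above:

// Hypothetical: inline stop words instead of the external E:\stopwords.txt
Analyzer analyzer = new PaodingChineseAnalyzer(new String[] { "的", "了", "和" });
IndexWriter writer = new IndexWriter(FSDirectory.open(new File(INDEX_DIR)),
		analyzer, true, IndexWriter.MaxFieldLength.LIMITED);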