
Commit 91eefb5

Demonstrate highlighting of search results; fixes hankcs/HanLP#74
1 parent 8c469d0 commit 91eefb5


4 files changed: 189 additions, 1 deletion


.gitignore

Lines changed: 1 addition & 0 deletions

@@ -10,3 +10,4 @@
 
 # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
 hs_err_pid*
+/.idea

pom.xml

Lines changed: 9 additions & 1 deletion

@@ -6,7 +6,7 @@
 
     <groupId>com.hankcs.nlp</groupId>
     <artifactId>hanlp-solr-plugin</artifactId>
-    <version>1.0.1</version>
+    <version>1.0.2</version>
 
     <name>hanlp-solr-plugin</name>
     <url>https://github.com/hankcs/HanLP</url>
@@ -37,6 +37,14 @@
             <scope>test</scope>
         </dependency>
 
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-highlighter</artifactId>
+            <version>${lucene.version}</version>
+            <scope>test</scope>
+        </dependency>
+
+
         <dependency>
             <groupId>org.apache.lucene</groupId>
             <artifactId>lucene-core</artifactId>

src/main/java/com/hankcs/lucene/SegmentWrapper.java

Lines changed: 10 additions & 0 deletions

@@ -32,6 +32,10 @@ public class SegmentWrapper
      * index into termArray
      */
     int index;
+    /**
+     * offset of the current term; since the wrapper reads input line by line, term.offset must be corrected
+     */
+    int offset;
 
     public SegmentWrapper(BufferedReader br, Segment segment)
     {
@@ -49,6 +53,7 @@ public void reset(BufferedReader br)
         this.br = br;
         termArray = null;
         index = 0;
+        offset = 0;
     }
 
     public Term next() throws IOException
@@ -58,12 +63,17 @@ public Term next() throws IOException
         while (isBlank(line))
        {
             if (line == null) return null;
+            offset += line.length() + 1;
             line = br.readLine();
         }
 
         List<Term> termList = segment.seg(line);
         if (termList.size() == 0) return null;
         termArray = termList.toArray(new Term[0]);
+        for (Term term : termArray)
+        {
+            term.offset += offset;
+        }
         index = 0;
 
         return termArray[index++];
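
The offset field added above exists because the segmenter computes term offsets relative to the start of each line, while the highlighter needs offsets into the whole document. A minimal standalone sketch of that running-offset idea follows; LineToken and segmentLine are hypothetical stand-ins for HanLP's Term and Segment.seg, not part of the plugin:

import java.util.ArrayList;
import java.util.List;

// Sketch of the running-offset correction: tokens are produced per line,
// so a per-document offset must be added to make their offsets global.
public class OffsetCorrectionSketch
{
    static class LineToken
    {
        final String word;
        int offset; // initially relative to the start of its line

        LineToken(String word, int offset)
        {
            this.word = word;
            this.offset = offset;
        }
    }

    // Toy segmenter: splits on spaces and records offsets within the line.
    static List<LineToken> segmentLine(String line)
    {
        List<LineToken> tokens = new ArrayList<>();
        int pos = 0;
        for (String word : line.split(" "))
        {
            tokens.add(new LineToken(word, pos));
            pos += word.length() + 1; // +1 for the separator
        }
        return tokens;
    }

    public static void main(String[] args)
    {
        String[] lines = {"first line", "second line"};
        int offset = 0; // running offset of the current line within the whole text
        for (String line : lines)
        {
            for (LineToken token : segmentLine(line))
            {
                token.offset += offset; // the same kind of correction SegmentWrapper applies
                System.out.println(token.word + " @ " + token.offset);
            }
            offset += line.length() + 1; // +1 for the newline consumed by readLine()
        }
    }
}

Running it prints each token with its document-level offset, which is what the Lucene highlighter needs in order to place its tags correctly across multi-line fields.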
HighLighterTest.java (new file)

Lines changed: 169 additions & 0 deletions

@@ -0,0 +1,169 @@
/*
 * <summary></summary>
 * <author>He Han</author>
 * <email>[email protected]</email>
 * <create-date>2015/10/22 11:37</create-date>
 *
 * <copyright file="HighLighterDemo.java" company="码农场">
 * Copyright (c) 2008-2015, 码农场. All Rights Reserved, http://www.hankcs.com/
 * This source is subject to Hankcs. Please contact Hankcs to get more information.
 * </copyright>
 */
package com.hankcs.lucene;

import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.search.highlight.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;

import java.io.IOException;

/**
 * Demonstrates highlighting of search results
 * @author hankcs
 */
public class HighLighterTest extends TestCase
{

    public void testHightlight() throws Exception
    {
        // Main field name of the Lucene Document
        String fieldName = "text";

        // Instantiate the Analyzer (tokenizer)
        Analyzer analyzer = new HanLPAnalyzer();

        Directory directory = null;
        IndexWriter iwriter;
        IndexReader ireader = null;
        IndexSearcher isearcher;
        try
        {
            // Indexing **********************************
            // Build an in-memory index
            directory = new RAMDirectory();

            // Configure the IndexWriterConfig
            IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer);
            iwConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
            iwriter = new IndexWriter(directory, iwConfig);
            {
                // Add one document
                Document doc = new Document();
                doc.add(new TextField(fieldName, "我白天是一名语言学习者,晚上是一名初级码农。空的时候喜欢看算法和应用数学书,也喜欢悬疑推理小说,ACG方面喜欢型月、轨迹。喜欢有思想深度的事物,讨厌急躁、拜金与安逸的人。目前在魔都某女校学习,这是我的个人博客。闻道有先后,术业有专攻,请多多关照。你喜欢写代码吗?", Field.Store.YES));
                doc.add(new TextField("title", "关于hankcs", Field.Store.YES));
                iwriter.addDocument(doc);
            }
            {
                // Add another one
                Document doc = new Document();
                doc.add(new TextField(fieldName, "\n\n \n程序员喜欢黑夜", Field.Store.YES));
                doc.add(new TextField("title", "关于程序员", Field.Store.YES));
                iwriter.addDocument(doc);
            }
            iwriter.close();

            // Searching **********************************
            // Instantiate the searcher
            ireader = DirectoryReader.open(directory);
            isearcher = new IndexSearcher(ireader);

            String keyword = "喜欢";
            // Build a Query object with the QueryParser
            QueryParser qp = new QueryParser(fieldName, analyzer);
            Query query = qp.parse(keyword);
            System.out.println("Query = " + query);

            // Retrieve the 5 highest-scoring hits
            TopDocs topDocs = isearcher.search(query, 5);
            System.out.println("命中:" + topDocs.totalHits);
            // Print the results
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;

            for (int i = 0; i < Math.min(5, scoreDocs.length); ++i)
            {
                Document targetDoc = isearcher.doc(scoreDocs[i].doc);
                System.out.print(targetDoc.getField("title").stringValue());
                System.out.println(" , " + scoreDocs[i].score);

                String text = targetDoc.get(fieldName);
                System.out.println(displayHtmlHighlight(query, analyzer, fieldName, text, 200));
            }
        }
        catch (CorruptIndexException e)
        {
            e.printStackTrace();
        }
        catch (LockObtainFailedException e)
        {
            e.printStackTrace();
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
        catch (ParseException e)
        {
            e.printStackTrace();
        }
        catch (InvalidTokenOffsetsException e)
        {
            e.printStackTrace();
        }
        finally
        {
            if (ireader != null)
            {
                try
                {
                    ireader.close();
                }
                catch (IOException e)
                {
                    e.printStackTrace();
                }
            }
            if (directory != null)
            {
                try
                {
                    directory.close();
                }
                catch (IOException e)
                {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Returns the HTML for a highlighted search result
     * @param query the query
     * @param analyzer the analyzer
     * @param fieldName the field name
     * @param fieldContent the field content
     * @param fragmentSize length of the fragment (excluding the HTML tags)
     * @return the result (a snippet of HTML)
     * @throws IOException
     * @throws InvalidTokenOffsetsException
     */
    static String displayHtmlHighlight(Query query, Analyzer analyzer, String fieldName, String fieldContent, int fragmentSize) throws IOException, InvalidTokenOffsetsException
    {
        // Create a highlighter
        Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("<font color='red'>", "</font>"), new QueryScorer(query));
        Fragmenter fragmenter = new SimpleFragmenter(fragmentSize);
        highlighter.setTextFragmenter(fragmenter);
        return highlighter.getBestFragment(analyzer, fieldName, fieldContent);
    }
}
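
For reference, Highlighter.getBestFragment does not need the index at all: it only takes the analyzer, the query, and the raw field text, so the highlighting step can be exercised standalone. A minimal sketch under that assumption follows; the hand-built TermQuery, the <em> tags, the field name "text", the fragment size, and the sample sentence are illustrative choices, not taken from the commit:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

import com.hankcs.lucene.HanLPAnalyzer;

public class StandaloneHighlightSketch
{
    public static void main(String[] args) throws Exception
    {
        // The analyzer shipped by this plugin; assumes hanlp and lucene-highlighter are on the classpath.
        Analyzer analyzer = new HanLPAnalyzer();

        // A hand-built query instead of QueryParser; the field name and term are illustrative only.
        Query query = new TermQuery(new Term("text", "喜欢"));

        Highlighter highlighter = new Highlighter(
                new SimpleHTMLFormatter("<em>", "</em>"),
                new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(50));

        // getBestFragment re-tokenizes the raw text with the analyzer and wraps matching terms.
        String content = "程序员喜欢黑夜,也喜欢写代码。";
        System.out.println(highlighter.getBestFragment(analyzer, "text", content));
    }
}

In the commit itself the query comes from QueryParser and the text from the indexed document, as shown in testHightlight above; this sketch only isolates the highlighting step.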
