
Commit 91eefb5

Demonstrate highlighting of search results; fixes hankcs/HanLP#74
1 parent 8c469d0 commit 91eefb5


4 files changed: 189 additions, 1 deletion


.gitignore

Lines changed: 1 addition & 0 deletions

@@ -10,3 +10,4 @@
 
 # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
 hs_err_pid*
+/.idea

pom.xml

Lines changed: 9 additions & 1 deletion

@@ -6,7 +6,7 @@
 
     <groupId>com.hankcs.nlp</groupId>
     <artifactId>hanlp-solr-plugin</artifactId>
-    <version>1.0.1</version>
+    <version>1.0.2</version>
 
     <name>hanlp-solr-plugin</name>
     <url>https://github.com/hankcs/HanLP</url>
@@ -37,6 +37,14 @@
             <scope>test</scope>
         </dependency>
 
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-highlighter</artifactId>
+            <version>${lucene.version}</version>
+            <scope>test</scope>
+        </dependency>
+
+
         <dependency>
             <groupId>org.apache.lucene</groupId>
             <artifactId>lucene-core</artifactId>

src/main/java/com/hankcs/lucene/SegmentWrapper.java

Lines changed: 10 additions & 0 deletions

@@ -32,6 +32,10 @@ public class SegmentWrapper
      * index into termArray
      */
     int index;
+    /**
+     * offset of the current term; since the wrapper reads input line by line, term.offset must be corrected
+     */
+    int offset;
 
     public SegmentWrapper(BufferedReader br, Segment segment)
     {
@@ -49,6 +53,7 @@ public void reset(BufferedReader br)
         this.br = br;
         termArray = null;
         index = 0;
+        offset = 0;
     }
 
     public Term next() throws IOException
@@ -58,12 +63,17 @@ public Term next() throws IOException
         while (isBlank(line))
        {
             if (line == null) return null;
+            offset += line.length() + 1;
             line = br.readLine();
         }
 
         List<Term> termList = segment.seg(line);
         if (termList.size() == 0) return null;
         termArray = termList.toArray(new Term[0]);
+        for (Term term : termArray)
+        {
+            term.offset += offset;
+        }
         index = 0;
 
         return termArray[index++];
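
The offset field added above exists because the segmenter computes term offsets relative to the start of each line, while the highlighter needs offsets into the whole document. A minimal standalone sketch of that running-offset idea follows; LineToken and segmentLine are hypothetical stand-ins for HanLP's Term and Segment.seg, not part of the plugin:

import java.util.ArrayList;
import java.util.List;

// Sketch of the running-offset correction: tokens are produced per line,
// so a per-document offset must be added to make their offsets global.
public class OffsetCorrectionSketch
{
    static class LineToken
    {
        final String word;
        int offset; // initially relative to the start of its line

        LineToken(String word, int offset)
        {
            this.word = word;
            this.offset = offset;
        }
    }

    // Toy segmenter: splits on spaces and records offsets within the line.
    static List<LineToken> segmentLine(String line)
    {
        List<LineToken> tokens = new ArrayList<>();
        int pos = 0;
        for (String word : line.split(" "))
        {
            tokens.add(new LineToken(word, pos));
            pos += word.length() + 1; // +1 for the separator
        }
        return tokens;
    }

    public static void main(String[] args)
    {
        String[] lines = {"first line", "second line"};
        int offset = 0; // running offset of the current line within the whole text
        for (String line : lines)
        {
            for (LineToken token : segmentLine(line))
            {
                token.offset += offset; // the same kind of correction SegmentWrapper applies
                System.out.println(token.word + " @ " + token.offset);
            }
            offset += line.length() + 1; // +1 for the newline consumed by readLine()
        }
    }
}

Running it prints each token with its document-level offset, which is what the Lucene highlighter needs in order to place its tags correctly across multi-line fields.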
HighLighterTest.java (new file)

Lines changed: 169 additions & 0 deletions

@@ -0,0 +1,169 @@
/*
 * <summary></summary>
 * <author>He Han</author>
 * <email>[email protected]</email>
 * <create-date>2015/10/22 11:37</create-date>
 *
 * <copyright file="HighLighterDemo.java" company="码农场">
 * Copyright (c) 2008-2015, 码农场. All Rights Reserved, http://www.hankcs.com/
 * This source is subject to Hankcs. Please contact Hankcs to get more information.
 * </copyright>
 */
package com.hankcs.lucene;

import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.search.highlight.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;

import java.io.IOException;

/**
 * Demonstrates highlighting of search results
 * @author hankcs
 */
public class HighLighterTest extends TestCase
{

    public void testHightlight() throws Exception
    {
        // Main field name of the Lucene Document
        String fieldName = "text";

        // Instantiate the Analyzer (tokenizer)
        Analyzer analyzer = new HanLPAnalyzer();

        Directory directory = null;
        IndexWriter iwriter;
        IndexReader ireader = null;
        IndexSearcher isearcher;
        try
        {
            // Indexing **********************************
            // Build an in-memory index
            directory = new RAMDirectory();

            // Configure the IndexWriterConfig
            IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer);
            iwConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
            iwriter = new IndexWriter(directory, iwConfig);
            {
                // Add one document
                Document doc = new Document();
                doc.add(new TextField(fieldName, "我白天是一名语言学习者,晚上是一名初级码农。空的时候喜欢看算法和应用数学书,也喜欢悬疑推理小说,ACG方面喜欢型月、轨迹。喜欢有思想深度的事物,讨厌急躁、拜金与安逸的人。目前在魔都某女校学习,这是我的个人博客。闻道有先后,术业有专攻,请多多关照。你喜欢写代码吗?", Field.Store.YES));
                doc.add(new TextField("title", "关于hankcs", Field.Store.YES));
                iwriter.addDocument(doc);
            }
            {
                // Add another one
                Document doc = new Document();
                doc.add(new TextField(fieldName, "\n\n \n程序员喜欢黑夜", Field.Store.YES));
                doc.add(new TextField("title", "关于程序员", Field.Store.YES));
                iwriter.addDocument(doc);
            }
            iwriter.close();

            // Searching **********************************
            // Instantiate the searcher
            ireader = DirectoryReader.open(directory);
            isearcher = new IndexSearcher(ireader);

            String keyword = "喜欢";
            // Build a Query object with the QueryParser
            QueryParser qp = new QueryParser(fieldName, analyzer);
            Query query = qp.parse(keyword);
            System.out.println("Query = " + query);

            // Retrieve the 5 highest-scoring hits
            TopDocs topDocs = isearcher.search(query, 5);
            System.out.println("命中:" + topDocs.totalHits);
            // Print the results
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;

            for (int i = 0; i < Math.min(5, scoreDocs.length); ++i)
            {
                Document targetDoc = isearcher.doc(scoreDocs[i].doc);
                System.out.print(targetDoc.getField("title").stringValue());
                System.out.println(" , " + scoreDocs[i].score);

                String text = targetDoc.get(fieldName);
                System.out.println(displayHtmlHighlight(query, analyzer, fieldName, text, 200));
            }
        }
        catch (CorruptIndexException e)
        {
            e.printStackTrace();
        }
        catch (LockObtainFailedException e)
        {
            e.printStackTrace();
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
        catch (ParseException e)
        {
            e.printStackTrace();
        }
        catch (InvalidTokenOffsetsException e)
        {
            e.printStackTrace();
        }
        finally
        {
            if (ireader != null)
            {
                try
                {
                    ireader.close();
                }
                catch (IOException e)
                {
                    e.printStackTrace();
                }
            }
            if (directory != null)
            {
                try
                {
                    directory.close();
                }
                catch (IOException e)
                {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Returns the HTML for a highlighted search result
     * @param query the query
     * @param analyzer the analyzer
     * @param fieldName the field name
     * @param fieldContent the field content
     * @param fragmentSize length of the fragment (excluding the HTML tags)
     * @return the result (a snippet of HTML)
     * @throws IOException
     * @throws InvalidTokenOffsetsException
     */
    static String displayHtmlHighlight(Query query, Analyzer analyzer, String fieldName, String fieldContent, int fragmentSize) throws IOException, InvalidTokenOffsetsException
    {
        // Create a highlighter
        Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("<font color='red'>", "</font>"), new QueryScorer(query));
        Fragmenter fragmenter = new SimpleFragmenter(fragmentSize);
        highlighter.setTextFragmenter(fragmenter);
        return highlighter.getBestFragment(analyzer, fieldName, fieldContent);
    }
}
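
For reference, Highlighter.getBestFragment does not need the index at all: it only takes the analyzer, the query, and the raw field text, so the highlighting step can be exercised standalone. A minimal sketch under that assumption follows; the hand-built TermQuery, the <em> tags, the field name "text", the fragment size, and the sample sentence are illustrative choices, not taken from the commit:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

import com.hankcs.lucene.HanLPAnalyzer;

public class StandaloneHighlightSketch
{
    public static void main(String[] args) throws Exception
    {
        // The analyzer shipped by this plugin; assumes hanlp and lucene-highlighter are on the classpath.
        Analyzer analyzer = new HanLPAnalyzer();

        // A hand-built query instead of QueryParser; the field name and term are illustrative only.
        Query query = new TermQuery(new Term("text", "喜欢"));

        Highlighter highlighter = new Highlighter(
                new SimpleHTMLFormatter("<em>", "</em>"),
                new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(50));

        // getBestFragment re-tokenizes the raw text with the analyzer and wraps matching terms.
        String content = "程序员喜欢黑夜,也喜欢写代码。";
        System.out.println(highlighter.getBestFragment(analyzer, "text", content));
    }
}

In the commit itself the query comes from QueryParser and the text from the indexed document, as shown in testHightlight above; this sketch only isolates the highlighting step.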
