Skip to content

Commit 7bb5e8e

Browse files
committed
升级HanLP 1.6.7,支持配置词典优先级与分词算法
1 parent c980a7a commit 7bb5e8e

File tree

4 files changed

+27
-11
lines changed

4 files changed

+27
-11
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,5 @@
1111
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
1212
hs_err_pid*
1313
/.idea
14+
*.iml
15+
target/

README.md

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,14 @@ HanLP中文分词Lucene插件
1111
<dependency>
1212
<groupId>com.hankcs.nlp</groupId>
1313
<artifactId>hanlp-lucene-plugin</artifactId>
14-
<version>1.1.3</version>
14+
<version>1.1.4</version>
1515
</dependency>
1616
```
1717

1818
## Solr快速上手
1919
1. 将[hanlp-portable.jar](http://central.maven.org/maven2/com/hankcs/hanlp/portable-1.5.3/hanlp-portable-1.5.3.jar)和[hanlp-lucene-plugin.jar](https://github.com/hankcs/hanlp-lucene-plugin/releases)共两个jar放入```${webapp}/WEB-INF/lib```下。(或者使用```mvn package```对源码打包,拷贝```target/hanlp-lucene-plugin-x.x.x.jar```到```${webapp}/WEB-INF/lib```下)
2020
1. 修改solr core的配置文件```${core}/conf/schema.xml```
21-
21+
2222
```xml
2323
<fieldType name="text_cn" class="solr.TextField">
2424
<analyzer type="index">
@@ -33,7 +33,7 @@ HanLP中文分词Lucene插件
3333
<field name="my_field1" type="text_cn" indexed="true" stored="true"/>
3434
<field name="my_field2" type="text_cn" indexed="true" stored="true"/>
3535
```
36-
36+
3737
* 如果你的业务系统中有其他字段,比如location,summary之类,也需要一一指定其type="text_cn"。切记,否则这些字段仍旧是solr默认分词器。
3838
* 另外,切记不要在query中开启indexMode,否则会影响PhraseQuery。indexMode只需在index中开启一遍即可。
3939

@@ -42,9 +42,11 @@ HanLP中文分词Lucene插件
4242

4343
| 配置项名称 | 功能 | 默认值 |
4444
| -------- | -----: | :----: |
45+
| algorithm | [分词算法](https://github.com/hankcs/HanLP/blob/master/src/main/java/com/hankcs/hanlp/HanLP.java#L643) | viterbi |
4546
| enableIndexMode | 设为索引模式(切勿在query中开启) | true |
4647
| enableCustomDictionary | 是否启用用户词典 | true |
4748
| customDictionaryPath | 用户词典路径(绝对路径或程序可以读取的相对路径,多个词典用空格隔开) | null |
49+
| enableCustomDictionaryForcing | [用户词典高优先级](https://github.com/hankcs/HanLP/wiki/FAQ#%E4%B8%BA%E4%BB%80%E4%B9%88%E4%BF%AE%E6%94%B9%E4%BA%86%E8%AF%8D%E5%85%B8%E8%BF%98%E6%98%AF%E6%B2%A1%E6%9C%89%E6%95%88%E6%9E%9C) | true |
4850
| stopWordDictionaryPath | 停用词词典路径 | null |
4951
| enableNumberQuantifierRecognize | 是否启用数词和数量词识别 | false |
5052
| enableNameRecognize | 开启人名识别 | false |
@@ -54,7 +56,7 @@ HanLP中文分词Lucene插件
5456
| enablePlaceRecognize | 开启地名识别 | false |
5557
| enableNormalization | 是否执行字符正规化(繁体->简体,全角->半角,大写->小写) | false |
5658
| enableTraditionalChineseMode | 开启精准繁体中文分词 | false |
57-
| enableDebug | 开启除错模式 | false |
59+
| enableDebug | 开启调试模式 | false |
5860

5961
更高级的配置主要通过class path下的```hanlp.properties```进行配置,请阅读[HanLP自然语言处理包文档](https://github.com/hankcs/HanLP)以了解更多相关配置,如:
6062

@@ -66,7 +68,7 @@ HanLP中文分词Lucene插件
6668
## 停用词与同义词
6769
推荐利用Lucene或Solr自带的filter实现,本插件不会越俎代庖。
6870
一个示例配置如下:
69-
71+
7072
```xml
7173
<!-- text_cn字段类型: 指定使用HanLP分词器,同时开启索引模式。通过solr自带的停用词过滤器,使用"stopwords.txt"(默认空白)过滤。
7274
在搜索的时候,还支持solr自带的同义词词典。-->

pom.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
<groupId>com.hankcs.nlp</groupId>
88
<artifactId>hanlp-lucene-plugin</artifactId>
9-
<version>1.1.3</version>
9+
<version>1.1.4</version>
1010

1111
<name>hanlp-solr-plugin</name>
1212
<url>https://github.com/hankcs/hanlp-lucene-plugin</url>
@@ -41,7 +41,7 @@
4141
<properties>
4242
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
4343

44-
<hanlp.version>portable-1.5.4</hanlp.version>
44+
<hanlp.version>portable-1.6.7</hanlp.version>
4545
<lucene.version>7.2.0</lucene.version>
4646

4747
<javac.src.version>1.7</javac.src.version>

src/main/java/com/hankcs/lucene/HanLPTokenizerFactory.java

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,14 @@ public class HanLPTokenizerFactory extends TokenizerFactory
2121
private boolean enablePorterStemming;
2222
private boolean enableNumberQuantifierRecognize;
2323
private boolean enableCustomDictionary;
24+
private boolean enableCustomDictionaryForcing;
2425
private boolean enableTranslatedNameRecognize;
2526
private boolean enableJapaneseNameRecognize;
2627
private boolean enableOrganizationRecognize;
2728
private boolean enablePlaceRecognize;
2829
private boolean enableNameRecognize;
2930
private boolean enableTraditionalChineseMode;
31+
private String algorithm;
3032
private Set<String> stopWordDictionary;
3133

3234
/**
@@ -41,13 +43,15 @@ public HanLPTokenizerFactory(Map<String, String> args)
4143
enablePorterStemming = getBoolean(args, "enablePorterStemming", false);
4244
enableNumberQuantifierRecognize = getBoolean(args, "enableNumberQuantifierRecognize", false);
4345
enableCustomDictionary = getBoolean(args, "enableCustomDictionary", true);
46+
enableCustomDictionaryForcing = getBoolean(args, "enableCustomDictionaryForcing", true);
4447
enableTranslatedNameRecognize = getBoolean(args, "enableTranslatedNameRecognize", false);
4548
enableJapaneseNameRecognize = getBoolean(args, "enableJapaneseNameRecognize", false);
4649
enableOrganizationRecognize = getBoolean(args, "enableOrganizationRecognize", false);
4750
enableNameRecognize = getBoolean(args, "enableNameRecognize", false);
4851
enablePlaceRecognize = getBoolean(args, "enablePlaceRecognize", false);
4952
enableTraditionalChineseMode = getBoolean(args, "enableTraditionalChineseMode", false);
50-
HanLP.Config.Normalization = getBoolean(args, "enableNormalization", HanLP.Config.Normalization);
53+
HanLP.Config.Normalization = getBoolean(args, "enableNormalization", HanLP.Config.Normalization);
54+
algorithm = getString(args, "algorithm", "viterbi");
5155
Set<String> customDictionaryPathSet = getSet(args, "customDictionaryPath");
5256
if (customDictionaryPathSet != null)
5357
{
@@ -59,18 +63,26 @@ public HanLPTokenizerFactory(Map<String, String> args)
5963
stopWordDictionary = new TreeSet<>();
6064
stopWordDictionary.addAll(IOUtil.readLineListWithLessMemory(stopWordDictionaryPath));
6165
}
62-
if (getBoolean(args, "enableDebug", false)) {
63-
HanLP.Config.enableDebug();
66+
if (getBoolean(args, "enableDebug", false))
67+
{
68+
HanLP.Config.enableDebug();
6469
}
6570
}
6671

72+
/**
 * Reads an optional string-valued configuration argument.
 * <p>
 * The entry for {@code name} is removed from {@code args}; if the removed value is
 * {@code null} (no entry, or an entry mapped to {@code null}), {@code defaultVal} is
 * returned instead.
 *
 * @param args       mutable argument map; the entry for {@code name} is removed from it
 * @param name       key of the argument to read
 * @param defaultVal value returned when no non-null value is present for {@code name}
 * @return the removed value, or {@code defaultVal} when that value is {@code null}
 */
protected final String getString(Map<String, String> args, String name, String defaultVal)
73+
{
74+
// NOTE(review): remove() rather than get() presumably marks the key as consumed so the
// surrounding factory machinery does not see it as an unknown leftover argument — confirm.
String s = args.remove(name);
75+
return s == null ? defaultVal : s;
76+
}
77+
6778
@Override
6879
public Tokenizer create(AttributeFactory factory)
6980
{
70-
Segment segment = HanLP.newSegment().enableOffset(true).enableIndexMode(enableIndexMode)
81+
Segment segment = HanLP.newSegment(algorithm).enableOffset(true).enableIndexMode(enableIndexMode)
7182
.enableNameRecognize(enableNameRecognize)
7283
.enableNumberQuantifierRecognize(enableNumberQuantifierRecognize)
7384
.enableCustomDictionary(enableCustomDictionary)
85+
.enableCustomDictionaryForcing(enableCustomDictionaryForcing)
7486
.enableTranslatedNameRecognize(enableTranslatedNameRecognize)
7587
.enableJapaneseNameRecognize(enableJapaneseNameRecognize)
7688
.enableOrganizationRecognize(enableOrganizationRecognize)

0 commit comments

Comments
 (0)