升级HanLP 1.6.7，支持配置词典优先级与分词算法

hankcs · hankcs · commit 7bb5e8e286c5 · 2018-08-18T10:30:39.000-04:00
diff --git a/.gitignore b/.gitignore
@@ -11,3 +11,5 @@
 # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
 hs_err_pid*
 /.idea
+*.iml
+target/
diff --git a/README.md b/README.md
@@ -11,14 +11,14 @@ HanLP中文分词Lucene插件
     <dependency>
       <groupId>com.hankcs.nlp</groupId>
       <artifactId>hanlp-lucene-plugin</artifactId>
-      <version>1.1.3</version>
+      <version>1.1.4</version>
     </dependency>
 ```
 
 ## Solr快速上手
  1. 将[hanlp-portable.jar](http://central.maven.org/maven2/com/hankcs/hanlp/portable-1.5.3/hanlp-portable-1.5.3.jar)和[hanlp-lucene-plugin.jar](https://github.com/hankcs/hanlp-lucene-plugin/releases)共两个jar放入```${webapp}/WEB-INF/lib```下。（或者使用```mvn package```对源码打包，拷贝```target/hanlp-lucene-plugin-x.x.x.jar```到```${webapp}/WEB-INF/lib```下）
  1. 修改solr core的配置文件```${core}/conf/schema.xml```：
- 
+
 ```xml
   <fieldType name="text_cn" class="solr.TextField">
       <analyzer type="index">
@@ -33,7 +33,7 @@ HanLP中文分词Lucene插件
   <field name="my_field1" type="text_cn" indexed="true" stored="true"/>
   <field name="my_field2" type="text_cn" indexed="true" stored="true"/>
 ```
-    
+
  * 如果你的业务系统中有其他字段，比如location，summary之类，也需要一一指定其type="text_cn"。切记，否则这些字段仍旧是solr默认分词器。
  * 另外，切记不要在query中开启indexMode，否则会影响PhraseQuery。indexMode只需在index中开启一遍即可。
 
@@ -42,9 +42,11 @@ HanLP中文分词Lucene插件
 
 | 配置项名称       | 功能   |  默认值  |
 | --------   | -----:  | :----:  |
+| algorithm   | [分词算法](https://github.com/hankcs/HanLP/blob/master/src/main/java/com/hankcs/hanlp/HanLP.java#L643) |   viterbi     |
 | enableIndexMode    | 设为索引模式（切勿在query中开启） |   true     |
 | enableCustomDictionary    | 是否启用用户词典 |   true     |
 | customDictionaryPath    | 用户词典路径(绝对路径或程序可以读取的相对路径,多个词典用空格隔开) |   null     |
+| enableCustomDictionaryForcing    | [用户词典高优先级](https://github.com/hankcs/HanLP/wiki/FAQ#%E4%B8%BA%E4%BB%80%E4%B9%88%E4%BF%AE%E6%94%B9%E4%BA%86%E8%AF%8D%E5%85%B8%E8%BF%98%E6%98%AF%E6%B2%A1%E6%9C%89%E6%95%88%E6%9E%9C) |   false     |
 | stopWordDictionaryPath    | 停用词词典路径 |   null     |
 | enableNumberQuantifierRecognize    | 是否启用数词和数量词识别 |   true     |
 | enableNameRecognize    | 开启人名识别 |   true     |
@@ -54,7 +56,7 @@ HanLP中文分词Lucene插件
 | enablePlaceRecognize    | 开启地名识别 |   false     |
 | enableNormalization    | 是否执行字符正规化（繁体->简体，全角->半角，大写->小写） |   false     |
 | enableTraditionalChineseMode    | 开启精准繁体中文分词 |   false     |
-| enableDebug    | 开启除错模式 |   false     |
+| enableDebug    | 开启调试模式 |   false     |
 
  更高级的配置主要通过class path下的```hanlp.properties```进行配置，请阅读[HanLP自然语言处理包文档](https://github.com/hankcs/HanLP)以了解更多相关配置，如：
 
@@ -66,7 +68,7 @@ HanLP中文分词Lucene插件
 ## 停用词与同义词
  推荐利用Lucene或Solr自带的filter实现，本插件不会越俎代庖。
  一个示例配置如下：
- 
+
 ```xml
     <!-- text_cn字段类型: 指定使用HanLP分词器，同时开启索引模式。通过solr自带的停用词过滤器，使用"stopwords.txt"（默认空白）过滤。
 	 在搜索的时候，还支持solr自带的同义词词典。-->
diff --git a/pom.xml b/pom.xml
@@ -6,7 +6,7 @@
 
     <groupId>com.hankcs.nlp</groupId>
     <artifactId>hanlp-lucene-plugin</artifactId>
-    <version>1.1.3</version>
+    <version>1.1.4</version>
 
     <name>hanlp-solr-plugin</name>
     <url>https://github.com/hankcs/hanlp-lucene-plugin</url>
@@ -41,7 +41,7 @@
     <properties>
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 
-        <hanlp.version>portable-1.5.4</hanlp.version>
+        <hanlp.version>portable-1.6.7</hanlp.version>
         <lucene.version>7.2.0</lucene.version>
 
         <javac.src.version>1.7</javac.src.version>
diff --git a/src/main/java/com/hankcs/lucene/HanLPTokenizerFactory.java b/src/main/java/com/hankcs/lucene/HanLPTokenizerFactory.java
@@ -21,12 +21,14 @@ public class HanLPTokenizerFactory extends TokenizerFactory
     private boolean enablePorterStemming;
     private boolean enableNumberQuantifierRecognize;
     private boolean enableCustomDictionary;
+    private boolean enableCustomDictionaryForcing;
     private boolean enableTranslatedNameRecognize;
     private boolean enableJapaneseNameRecognize;
     private boolean enableOrganizationRecognize;
     private boolean enablePlaceRecognize;
     private boolean enableNameRecognize;
     private boolean enableTraditionalChineseMode;
+    private String algorithm;
     private Set<String> stopWordDictionary;
 
     /**
@@ -41,13 +43,15 @@ public HanLPTokenizerFactory(Map<String, String> args)
         enablePorterStemming = getBoolean(args, "enablePorterStemming", false);
         enableNumberQuantifierRecognize = getBoolean(args, "enableNumberQuantifierRecognize", false);
         enableCustomDictionary = getBoolean(args, "enableCustomDictionary", true);
+        enableCustomDictionaryForcing = getBoolean(args, "enableCustomDictionaryForcing", true);
         enableTranslatedNameRecognize = getBoolean(args, "enableTranslatedNameRecognize", false);
         enableJapaneseNameRecognize = getBoolean(args, "enableJapaneseNameRecognize", false);
         enableOrganizationRecognize = getBoolean(args, "enableOrganizationRecognize", false);
         enableNameRecognize = getBoolean(args, "enableNameRecognize", false);
         enablePlaceRecognize = getBoolean(args, "enablePlaceRecognize", false);
         enableTraditionalChineseMode = getBoolean(args, "enableTraditionalChineseMode", false);
-        HanLP.Config.Normalization  = getBoolean(args, "enableNormalization", HanLP.Config.Normalization);
+        HanLP.Config.Normalization = getBoolean(args, "enableNormalization", HanLP.Config.Normalization);
+        algorithm = getString(args, "algorithm", "viterbi");
         Set<String> customDictionaryPathSet = getSet(args, "customDictionaryPath");
         if (customDictionaryPathSet != null)
         {
@@ -59,18 +63,26 @@ public HanLPTokenizerFactory(Map<String, String> args)
             stopWordDictionary = new TreeSet<>();
             stopWordDictionary.addAll(IOUtil.readLineListWithLessMemory(stopWordDictionaryPath));
         }
-        if (getBoolean(args, "enableDebug", false)) {
-          HanLP.Config.enableDebug();
+        if (getBoolean(args, "enableDebug", false))
+        {
+            HanLP.Config.enableDebug();
         }
     }
 
+    protected final String getString(Map<String, String> args, String name, String defaultVal) // Reads the optional string argument `name` from args; yields defaultVal when it is absent.
+    {
+        String s = args.remove(name); // remove() returns the mapped value and also deletes the entry from args. NOTE(review): presumably so the argument counts as consumed by the factory - confirm against the TokenizerFactory base class.
+        return s == null ? defaultVal : s; // a missing key (or a key mapped to null) falls back to the default
+    }
+
     @Override
     public Tokenizer create(AttributeFactory factory)
     {
-        Segment segment = HanLP.newSegment().enableOffset(true).enableIndexMode(enableIndexMode)
+        Segment segment = HanLP.newSegment(algorithm).enableOffset(true).enableIndexMode(enableIndexMode)
                 .enableNameRecognize(enableNameRecognize)
                 .enableNumberQuantifierRecognize(enableNumberQuantifierRecognize)
                 .enableCustomDictionary(enableCustomDictionary)
+                .enableCustomDictionaryForcing(enableCustomDictionaryForcing)
                 .enableTranslatedNameRecognize(enableTranslatedNameRecognize)
                 .enableJapaneseNameRecognize(enableJapaneseNameRecognize)
                 .enableOrganizationRecognize(enableOrganizationRecognize)