@@ -21,12 +21,14 @@ public class HanLPTokenizerFactory extends TokenizerFactory
2121 private boolean enablePorterStemming ;
2222 private boolean enableNumberQuantifierRecognize ;
2323 private boolean enableCustomDictionary ;
24+ private boolean enableCustomDictionaryForcing ;
2425 private boolean enableTranslatedNameRecognize ;
2526 private boolean enableJapaneseNameRecognize ;
2627 private boolean enableOrganizationRecognize ;
2728 private boolean enablePlaceRecognize ;
2829 private boolean enableNameRecognize ;
2930 private boolean enableTraditionalChineseMode ;
31+ private String algorithm ;
3032 private Set <String > stopWordDictionary ;
3133
3234 /**
@@ -41,13 +43,15 @@ public HanLPTokenizerFactory(Map<String, String> args)
4143 enablePorterStemming = getBoolean (args , "enablePorterStemming" , false );
4244 enableNumberQuantifierRecognize = getBoolean (args , "enableNumberQuantifierRecognize" , false );
4345 enableCustomDictionary = getBoolean (args , "enableCustomDictionary" , true );
46+ enableCustomDictionaryForcing = getBoolean (args , "enableCustomDictionaryForcing" , true );
4447 enableTranslatedNameRecognize = getBoolean (args , "enableTranslatedNameRecognize" , false );
4548 enableJapaneseNameRecognize = getBoolean (args , "enableJapaneseNameRecognize" , false );
4649 enableOrganizationRecognize = getBoolean (args , "enableOrganizationRecognize" , false );
4750 enableNameRecognize = getBoolean (args , "enableNameRecognize" , false );
4851 enablePlaceRecognize = getBoolean (args , "enablePlaceRecognize" , false );
4952 enableTraditionalChineseMode = getBoolean (args , "enableTraditionalChineseMode" , false );
50- HanLP .Config .Normalization = getBoolean (args , "enableNormalization" , HanLP .Config .Normalization );
53+ HanLP .Config .Normalization = getBoolean (args , "enableNormalization" , HanLP .Config .Normalization );
54+ algorithm = getString (args , "algorithm" , "viterbi" );
5155 Set <String > customDictionaryPathSet = getSet (args , "customDictionaryPath" );
5256 if (customDictionaryPathSet != null )
5357 {
@@ -59,18 +63,26 @@ public HanLPTokenizerFactory(Map<String, String> args)
5963 stopWordDictionary = new TreeSet <>();
6064 stopWordDictionary .addAll (IOUtil .readLineListWithLessMemory (stopWordDictionaryPath ));
6165 }
62- if (getBoolean (args , "enableDebug" , false )) {
63- HanLP .Config .enableDebug ();
66+ if (getBoolean (args , "enableDebug" , false ))
67+ {
68+ HanLP .Config .enableDebug ();
6469 }
6570 }
6671
72+ protected final String getString (Map <String , String > args , String name , String defaultVal )
73+ {
74+ String s = args .remove (name );
75+ return s == null ? defaultVal : s ;
76+ }
77+
6778 @ Override
6879 public Tokenizer create (AttributeFactory factory )
6980 {
70- Segment segment = HanLP .newSegment ().enableOffset (true ).enableIndexMode (enableIndexMode )
81+ Segment segment = HanLP .newSegment (algorithm ).enableOffset (true ).enableIndexMode (enableIndexMode )
7182 .enableNameRecognize (enableNameRecognize )
7283 .enableNumberQuantifierRecognize (enableNumberQuantifierRecognize )
7384 .enableCustomDictionary (enableCustomDictionary )
85+ .enableCustomDictionaryForcing (enableCustomDictionaryForcing )
7486 .enableTranslatedNameRecognize (enableTranslatedNameRecognize )
7587 .enableJapaneseNameRecognize (enableJapaneseNameRecognize )
7688 .enableOrganizationRecognize (enableOrganizationRecognize )
0 commit comments