This is an automated email from the ASF dual-hosted git repository.
jermy pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-hugegraph.git
The following commit(s) were added to refs/heads/master by this push:
new 07810249a refact(core): update dependencies version & adopt the new
analyzer (#1989)
07810249a is described below
commit 07810249a53836a330891d0c6b89cd1bbcc4319a
Author: Jade Peng <[email protected]>
AuthorDate: Wed Oct 26 23:11:30 2022 +0800
refact(core): update dependencies version & adopt the new analyzer (#1989)
jraft 1.3.9->1.3.11
ohc-core 0.7.0->0.7.4
org.apdplat.word 1.3->1.3.1
hanlp portable-1.5.0->portable-1.8.3
lucene-analyzers-smartcn,lucene-core 7.4.0->8.11.2
jcseg-core 2.2.0->2.6.2
lz4-java 1.7.1->1.8.0
eclipse-collections 10.4.0->11.1.0
fastutil 8.1.0->8.5.9
jjwt 0.11.2->0.11.5
Co-authored-by: jadepeng <[email protected]>
---
hugegraph-core/pom.xml | 61 +++++++++++++++-------
.../baidu/hugegraph/analyzer/JcsegAnalyzer.java | 34 ++++++------
.../baidu/hugegraph/unit/core/AnalyzerTest.java | 18 ++++---
3 files changed, 72 insertions(+), 41 deletions(-)
diff --git a/hugegraph-core/pom.xml b/hugegraph-core/pom.xml
index e3bd210ca..bb1d3646d 100644
--- a/hugegraph-core/pom.xml
+++ b/hugegraph-core/pom.xml
@@ -29,6 +29,21 @@
<properties>
<top.level.dir>${basedir}/..</top.level.dir>
+ <jraft.version>1.3.11</jraft.version>
+ <ohc.version>0.7.4</ohc.version>
+ <lz4.version>1.8.0</lz4.version>
+ <apdplat-word.version>1.3.1</apdplat-word.version>
+ <mmseg4j-core.version>1.10.0</mmseg4j-core.version>
+ <jcseg.version>2.6.2</jcseg.version>
+ <hanlp.version>portable-1.8.3</hanlp.version>
+ <ansj-seg.version>5.1.6</ansj-seg.version>
+ <lucene.version>8.11.2</lucene.version>
+ <jieba-analysis.version>1.0.2</jieba-analysis.version>
+ <ikanalyzer.version>2012_u6</ikanalyzer.version>
+ <commons-compress.version>1.21</commons-compress.version>
+ <eclipse-collections.version>11.1.0</eclipse-collections.version>
+ <fastutil.version>8.5.9</fastutil.version>
+ <jjwt.version>0.11.5</jjwt.version>
</properties>
<dependencies>
@@ -69,7 +84,7 @@
<dependency>
<groupId>com.alipay.sofa</groupId>
<artifactId>jraft-core</artifactId>
- <version>1.3.9</version>
+ <version>${jraft.version}</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
@@ -101,7 +116,7 @@
<dependency>
<groupId>org.caffinitas.ohc</groupId>
<artifactId>ohc-core</artifactId>
- <version>0.7.0</version>
+ <version>${ohc.version}</version>
<exclusions>
<exclusion>
<groupId>com.google.guava</groupId>
@@ -113,98 +128,106 @@
<dependency>
<groupId>org.apdplat</groupId>
<artifactId>word</artifactId>
- <version>1.3</version>
+ <version>${apdplat-word.version}</version>
<exclusions>
<exclusion>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
</exclusion>
+ <exclusion>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-core</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analyzers-common</artifactId>
+ </exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.ansj</groupId>
<artifactId>ansj_seg</artifactId>
- <version>5.1.6</version>
+ <version>${ansj-seg.version}</version>
</dependency>
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
- <version>portable-1.5.0</version>
+ <version>${hanlp.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-smartcn</artifactId>
- <version>7.4.0</version>
+ <version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
- <version>7.4.0</version>
+ <version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>com.huaban</groupId>
<artifactId>jieba-analysis</artifactId>
- <version>1.0.2</version>
+ <version>${jieba-analysis.version}</version>
</dependency>
<dependency>
<groupId>org.lionsoul</groupId>
<artifactId>jcseg-core</artifactId>
- <version>2.2.0</version>
+ <version>${jcseg.version}</version>
</dependency>
<dependency>
<groupId>com.chenlb.mmseg4j</groupId>
<artifactId>mmseg4j-core</artifactId>
- <version>1.10.0</version>
+ <version>${mmseg4j-core.version}</version>
</dependency>
<dependency>
<groupId>com.janeluo</groupId>
<artifactId>ikanalyzer</artifactId>
- <version>2012_u6</version>
+ <version>${ikanalyzer.version}</version>
</dependency>
<dependency>
<groupId>org.lz4</groupId>
<artifactId>lz4-java</artifactId>
- <version>1.7.1</version>
+ <version>${lz4.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
- <version>1.21</version>
+ <version>${commons-compress.version}</version>
</dependency>
<dependency>
<groupId>org.eclipse.collections</groupId>
<artifactId>eclipse-collections-api</artifactId>
- <version>10.4.0</version>
+ <version>${eclipse-collections.version}</version>
</dependency>
<dependency>
<groupId>org.eclipse.collections</groupId>
<artifactId>eclipse-collections</artifactId>
- <version>10.4.0</version>
+ <version>${eclipse-collections.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/it.unimi.dsi/fastutil -->
<dependency>
<groupId>it.unimi.dsi</groupId>
<artifactId>fastutil</artifactId>
- <version>8.1.0</version>
+ <version>${fastutil.version}</version>
</dependency>
<!-- jwt auth token TODO: move to auth pom -->
<dependency>
<groupId>io.jsonwebtoken</groupId>
<artifactId>jjwt-api</artifactId>
- <version>0.11.2</version>
+ <version>${jjwt.version}</version>
</dependency>
<dependency>
<groupId>io.jsonwebtoken</groupId>
<artifactId>jjwt-impl</artifactId>
- <version>0.11.2</version>
+ <version>${jjwt.version}</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>io.jsonwebtoken</groupId>
<artifactId>jjwt-jackson</artifactId>
- <version>0.11.2</version>
+ <version>${jjwt.version}</version>
<scope>runtime</scope>
</dependency>
</dependencies>
diff --git a/hugegraph-core/src/main/java/com/baidu/hugegraph/analyzer/JcsegAnalyzer.java b/hugegraph-core/src/main/java/com/baidu/hugegraph/analyzer/JcsegAnalyzer.java
index 211f38429..b4ccb7b70 100644
--- a/hugegraph-core/src/main/java/com/baidu/hugegraph/analyzer/JcsegAnalyzer.java
+++ b/hugegraph-core/src/main/java/com/baidu/hugegraph/analyzer/JcsegAnalyzer.java
@@ -23,12 +23,11 @@ import java.io.StringReader;
import java.util.List;
import java.util.Set;
-import org.lionsoul.jcseg.tokenizer.core.ADictionary;
-import org.lionsoul.jcseg.tokenizer.core.DictionaryFactory;
-import org.lionsoul.jcseg.tokenizer.core.ISegment;
-import org.lionsoul.jcseg.tokenizer.core.IWord;
-import org.lionsoul.jcseg.tokenizer.core.JcsegTaskConfig;
-import org.lionsoul.jcseg.tokenizer.core.SegmentFactory;
+import org.lionsoul.jcseg.ISegment;
+import org.lionsoul.jcseg.IWord;
+import org.lionsoul.jcseg.dic.ADictionary;
+import org.lionsoul.jcseg.dic.DictionaryFactory;
+import org.lionsoul.jcseg.segmenter.SegmenterConfig;
import com.baidu.hugegraph.HugeException;
import com.baidu.hugegraph.config.ConfigException;
@@ -45,11 +44,10 @@ public class JcsegAnalyzer implements Analyzer {
"Complex"
);
- private static final JcsegTaskConfig CONFIG = new JcsegTaskConfig();
- private static final ADictionary DIC =
- DictionaryFactory.createDefaultDictionary(new JcsegTaskConfig());
+ private static final SegmenterConfig CONFIG = new SegmenterConfig();
+ private static final ADictionary DIC = DictionaryFactory.createDefaultDictionary(CONFIG);
- private int segMode;
+ private final ISegment.Type type;
public JcsegAnalyzer(String mode) {
if (!SUPPORT_MODES.contains(mode)) {
@@ -57,17 +55,23 @@ public class JcsegAnalyzer implements Analyzer {
"Unsupported segment mode '%s' for jcseg analyzer, " +
"the available values are %s", mode, SUPPORT_MODES);
}
- this.segMode = SUPPORT_MODES.indexOf(mode) + 1;
+
+ if ("Simple".equals(mode)) {
+ this.type = ISegment.SIMPLE;
+ } else {
+ this.type = ISegment.COMPLEX;
+ }
}
@Override
public Set<String> segment(String text) {
Set<String> result = InsertionOrderUtil.newSet();
try {
- Object[] args = new Object[]{new StringReader(text), CONFIG, DIC};
- ISegment seg = SegmentFactory.createJcseg(this.segMode, args);
- IWord word = null;
- while ((word = seg.next()) != null) {
+ ISegment segmentor = this.type.factory.create(CONFIG, DIC);
+ segmentor.reset(new StringReader(text));
+
+ IWord word;
+ while ((word = segmentor.next()) != null) {
result.add(word.getValue());
}
} catch (Exception e) {
diff --git a/hugegraph-test/src/main/java/com/baidu/hugegraph/unit/core/AnalyzerTest.java b/hugegraph-test/src/main/java/com/baidu/hugegraph/unit/core/AnalyzerTest.java
index 76273460a..fdd749993 100644
--- a/hugegraph-test/src/main/java/com/baidu/hugegraph/unit/core/AnalyzerTest.java
+++ b/hugegraph-test/src/main/java/com/baidu/hugegraph/unit/core/AnalyzerTest.java
@@ -100,13 +100,17 @@ public class AnalyzerTest {
"海淀区", "西北旺", "东路", "10", "号", "院"),
analyzer.segment(text2));
- // nlp mode
- analyzer = AnalyzerFactory.analyzer("hanlp", "nlp");
+ // Note: the latest hanlp portable version does not contain model data, see
+ // https://github.com/hankcs/HanLP/tree/portable#%E6%96%B9%E5%BC%8F%E4%B8%80maven
+ // So test IndexTokenizer instead
+ analyzer = AnalyzerFactory.analyzer("hanlp", "index");
Assert.assertEquals(setOf("England", " ", "wins", "World", "Cup"),
analyzer.segment(text1));
- Assert.assertEquals(setOf("英格兰", "世界杯", "夺冠", ",", "中华人民共和国",
- "国歌", "百度", "科技园", "位于", "北京市",
- "海淀区", "西北旺", "东路10号院"),
+ Assert.assertEquals(setOf("英格兰", "英格", "格兰", "世界杯", "世界", "夺冠", ",",
+ "中华人民共和国", "中华", "华人", "人民", "共和国",
+ "共和","国歌", "百度", "科技园", "科技", "位于",
+ "北京市", "北京", "海淀区", "海淀", "淀区", "西北旺",
+ "西北", "东路", "10", "号", "院"),
analyzer.segment(text2));
}
@@ -153,7 +157,7 @@ public class AnalyzerTest {
analyzer.segment(text1));
Assert.assertEquals(setOf("英格兰", "世界杯", "夺冠", ",", "中华",
"人民共和国", "国歌", "百度", "科技", "园", "位于",
- "北京市", "海淀区", "西北", "旺", "东路", "10",
+ "北京市", "海淀区", "西北", "旺", "东路", "1", "0",
"号", "院"),
analyzer.segment(text2));
@@ -163,7 +167,7 @@ public class AnalyzerTest {
analyzer.segment(text1));
Assert.assertEquals(setOf("英格兰", "世界杯", "夺冠", ",", "中华",
"人民共和国", "国歌", "百度", "科技", "园", "位于",
- "北京市", "海淀区", "西北", "旺", "东路", "10",
+ "北京市", "海淀区", "西北", "旺", "东路", "1", "0",
"号", "院"),
analyzer.segment(text2));
}