This is an automated email from the ASF dual-hosted git repository.
krickert pushed a commit to branch OPENNLP-1833-grpc-expansion
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
The following commit(s) were added to refs/heads/OPENNLP-1833-grpc-expansion by
this push:
new 9c6a5dd6 Breaking point: OPENNLP-1833: Tighten gRPC v1 contract and
server validation
9c6a5dd6 is described below
commit 9c6a5dd69829679492d82d9817a565708379cb3e
Author: Kristian Rickert <[email protected]>
AuthorDate: Tue Jun 9 23:27:43 2026 -0400
Breaking point: OPENNLP-1833: Tighten gRPC v1 contract and server validation
---
.gitignore | 2 +
opennlp-grpc/README.md | 23 ++--
opennlp-grpc/docs/rfc/opennlp-grpc-design.md | 65 +++++++---
.../docs/rfc/opennlp-grpc-jira-proposal.md | 30 +++--
.../opennlp/grpc/v1/opennlp_document_v1.proto | 30 ++++-
.../opennlp/grpc/v1/opennlp_pipeline_v1.proto | 31 ++++-
opennlp-grpc/opennlp-grpc-service/pom.xml | 18 +++
.../opennlp/grpc/model/ModelBundleCache.java | 19 ++-
.../opennlp/grpc/processor/AnalysisException.java | 2 +
.../grpc/processor/BasicDocumentAnalyzer.java | 139 +++++++++++++++++----
.../opennlp/grpc/processor/OffsetMapper.java | 97 ++++++++++++++
.../opennlp/grpc/profile/ProfileResolver.java | 6 +-
.../grpc/v1/server/OpenNlpAnalysisServiceImpl.java | 1 -
.../src/main/resources/log4j2.xml | 7 +-
.../processor/BasicDocumentAnalyzerPolicyTest.java | 61 +++++++++
.../grpc/processor/BasicDocumentAnalyzerTest.java | 67 ++++++++--
.../opennlp/grpc/processor/OffsetMapperTest.java | 61 +++++++++
.../opennlp/grpc/profile/ProfileResolverTest.java | 19 +++
18 files changed, 595 insertions(+), 83 deletions(-)
diff --git a/.gitignore b/.gitignore
index 91d90c9a..3ec3d094 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,5 +14,7 @@ extlib
*.jar
__pycache__/
*.pyc
+derby.log
+META-INF/MANIFEST.MF
.python-version
\ No newline at end of file
diff --git a/opennlp-grpc/README.md b/opennlp-grpc/README.md
index 4a921aba..cae50bef 100644
--- a/opennlp-grpc/README.md
+++ b/opennlp-grpc/README.md
@@ -44,17 +44,26 @@ Options:
- `-p, --port` - listen port (default `7071`)
- `-c, --config` - key=value config file
-Example config:
+Example config (`key=value`, `#` comments):
```ini
-server.enable_reflection = false
-model.location = extlib
-model.recursive = true
-model.sentdetect.wildcard.pattern = opennlp-models-sentdetect-*.jar
-model.tokenizer.wildcard.pattern = opennlp-models-tokenizer-*.jar
+server.enable_reflection=false
+server.max_inbound_message_size=10485760
+
+# Optional explicit model overrides. When omitted, the en sentence-detector and
+# tokenizer load from the classpath via the opennlp-models-* runtime deps.
+# model.sentence_detector.path=/path/to/en-sent.bin
+# model.tokenizer.path=/path/to/en-token.bin
```
-Place model JARs in `extlib` (or `model.location`).
+By default no configuration is required: the server loads the bundled English
+sentence-detector and tokenizer from the classpath.
+
+> v1 note: this minimal slice implements sentence detection, tokenization,
+> probability reporting, `max_text_length`, offset encoding selection, and the
+> default `en-basic` model bundle. Unsupported backends, ONNX embedding model
+> selection, non-default bundles, and chunk/embed configs are rejected explicitly
+> instead of being silently ignored.
## v1 API
diff --git a/opennlp-grpc/docs/rfc/opennlp-grpc-design.md
b/opennlp-grpc/docs/rfc/opennlp-grpc-design.md
index 79294914..5640479a 100644
--- a/opennlp-grpc/docs/rfc/opennlp-grpc-design.md
+++ b/opennlp-grpc/docs/rfc/opennlp-grpc-design.md
@@ -38,7 +38,7 @@ Phase 1 is agreement on this contract-the protos and the
design captured here. I
| Field | Value |
| -------------------- | ------------------------------------------------ |
| **Status** | Draft RFC |
-| **Version** | 0.7 |
+| **Version** | 0.8 |
| **API version** | `v1` |
| **OpenNLP baseline** | 3.0.0-SNAPSHOT (JDK 21+) |
| **Companion** | [JIRA proposal](./opennlp-grpc-jira-proposal.md) |
@@ -291,7 +291,8 @@ OpenNLP Java APIs mix coordinate systems:
- Every `AnnotationSpan` in `OpenNlpDocument` and in RPC responses MUST use
`CoordinateSpace.COORDINATE_SPACE_CHAR_DOCUMENT` unless explicitly documented
otherwise.
- Offsets are **half-open** `[start, end)` into `raw_text`, matching
`opennlp.tools.util.Span`.
-- The server is solely responsible for converting token-index spans from
`NameFinderME` to character spans before returning.
+- Offset units are explicit: `OpenNlpDocument.offset_encoding` records whether
all spans are UTF-8 byte offsets, Java/OpenNLP UTF-16 code-unit offsets, or
Unicode code-point offsets. `AnalysisOptions.offset_encoding` selects the
response encoding; unset means UTF-8 bytes.
+- The server is solely responsible for converting Java/OpenNLP offsets and
token-index spans into the requested wire offset encoding before returning.
---
@@ -301,7 +302,7 @@ OpenNLP Java APIs mix coordinate systems:
- Classic models: Java-serialized `.bin` in ZIP/JAR (unchanged).
- Models are **never** sent inline in `AnalyzeDocumentRequest`.
-- Server loads from configurable directory/classpath (port sandbox
`model.location`, wildcards).
+- Server loads the default `en-basic` bundle from the classpath via
`DefaultClassPathModelProvider` (the `opennlp-models-*` runtime deps); optional
`model.sentence_detector.path` / `model.tokenizer.path` config keys override
with explicit `.bin` files.
### 6.2 ModelBundleRef and discovery
@@ -310,11 +311,16 @@ OpenNLP Java APIs mix coordinate systems:
```protobuf
message ModelBundleRef {
string bundle_id = 1;
- map<string, string> component_keys = 2;
+ repeated ComponentModelRef component_models = 2;
+}
+
+message ComponentModelRef {
+ ComponentType component_type = 1;
+ string model_hash = 2;
}
```
-Example `component_keys`: `tokenizer`, `sentence_detector`, `pos`,
`ner_person`, `ner_org`, `embed_minilm`, `langdetect`.
+Example `ComponentType` values: `COMPONENT_TYPE_TOKENIZER`,
`COMPONENT_TYPE_SENTENCE_DETECTOR`, `COMPONENT_TYPE_POS_TAGGER`,
`COMPONENT_TYPE_NAME_FINDER`, `COMPONENT_TYPE_EMBEDDER`,
`COMPONENT_TYPE_LANGUAGE_DETECTOR`.
Server config (or a model resolver) maps `bundle_id` → concrete
artifacts/paths. Clients can send only `bundle_id` when using server-defined
profiles.
@@ -326,13 +332,13 @@ Server config (or a model resolver) maps `bundle_id` →
concrete artifacts/path
`ModelBundleInfo` / `ModelDescriptor` (see full proto in 11.2–11.3) are
intended to carry enough metadata for real client discovery:
- `locale` / language.
-- Component types present (e.g. "sentence_detector", "embed").
+- Component types present (for example `COMPONENT_TYPE_SENTENCE_DETECTOR`,
`COMPONENT_TYPE_TOKENIZER`, `COMPONENT_TYPE_EMBEDDER`).
- Supported or typical `PipelineStep` values this bundle is intended to serve.
- Optional free-form capabilities or tags.
Implementations should populate these fields so that a client can list
bundles, filter by language or capability (e.g. "has an embed component"), and
then pick a `bundle_id` or `profile_id`. The exact richness of the descriptors
can grow over time without breaking v1 clients (additive fields only).
-In the sandbox implementation we will start with `ConfiguredModelLoader` +
`DirectoryModelFinder` under `org.apache.opennlp.grpc.model` (drop the sandbox
copy once OPENNLP-1829 ships in the dependency) and extend it for ONNX
embedding artifacts (model + vocab pairs) as first-class bundle components.
+The current sandbox slice resolves a single shared `en-basic` bundle through
`DefaultClassPathModelProvider` (see `org.apache.opennlp.grpc.model.ModelBundleCache`).
Multi-bundle resolution by `bundle_id` / `component_models`, directory/JAR
discovery (e.g. via OPENNLP-1829's `DirectoryModelFinder` once it ships in the
dependency), and ONNX embedding artifacts (model + vocab pairs) as first-class
bundle components are deferred to later phases.
### 6.3 Profiles
@@ -476,7 +482,7 @@ message OpenNlpDocument {
optional float language_confidence = 4;
repeated AnnotatedSentence sentences = 5;
optional DocumentAnalytics analytics = 6;
- map<string, string> metadata = 7;
+ google.protobuf.Struct metadata = 7;
repeated EmbeddingResult embeddings = 8; // denormalized "all embeddings
with spans" view (optional convenience)
optional DocumentClassification classification = 9;
@@ -485,6 +491,9 @@ message OpenNlpDocument {
// Shared linguistic backbone (sentences above) is computed once; each group
// applies its chunking strategy and named embedding models independently.
repeated ChunkEmbeddingGroup chunk_embedding_groups = 10;
+
+ // Unit of every AnnotationSpan start/end offset in this document.
+ OffsetEncoding offset_encoding = 11;
}
// A named, traceable group of chunks produced by one chunking strategy,
@@ -517,14 +526,20 @@ message ChunkEmbeddingGroup {
// and the multiple embedding models attached directly to it.
repeated Chunk chunks = 5;
- // Optional per-group metadata (timing, counts, provenance, etc.).
- map<string, string> metadata = 6;
+ // Optional typed per-group statistics/provenance.
+ optional ChunkGroupStats stats = 6;
// Primary granularity for the chunks/vectors in this group (CHUNK for
// segmentation-style groups, SENTENCE, etc.).
optional EmbeddingGranularity granularity = 7;
}
+message ChunkGroupStats {
+ int32 chunk_count = 1;
+ int32 total_tokens = 2;
+ int64 processing_time_ms = 3;
+}
+
// A chunk (segmentation or otherwise) with its embeddings attached inside.
// This is the "chunk owns its embedding models" shape (chunking first).
message Chunk {
@@ -637,7 +652,7 @@ enum EmbeddingGranularity {
// produce their own group with CHUNK-granularity vectors).
EMBEDDING_GRANULARITY_CHUNK_LEVEL = 3;
// Future: paragraph, section, or custom spans. Consumers should match on
this
- // enum (plus group metadata) rather than string parsing of config ids.
+ // enum (plus the group id/config fields) rather than string parsing of config ids.
reserved 4 to 10;
}
@@ -726,7 +741,27 @@ message AnalysisProfile {
message ModelBundleRef {
string bundle_id = 1;
- map<string, string> component_keys = 2;
+ repeated ComponentModelRef component_models = 2;
+}
+
+message ComponentModelRef {
+ ComponentType component_type = 1;
+ string model_hash = 2;
+}
+
+enum ComponentType {
+ COMPONENT_TYPE_UNSPECIFIED = 0;
+ COMPONENT_TYPE_LANGUAGE_DETECTOR = 1;
+ COMPONENT_TYPE_SENTENCE_DETECTOR = 2;
+ COMPONENT_TYPE_TOKENIZER = 3;
+ COMPONENT_TYPE_POS_TAGGER = 4;
+ COMPONENT_TYPE_NAME_FINDER = 5;
+ COMPONENT_TYPE_CHUNKER = 6;
+ COMPONENT_TYPE_PARSER = 7;
+ COMPONENT_TYPE_LEMMATIZER = 8;
+ COMPONENT_TYPE_DOC_CATEGORIZER = 9;
+ COMPONENT_TYPE_SENTIMENT = 10;
+ COMPONENT_TYPE_EMBEDDER = 11;
}
message AnalysisOptions {
@@ -735,17 +770,18 @@ message AnalysisOptions {
InferenceBackend inference_backend = 3;
optional int32 max_text_length = 4;
optional string onnx_embedding_model_id = 5;
+ OffsetEncoding offset_encoding = 6;
}
message ModelDescriptor {
string hash = 1;
string name = 2;
string locale = 3;
- string component_type = 4;
+ ComponentType component_type = 4;
// Discovery aids (additive; populated by server for ListModelBundles)
repeated string languages = 5; // e.g. ["en", "eng"]
repeated PipelineStep supported_steps = 6;
- map<string, string> attributes = 7; // free-form (e.g. "dim":"384",
"task":"embed")
+ int32 embedding_dimension = 7; // 0 unless this is an embedding
model
}
message ModelBundleInfo {
@@ -999,6 +1035,7 @@ Two chunking strategies, each with explicitly named
embedding models (not an aut
| Version | Date | Changes
|
| ------- | ---------- |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
+| 0.8 | 2026-06-08 | §6: replace the removed
`ConfiguredModelLoader`/`DirectoryModelFinder` + `model.location` scanning
design with the current `DefaultClassPathModelProvider` single-`en-basic`
bundle; note multi-bundle/discovery/ONNX as deferred. |
| 0.7 | 2026-06-08 | Reconcile §11/§12 with the on-disk `*_v1.proto`
sources: `CharSpan`→`AnnotationSpan` (canonical; the coordinate space, not the
name, distinguishes char vs. token offsets), `ChunkingSpec` field names
(`algorithm`/`chunk_size`/`chunk_overlap`) + document `SemanticChunkingConfig`,
`optional clear_adaptive_data`, full-path proto imports and `*_v1.proto`
filenames. |
| 0.6 | 2026-06-07 | Add `buf.yaml`; fix Buf STANDARD lint (enum value
prefixes, remove unused import). |
| 0.5 | 2026-06-06 | Expand conversational Summary: motivation, what the
document-centric gRPC API unlocks (polyglot integration, streaming, shared
infrastructure, search/RAG). |
diff --git a/opennlp-grpc/docs/rfc/opennlp-grpc-jira-proposal.md
b/opennlp-grpc/docs/rfc/opennlp-grpc-jira-proposal.md
index 0e27028b..8974d049 100644
--- a/opennlp-grpc/docs/rfc/opennlp-grpc-jira-proposal.md
+++ b/opennlp-grpc/docs/rfc/opennlp-grpc-jira-proposal.md
@@ -67,7 +67,7 @@ Evolve the sandbox POC into ASF-native modules (target: main
repo after consensu
### Design highlights
1. **Three proto layers (NLP-only):** domain types (`OpenNlpDocument`),
pipeline config (`AnalysisProfile`), service (`OpenNlpAnalysisService`)
-2. **Offset contract:** All exported spans use **character offsets in the
original `raw_text`** (`CHAR_DOCUMENT`), half-open `[start, end)` matching
`opennlp.tools.util.Span`
+2. **Offset contract:** All exported spans are half-open `[start, end)` ranges
in the original `raw_text`; `CoordinateSpace` says what the range is relative
to, and `OffsetEncoding` says whether the units are UTF-8 bytes, UTF-16 code
units, or Unicode code points
3. **Model bundles:** Replace per-RPC `model_hash` with `ModelBundleRef` +
server-defined profiles (reuse sandbox model discovery patterns)
4. **Thread safety:** Leverage OpenNLP 3.0 thread-safe `*ME` instances cached
per model bundle
@@ -85,6 +85,8 @@ package org.apache.opennlp.grpc.v1;
option java_package = "org.apache.opennlp.grpc.v1";
option java_multiple_files = true;
+import "google/protobuf/struct.proto";
+
// --- Layer 1: Document ---
message OpenNlpDocument {
@@ -93,38 +95,48 @@ message OpenNlpDocument {
optional string detected_language = 3;
optional float language_confidence = 4;
repeated AnnotatedSentence sentences = 5;
- map<string, string> metadata = 6;
+ optional DocumentAnalytics analytics = 6;
+ google.protobuf.Struct metadata = 7;
+ OffsetEncoding offset_encoding = 11;
}
message AnnotatedSentence {
- CharSpan sentence_span = 1;
+ AnnotationSpan sentence_span = 1;
repeated Token tokens = 2;
repeated NamedEntity entities = 3;
}
message Token {
string text = 1;
- CharSpan char_span = 2;
+ AnnotationSpan annotation_span = 2;
optional string pos_tag = 3;
}
message NamedEntity {
- CharSpan char_span = 1;
+ AnnotationSpan annotation_span = 1;
string entity_type = 2;
- optional double prob = 3;
+ optional double probability = 3;
}
-message CharSpan {
+message AnnotationSpan {
int32 start = 1;
int32 end = 2;
CoordinateSpace space = 3;
optional string type = 4;
- optional double prob = 5;
+ optional double probability = 5;
}
enum CoordinateSpace {
COORDINATE_SPACE_UNSPECIFIED = 0;
- CHAR_DOCUMENT = 1;
+ COORDINATE_SPACE_CHAR_DOCUMENT = 1;
+ COORDINATE_SPACE_TOKEN_SENTENCE = 2;
+}
+
+enum OffsetEncoding {
+ OFFSET_ENCODING_UNSPECIFIED = 0;
+ OFFSET_ENCODING_UTF8_BYTE = 1;
+ OFFSET_ENCODING_UTF16_CODE_UNIT = 2;
+ OFFSET_ENCODING_UNICODE_CODE_POINT = 3;
}
// --- Layer 2: Pipeline ---
diff --git
a/opennlp-grpc/opennlp-grpc-api/src/main/proto/org/apache/opennlp/grpc/v1/opennlp_document_v1.proto
b/opennlp-grpc/opennlp-grpc-api/src/main/proto/org/apache/opennlp/grpc/v1/opennlp_document_v1.proto
index 442bf313..bb6b62b0 100644
---
a/opennlp-grpc/opennlp-grpc-api/src/main/proto/org/apache/opennlp/grpc/v1/opennlp_document_v1.proto
+++
b/opennlp-grpc/opennlp-grpc-api/src/main/proto/org/apache/opennlp/grpc/v1/opennlp_document_v1.proto
@@ -22,6 +22,8 @@ package org.apache.opennlp.grpc.v1;
option java_package = "org.apache.opennlp.grpc.v1";
option java_multiple_files = true;
+import "google/protobuf/struct.proto";
+
// Canonical 1:1 NLP document: text in, annotations out.
// The sentences list provides the shared base linguistic analysis
// (computed once even when multiple chunk+embed groups are requested).
@@ -32,7 +34,9 @@ message OpenNlpDocument {
optional float language_confidence = 4;
repeated AnnotatedSentence sentences = 5;
optional DocumentAnalytics analytics = 6;
- map<string, string> metadata = 7;
+ // Opaque client-supplied metadata, echoed back verbatim. The server never
+ // writes first-class results here; it is a passthrough lane only.
+ google.protobuf.Struct metadata = 7;
repeated EmbeddingResult embeddings = 8; // denormalized convenience "all
vectors + spans"
optional DocumentClassification classification = 9;
@@ -40,6 +44,10 @@ message OpenNlpDocument {
// from one analysis. Each group corresponds to one chunking strategy with
// its explicitly requested embedding models attached inside the chunks.
repeated ChunkEmbeddingGroup chunk_embedding_groups = 10;
+
+ // Unit of every AnnotationSpan start/end offset in this document (uniform).
+ // Echoes the requested AnalysisOptions.offset_encoding; defaults to UTF-8
bytes.
+ OffsetEncoding offset_encoding = 11;
}
message AnnotatedSentence {
@@ -84,6 +92,17 @@ enum CoordinateSpace {
COORDINATE_SPACE_TOKEN_SENTENCE = 2;
}
+// Unit that AnnotationSpan.start/end are measured in. Orthogonal to
+// CoordinateSpace (which says what the offset is relative to). The wire
+// default is UTF-8 bytes, aligning with the protobuf encoding of raw_text so
+// non-JVM clients slice correctly without conversion.
+enum OffsetEncoding {
+ OFFSET_ENCODING_UNSPECIFIED = 0;
+ OFFSET_ENCODING_UTF8_BYTE = 1; // default; matches the raw_text
wire bytes
+ OFFSET_ENCODING_UTF16_CODE_UNIT = 2; // Java/OpenNLP native
+ OFFSET_ENCODING_UNICODE_CODE_POINT = 3; // language-neutral codepoints
+}
+
message DocumentAnalytics {
int32 total_tokens = 1;
int32 total_sentences = 2;
@@ -131,10 +150,17 @@ message ChunkEmbeddingGroup {
repeated string embedding_model_ids = 3; // exactly as named for this
strategy
optional string result_set_name = 4;
repeated Chunk chunks = 5;
- map<string, string> metadata = 6;
+ optional ChunkGroupStats stats = 6;
optional EmbeddingGranularity granularity = 7;
}
+// Typed per-group statistics/provenance (replaces an untyped metadata map).
+message ChunkGroupStats {
+ int32 chunk_count = 1;
+ int32 total_tokens = 2;
+ int64 processing_time_ms = 3;
+}
+
message ParseTree {
ParseNode root = 1;
}
diff --git
a/opennlp-grpc/opennlp-grpc-api/src/main/proto/org/apache/opennlp/grpc/v1/opennlp_pipeline_v1.proto
b/opennlp-grpc/opennlp-grpc-api/src/main/proto/org/apache/opennlp/grpc/v1/opennlp_pipeline_v1.proto
index 780b8855..5b25095d 100644
---
a/opennlp-grpc/opennlp-grpc-api/src/main/proto/org/apache/opennlp/grpc/v1/opennlp_pipeline_v1.proto
+++
b/opennlp-grpc/opennlp-grpc-api/src/main/proto/org/apache/opennlp/grpc/v1/opennlp_pipeline_v1.proto
@@ -22,6 +22,8 @@ package org.apache.opennlp.grpc.v1;
option java_package = "org.apache.opennlp.grpc.v1";
option java_multiple_files = true;
+import "org/apache/opennlp/grpc/v1/opennlp_document_v1.proto";
+
// Pipeline steps supported by AnalyzeDocument (and future streaming variants).
enum PipelineStep {
PIPELINE_STEP_UNSPECIFIED = 0;
@@ -91,7 +93,28 @@ message AnalysisProfile {
message ModelBundleRef {
string bundle_id = 1;
- map<string, string> component_keys = 2;
+ repeated ComponentModelRef component_models = 2;
+}
+
+// Binds a pipeline component to a specific model artifact (by SHA-256 hash).
+message ComponentModelRef {
+ ComponentType component_type = 1;
+ string model_hash = 2;
+}
+
+enum ComponentType {
+ COMPONENT_TYPE_UNSPECIFIED = 0;
+ COMPONENT_TYPE_LANGUAGE_DETECTOR = 1;
+ COMPONENT_TYPE_SENTENCE_DETECTOR = 2;
+ COMPONENT_TYPE_TOKENIZER = 3;
+ COMPONENT_TYPE_POS_TAGGER = 4;
+ COMPONENT_TYPE_NAME_FINDER = 5;
+ COMPONENT_TYPE_CHUNKER = 6;
+ COMPONENT_TYPE_PARSER = 7;
+ COMPONENT_TYPE_LEMMATIZER = 8;
+ COMPONENT_TYPE_DOC_CATEGORIZER = 9;
+ COMPONENT_TYPE_SENTIMENT = 10;
+ COMPONENT_TYPE_EMBEDDER = 11;
}
message AnalysisOptions {
@@ -101,16 +124,18 @@ message AnalysisOptions {
InferenceBackend inference_backend = 3;
optional int32 max_text_length = 4;
optional string onnx_embedding_model_id = 5;
+ // Unit for response AnnotationSpan offsets. Unset => UTF-8 bytes (the
default).
+ OffsetEncoding offset_encoding = 6;
}
message ModelDescriptor {
string hash = 1;
string name = 2;
string locale = 3;
- string component_type = 4;
+ ComponentType component_type = 4;
repeated string languages = 5;
repeated PipelineStep supported_steps = 6;
- map<string, string> attributes = 7;
+ int32 embedding_dimension = 7; // 0 unless this is an embedding model
}
message ModelBundleInfo {
diff --git a/opennlp-grpc/opennlp-grpc-service/pom.xml
b/opennlp-grpc/opennlp-grpc-service/pom.xml
index 6312e991..4e2400fb 100644
--- a/opennlp-grpc/opennlp-grpc-service/pom.xml
+++ b/opennlp-grpc/opennlp-grpc-service/pom.xml
@@ -163,6 +163,24 @@
<configuration>
<finalName>opennlp-grpc-server-${project.version}</finalName>
<createDependencyReducedPom>false</createDependencyReducedPom>
+ <filters>
+ <filter>
+ <artifact>*:*</artifact>
+ <excludes>
+ <exclude>META-INF/*.SF</exclude>
+ <exclude>META-INF/*.DSA</exclude>
+ <exclude>META-INF/*.RSA</exclude>
+
<exclude>META-INF/DEPENDENCIES</exclude>
+ <exclude>META-INF/LICENSE</exclude>
+ <exclude>META-INF/LICENSE.txt</exclude>
+ <exclude>META-INF/MANIFEST.MF</exclude>
+ <exclude>META-INF/NOTICE</exclude>
+
<exclude>META-INF/versions/*/module-info.class</exclude>
+ <exclude>module-info.class</exclude>
+ <exclude>model.properties</exclude>
+ </excludes>
+ </filter>
+ </filters>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"
/>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
diff --git
a/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/model/ModelBundleCache.java
b/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/model/ModelBundleCache.java
index dc6e506c..0b48e1f7 100644
---
a/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/model/ModelBundleCache.java
+++
b/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/model/ModelBundleCache.java
@@ -29,14 +29,13 @@ import java.util.Objects;
import opennlp.tools.models.ClassPathModelProvider;
import opennlp.tools.models.DefaultClassPathModelProvider;
import opennlp.tools.models.ModelType;
-import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
-import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import org.apache.opennlp.grpc.profile.ProfileRegistry;
import org.apache.opennlp.grpc.processor.AnalysisException;
+import org.apache.opennlp.grpc.v1.ComponentType;
import org.apache.opennlp.grpc.v1.ModelBundleInfo;
import org.apache.opennlp.grpc.v1.ModelDescriptor;
import org.apache.opennlp.grpc.v1.PipelineStep;
@@ -55,8 +54,8 @@ public final class ModelBundleCache {
private final ClassPathModelProvider modelProvider;
private final Map<String, ModelBundleInfo> bundles;
- private final SentenceDetector sentenceDetector;
- private final Tokenizer tokenizer;
+ private final SentenceDetectorME sentenceDetector;
+ private final TokenizerME tokenizer;
public ModelBundleCache(Map<String, String> configuration) {
Objects.requireNonNull(configuration, "configuration");
@@ -66,11 +65,11 @@ public final class ModelBundleCache {
this.bundles = buildBundleCatalog();
}
- public SentenceDetector getSentenceDetector() {
+ public SentenceDetectorME getSentenceDetector() {
return sentenceDetector;
}
- public Tokenizer getTokenizer() {
+ public TokenizerME getTokenizer() {
return tokenizer;
}
@@ -78,7 +77,7 @@ public final class ModelBundleCache {
return new ArrayList<>(bundles.values());
}
- private SentenceDetector loadSentenceDetector(Map<String, String>
configuration) {
+ private SentenceDetectorME loadSentenceDetector(Map<String, String>
configuration) {
try {
final String configuredPath = configuration.get(KEY_SENTDETECT_PATH);
final SentenceModel model;
@@ -99,7 +98,7 @@ public final class ModelBundleCache {
}
}
- private Tokenizer loadTokenizer(Map<String, String> configuration) {
+ private TokenizerME loadTokenizer(Map<String, String> configuration) {
try {
final String configuredPath = configuration.get(KEY_TOKENIZER_PATH);
final TokenizerModel model;
@@ -130,13 +129,13 @@ public final class ModelBundleCache {
.addModels(ModelDescriptor.newBuilder()
.setName("opennlp-models-sentdetect-" + DEFAULT_LANGUAGE)
.setLocale(DEFAULT_LANGUAGE)
- .setComponentType("sentdetect")
+ .setComponentType(ComponentType.COMPONENT_TYPE_SENTENCE_DETECTOR)
.addLanguages(DEFAULT_LANGUAGE)
.build())
.addModels(ModelDescriptor.newBuilder()
.setName("opennlp-models-tokenizer-" + DEFAULT_LANGUAGE)
.setLocale(DEFAULT_LANGUAGE)
- .setComponentType("tokenizer")
+ .setComponentType(ComponentType.COMPONENT_TYPE_TOKENIZER)
.addLanguages(DEFAULT_LANGUAGE)
.build())
.build());
diff --git
a/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/processor/AnalysisException.java
b/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/processor/AnalysisException.java
index 0c4c3c3f..37162a34 100644
---
a/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/processor/AnalysisException.java
+++
b/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/processor/AnalysisException.java
@@ -23,6 +23,8 @@ package org.apache.opennlp.grpc.processor;
*/
public final class AnalysisException extends RuntimeException {
+ private static final long serialVersionUID = 1L;
+
public enum FailureType {
/** Client supplied an invalid request. */
INVALID_ARGUMENT,
diff --git
a/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/processor/BasicDocumentAnalyzer.java
b/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/processor/BasicDocumentAnalyzer.java
index 7165d6e1..2df3f089 100644
---
a/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/processor/BasicDocumentAnalyzer.java
+++
b/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/processor/BasicDocumentAnalyzer.java
@@ -22,12 +22,13 @@ import java.util.List;
import java.util.Map;
import java.util.Objects;
-import opennlp.tools.sentdetect.SentenceDetector;
-import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.util.Span;
import org.apache.opennlp.grpc.model.ModelBundleCache;
import org.apache.opennlp.grpc.profile.ProfileRegistry;
import org.apache.opennlp.grpc.profile.ProfileResolver;
+import org.apache.opennlp.grpc.v1.AnalysisOptions;
import org.apache.opennlp.grpc.v1.AnalysisProfile;
import org.apache.opennlp.grpc.v1.AnalyzeDocumentRequest;
import org.apache.opennlp.grpc.v1.AnalyzeDocumentResponse;
@@ -36,6 +37,9 @@ import org.apache.opennlp.grpc.v1.AnnotationSpan;
import org.apache.opennlp.grpc.v1.ChunkEmbedConfigEntry;
import org.apache.opennlp.grpc.v1.CoordinateSpace;
import org.apache.opennlp.grpc.v1.DiagnosticSeverity;
+import org.apache.opennlp.grpc.v1.InferenceBackend;
+import org.apache.opennlp.grpc.v1.ModelBundleRef;
+import org.apache.opennlp.grpc.v1.OffsetEncoding;
import org.apache.opennlp.grpc.v1.OpenNlpDocument;
import org.apache.opennlp.grpc.v1.PipelineStep;
import org.apache.opennlp.grpc.v1.ProcessingDiagnostic;
@@ -44,6 +48,9 @@ import org.apache.opennlp.grpc.v1.Token;
/**
* Initial v1 processor: shared sentence detection and tokenization backbone.
+ *
+ * <p>Internally all offsets are computed in Java UTF-16 indices; a final pass
converts
+ * every span to the client-requested {@link OffsetEncoding} (default UTF-8
bytes).
*/
public class BasicDocumentAnalyzer implements DocumentAnalyzer {
@@ -75,19 +82,24 @@ public class BasicDocumentAnalyzer implements
DocumentAnalyzer {
}
final AnalysisProfile profile = profileResolver.resolve(request);
- validateSupportedRequest(request, profile);
+ validateSupportedRequest(request, profile, rawText);
+
+ final boolean includeProbabilities =
+ request.hasOptions() && request.getOptions().getIncludeProbabilities();
final List<ProcessingDiagnostic> diagnostics = new ArrayList<>();
final OpenNlpDocument.Builder document = OpenNlpDocument.newBuilder()
.setDocId(input.getDocId())
- .setRawText(rawText)
- .putAllMetadata(input.getMetadataMap());
+ .setRawText(rawText);
+ if (input.hasMetadata()) {
+ document.setMetadata(input.getMetadata());
+ }
if (PipelineStepPolicy.shouldRun(profile,
PipelineStep.PIPELINE_STEP_SENTENCE_DETECT)) {
runStep(
PipelineStep.PIPELINE_STEP_SENTENCE_DETECT,
diagnostics,
- () -> runSentenceDetection(rawText, document, diagnostics));
+ () -> runSentenceDetection(rawText, document, includeProbabilities,
diagnostics));
} else {
addSkippedDiagnostic(diagnostics,
PipelineStep.PIPELINE_STEP_SENTENCE_DETECT);
}
@@ -102,18 +114,24 @@ public class BasicDocumentAnalyzer implements
DocumentAnalyzer {
runStep(
PipelineStep.PIPELINE_STEP_TOKENIZE,
diagnostics,
- () -> runTokenization(rawText, document, diagnostics));
+ () -> runTokenization(rawText, document, includeProbabilities,
diagnostics));
} else {
addSkippedDiagnostic(diagnostics, PipelineStep.PIPELINE_STEP_TOKENIZE);
}
+ final OffsetEncoding requestedEncoding = request.hasOptions()
+ ? request.getOptions().getOffsetEncoding()
+ : OffsetEncoding.OFFSET_ENCODING_UNSPECIFIED;
+ applyOffsetEncoding(document, rawText, requestedEncoding);
+
return AnalyzeDocumentResponse.newBuilder()
.setDocument(document.build())
.addAllDiagnostics(diagnostics)
.build();
}
- private static void validateSupportedRequest(AnalyzeDocumentRequest request,
AnalysisProfile profile) {
+ private static void validateSupportedRequest(
+ AnalyzeDocumentRequest request, AnalysisProfile profile, String rawText)
{
for (PipelineStep step : profile.getStepsList()) {
if (step == PipelineStep.PIPELINE_STEP_UNSPECIFIED) {
continue;
@@ -123,16 +141,58 @@ public class BasicDocumentAnalyzer implements
DocumentAnalyzer {
}
}
+ validateOptions(request, rawText);
+ validateModelBundle(profile);
+
if (request.getChunkEmbedConfigsCount() == 0) {
return;
}
-
for (ChunkEmbedConfigEntry entry : request.getChunkEmbedConfigsList()) {
validateSemanticChunking(entry);
}
throw AnalysisException.unimplemented("chunk_embed_configs are not
implemented on this server");
}
+ private static void validateOptions(AnalyzeDocumentRequest request, String
rawText) {
+ if (!request.hasOptions()) {
+ return;
+ }
+ final AnalysisOptions options = request.getOptions();
+ final InferenceBackend backend = options.getInferenceBackend();
+ if (backend != InferenceBackend.INFERENCE_BACKEND_UNSPECIFIED
+ && backend != InferenceBackend.INFERENCE_BACKEND_OPENNLP_ME) {
+ throw AnalysisException.unimplemented(
+ "inference_backend " + backend.name() + " is not implemented; only
OPENNLP_ME is supported");
+ }
+ if (options.hasOnnxEmbeddingModelId() &&
!options.getOnnxEmbeddingModelId().isBlank()) {
+ throw AnalysisException.unimplemented(
+ "onnx_embedding_model_id is not implemented (no EMBED step on this
server)");
+ }
+ if (options.hasMaxTextLength()
+ && options.getMaxTextLength() > 0
+ && rawText.length() > options.getMaxTextLength()) {
+ throw AnalysisException.invalidArgument(
+ "document.raw_text exceeds max_text_length (" +
options.getMaxTextLength() + ")");
+ }
+ }
+
+ private static void validateModelBundle(AnalysisProfile profile) {
+ if (!profile.hasModelBundle()) {
+ return;
+ }
+ final ModelBundleRef bundle = profile.getModelBundle();
+ final String bundleId = bundle.getBundleId();
+ if (!bundleId.isBlank() &&
!bundleId.equals(ProfileRegistry.DEFAULT_BUNDLE_ID)) {
+ throw AnalysisException.notFound(
+ "Unknown model bundle '" + bundleId + "'; only '"
+ + ProfileRegistry.DEFAULT_BUNDLE_ID + "' is available");
+ }
+ if (bundle.getComponentModelsCount() > 0) {
+ throw AnalysisException.unimplemented(
+ "per-component model selection (component_models) is not
implemented");
+ }
+ }
+
private static void validateSemanticChunking(ChunkEmbedConfigEntry entry) {
if (!entry.hasChunking() || !entry.getChunking().hasSemanticConfig()) {
return;
@@ -164,13 +224,17 @@ public class BasicDocumentAnalyzer implements
DocumentAnalyzer {
private void runSentenceDetection(
String rawText,
OpenNlpDocument.Builder document,
+ boolean includeProbabilities,
List<ProcessingDiagnostic> diagnostics) {
- final SentenceDetector detector = modelBundleCache.getSentenceDetector();
+ final SentenceDetectorME detector = modelBundleCache.getSentenceDetector();
final Span[] spans = detector.sentPosDetect(rawText);
- for (Span span : spans) {
- document.addSentences(AnnotatedSentence.newBuilder()
- .setSentenceSpan(toAnnotationSpan(span))
- .build());
+ final double[] probabilities = includeProbabilities ? detector.probs() :
null;
+ for (int i = 0; i < spans.length; i++) {
+ final AnnotationSpan.Builder span =
toAnnotationSpan(spans[i]).toBuilder();
+ if (probabilities != null && i < probabilities.length) {
+ span.setProbability(probabilities[i]);
+ }
+
document.addSentences(AnnotatedSentence.newBuilder().setSentenceSpan(span).build());
}
diagnostics.add(ProcessingDiagnostic.newBuilder()
.setStep(PipelineStep.PIPELINE_STEP_SENTENCE_DETECT)
@@ -182,23 +246,29 @@ public class BasicDocumentAnalyzer implements
DocumentAnalyzer {
private void runTokenization(
String rawText,
OpenNlpDocument.Builder document,
+ boolean includeProbabilities,
List<ProcessingDiagnostic> diagnostics) {
- final Tokenizer tokenizer = modelBundleCache.getTokenizer();
+ final TokenizerME tokenizer = modelBundleCache.getTokenizer();
int tokenCount = 0;
for (int i = 0; i < document.getSentencesCount(); i++) {
final AnnotatedSentence sentence = document.getSentences(i);
final AnnotationSpan sentenceSpan = sentence.getSentenceSpan();
final String sentenceText = rawText.substring(sentenceSpan.getStart(),
sentenceSpan.getEnd());
final Span[] tokenSpans = tokenizer.tokenizePos(sentenceText);
+ final double[] probabilities = includeProbabilities ? tokenizer.probs()
: null;
final AnnotatedSentence.Builder sentenceBuilder = sentence.toBuilder();
- for (Span tokenSpan : tokenSpans) {
+ for (int t = 0; t < tokenSpans.length; t++) {
+ final Span tokenSpan = tokenSpans[t];
+ final AnnotationSpan.Builder span = AnnotationSpan.newBuilder()
+ .setStart(sentenceSpan.getStart() + tokenSpan.getStart())
+ .setEnd(sentenceSpan.getStart() + tokenSpan.getEnd())
+ .setSpace(CoordinateSpace.COORDINATE_SPACE_CHAR_DOCUMENT);
+ if (probabilities != null && t < probabilities.length) {
+ span.setProbability(probabilities[t]);
+ }
sentenceBuilder.addTokens(Token.newBuilder()
.setText(sentenceText.substring(tokenSpan.getStart(),
tokenSpan.getEnd()))
- .setAnnotationSpan(AnnotationSpan.newBuilder()
- .setStart(sentenceSpan.getStart() + tokenSpan.getStart())
- .setEnd(sentenceSpan.getStart() + tokenSpan.getEnd())
- .setSpace(CoordinateSpace.COORDINATE_SPACE_CHAR_DOCUMENT)
- .build())
+ .setAnnotationSpan(span)
.build());
tokenCount++;
}
@@ -211,6 +281,33 @@ public class BasicDocumentAnalyzer implements
DocumentAnalyzer {
.build());
}
+ /**
+ * Converts every span in the document from Java UTF-16 indices to the
+ * requested {@link OffsetEncoding} and records the chosen encoding on the
+ * document.
+ *
+ * @param document the document builder whose sentence and token spans are
+ * rewritten in place
+ * @param rawText the text the current span offsets index into
+ * @param requested the encoding asked for by the client; resolved by
+ * {@link OffsetMapper#forText}
+ */
+ private static void applyOffsetEncoding(
+ OpenNlpDocument.Builder document, String rawText, OffsetEncoding
requested) {
+ final OffsetMapper mapper = OffsetMapper.forText(rawText, requested);
+ for (int i = 0; i < document.getSentencesCount(); i++) {
+ // Copy the sentence to a builder, remap it, then write it back at index i.
+ final AnnotatedSentence.Builder sentence =
document.getSentences(i).toBuilder();
+ sentence.setSentenceSpan(remap(sentence.getSentenceSpan(), mapper));
+ for (int t = 0; t < sentence.getTokensCount(); t++) {
+ final Token.Builder token = sentence.getTokens(t).toBuilder();
+ token.setAnnotationSpan(remap(token.getAnnotationSpan(), mapper));
+ sentence.setTokens(t, token.build());
+ }
+ document.setSentences(i, sentence.build());
+ }
+ // Store the mapper's resolved encoding rather than the raw requested value.
+ document.setOffsetEncoding(mapper.encoding());
+ }
+
+ /**
+ * Returns a copy of {@code span} whose start and end have been passed through
+ * {@code mapper}. The copy is built from the original span's builder, and
+ * only start and end are overwritten.
+ */
+ private static AnnotationSpan remap(AnnotationSpan span, OffsetMapper
mapper) {
+ return span.toBuilder()
+ .setStart(mapper.toTarget(span.getStart()))
+ .setEnd(mapper.toTarget(span.getEnd()))
+ .build();
+ }
+
private static void addSkippedDiagnostic(List<ProcessingDiagnostic>
diagnostics, PipelineStep step) {
diagnostics.add(ProcessingDiagnostic.newBuilder()
.setStep(step)
diff --git
a/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/processor/OffsetMapper.java
b/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/processor/OffsetMapper.java
new file mode 100644
index 00000000..5bf686f2
--- /dev/null
+++
b/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/processor/OffsetMapper.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the specific
+ * language governing permissions and limitations under the License.
+ */
+package org.apache.opennlp.grpc.processor;
+
+import org.apache.opennlp.grpc.v1.OffsetEncoding;
+
+/**
+ * Converts annotation offsets from Java/OpenNLP UTF-16 code-unit indices into
+ * the {@link OffsetEncoding} requested by the client. The wire default is
+ * UTF-8 bytes, which aligns with the protobuf encoding of {@code raw_text} so
+ * non-JVM clients slice the text correctly without conversion.
+ *
+ * <p>Span boundaries produced by OpenNLP always fall on Unicode code-point
+ * boundaries, so a single prefix table keyed by Java char index is sufficient.
+ *
+ * <p>The table is built once per text in {@link #forText}; each call to
+ * {@link #toTarget} is a single array read.
+ */
+final class OffsetMapper {
+
+ // Entry i holds the target-encoding offset of Java char index i. The array
+ // has text.length() + 1 entries so the end-of-text index can be mapped too.
+ private final int[] javaIndexToTarget;
+ // Encoding the table was built for; forText passes the resolved value here.
+ private final OffsetEncoding encoding;
+
+ // Private: instances are created through forText.
+ private OffsetMapper(int[] javaIndexToTarget, OffsetEncoding encoding) {
+ this.javaIndexToTarget = javaIndexToTarget;
+ this.encoding = encoding;
+ }
+
+ /**
+ * Resolves the requested encoding, mapping {@code null} and
+ * {@code UNSPECIFIED} to the UTF-8 byte default. Any other value is returned
+ * unchanged.
+ */
+ static OffsetEncoding resolve(OffsetEncoding requested) {
+ return requested == null || requested ==
OffsetEncoding.OFFSET_ENCODING_UNSPECIFIED
+ ? OffsetEncoding.OFFSET_ENCODING_UTF8_BYTE
+ : requested;
+ }
+
+ /**
+ * Builds a mapper for {@code text} by walking it one code point at a time and
+ * accumulating the running offset in the resolved target encoding.
+ *
+ * <p>NOTE(review): {@code codePointAt} returns an unpaired surrogate as its
+ * own value, which {@code utf8ByteLength} counts as 3 bytes. Confirm that
+ * text with unpaired surrogates cannot reach this method.
+ *
+ * @param text the text whose Java char indices will be mapped
+ * @param requested the encoding asked for; passed through {@link #resolve}
+ * @return a mapper covering indices {@code 0} through {@code text.length()}
+ */
+ static OffsetMapper forText(String text, OffsetEncoding requested) {
+ final OffsetEncoding resolved = resolve(requested);
+ final int length = text.length();
+ final int[] map = new int[length + 1];
+ int target = 0;
+ int i = 0;
+ while (i < length) {
+ final int codePoint = text.codePointAt(i);
+ final int charCount = Character.charCount(codePoint);
+ map[i] = target;
+ if (charCount == 2) {
+ // Low surrogate index shares the code point start; never a span
+ // boundary.
+ map[i + 1] = target;
+ }
+ target += unitsFor(codePoint, resolved, charCount);
+ i += charCount;
+ }
+ // The end-of-text index maps to the total length in the target encoding.
+ map[length] = target;
+ return new OffsetMapper(map, resolved);
+ }
+
+ /** Returns the resolved encoding this mapper converts to. */
+ OffsetEncoding encoding() {
+ return encoding;
+ }
+
+ /**
+ * Maps a Java char index to its offset in the target encoding.
+ *
+ * @param javaIndex an index from {@code 0} to the text length, inclusive
+ * @return the corresponding offset in the target encoding
+ * @throws ArrayIndexOutOfBoundsException if {@code javaIndex} is outside the
+ * table, since no range check is performed
+ */
+ int toTarget(int javaIndex) {
+ return javaIndexToTarget[javaIndex];
+ }
+
+ /**
+ * Returns how many target-encoding units one code point occupies: its Java
+ * char count for UTF-16, 1 for code points, and its UTF-8 byte length for
+ * every other encoding value (the {@code default} branch).
+ */
+ private static int unitsFor(int codePoint, OffsetEncoding encoding, int
charCount) {
+ return switch (encoding) {
+ case OFFSET_ENCODING_UTF16_CODE_UNIT -> charCount;
+ case OFFSET_ENCODING_UNICODE_CODE_POINT -> 1;
+ default -> utf8ByteLength(codePoint);
+ };
+ }
+
+ /**
+ * Returns the UTF-8 length of a code point by range: 1 byte below 0x80,
+ * 2 below 0x800, 3 below 0x10000, otherwise 4.
+ */
+ private static int utf8ByteLength(int codePoint) {
+ if (codePoint < 0x80) {
+ return 1;
+ }
+ if (codePoint < 0x800) {
+ return 2;
+ }
+ if (codePoint < 0x10000) {
+ return 3;
+ }
+ return 4;
+ }
+}
diff --git
a/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/profile/ProfileResolver.java
b/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/profile/ProfileResolver.java
index 92f52c82..db075211 100644
---
a/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/profile/ProfileResolver.java
+++
b/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/profile/ProfileResolver.java
@@ -48,9 +48,7 @@ public final class ProfileResolver {
final boolean hasProfileId = request.hasProfileId() &&
!request.getProfileId().isBlank();
final boolean hasInlineProfile = request.hasProfile();
final AnalysisProfile inline = hasInlineProfile ? request.getProfile() :
null;
- final boolean inlineHasSteps = inline != null && inline.getStepsCount() >
0;
-
- if (hasProfileId && hasInlineProfile && inlineHasSteps) {
+ if (hasProfileId && hasInlineProfile) {
final AnalysisProfile serverProfile =
registry.find(request.getProfileId())
.orElseThrow(() -> AnalysisException.notFound("Unknown profile_id: "
+ request.getProfileId()));
return merge(serverProfile, inline);
@@ -61,7 +59,7 @@ public final class ProfileResolver {
.orElseThrow(() -> AnalysisException.notFound("Unknown profile_id: "
+ request.getProfileId()));
}
- if (hasInlineProfile && inlineHasSteps) {
+ if (hasInlineProfile && inline.getStepsCount() > 0) {
return inline;
}
diff --git
a/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/v1/server/OpenNlpAnalysisServiceImpl.java
b/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/v1/server/OpenNlpAnalysisServiceImpl.java
index 702ed7d0..214bb793 100644
---
a/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/v1/server/OpenNlpAnalysisServiceImpl.java
+++
b/opennlp-grpc/opennlp-grpc-service/src/main/java/org/apache/opennlp/grpc/v1/server/OpenNlpAnalysisServiceImpl.java
@@ -33,7 +33,6 @@ import org.apache.opennlp.grpc.v1.GetServiceInfoResponse;
import org.apache.opennlp.grpc.v1.ListModelBundlesRequest;
import org.apache.opennlp.grpc.v1.ListModelBundlesResponse;
import org.apache.opennlp.grpc.v1.OpenNlpAnalysisServiceGrpc;
-import org.apache.opennlp.grpc.v1.PipelineStep;
/**
* gRPC adapter for the v1 document-centric API.
diff --git a/opennlp-grpc/opennlp-grpc-service/src/main/resources/log4j2.xml
b/opennlp-grpc/opennlp-grpc-service/src/main/resources/log4j2.xml
index 081ebdbc..99bbb2f0 100644
--- a/opennlp-grpc/opennlp-grpc-service/src/main/resources/log4j2.xml
+++ b/opennlp-grpc/opennlp-grpc-service/src/main/resources/log4j2.xml
@@ -23,12 +23,15 @@
<!--
The pattern can be adjusted as needed, see
https://logging.apache.org/log4j/2.x/manual/layouts.html
-->
- <PatternLayout pattern="%m%n"/>
+ <PatternLayout pattern="%d{ISO8601} %-5level [%t] %c{1} - %m%n"/>
</Console>
</Appenders>
<Loggers>
- <Root level="DEBUG">
+ <!-- gRPC/Netty emit per-frame DEBUG traffic; keep them quiet by
default. -->
+ <Logger name="io.grpc" level="WARN"/>
+ <Logger name="io.netty" level="WARN"/>
+ <Root level="INFO">
<AppenderRef ref="STDOUT"/>
</Root>
</Loggers>
diff --git
a/opennlp-grpc/opennlp-grpc-service/src/test/java/org/apache/opennlp/grpc/processor/BasicDocumentAnalyzerPolicyTest.java
b/opennlp-grpc/opennlp-grpc-service/src/test/java/org/apache/opennlp/grpc/processor/BasicDocumentAnalyzerPolicyTest.java
index a49f60c5..da40e5ec 100644
---
a/opennlp-grpc/opennlp-grpc-service/src/test/java/org/apache/opennlp/grpc/processor/BasicDocumentAnalyzerPolicyTest.java
+++
b/opennlp-grpc/opennlp-grpc-service/src/test/java/org/apache/opennlp/grpc/processor/BasicDocumentAnalyzerPolicyTest.java
@@ -19,11 +19,14 @@ package org.apache.opennlp.grpc.processor;
import java.util.Map;
+import org.apache.opennlp.grpc.v1.AnalysisOptions;
import org.apache.opennlp.grpc.v1.AnalysisProfile;
import org.apache.opennlp.grpc.v1.AnalyzeDocumentRequest;
import org.apache.opennlp.grpc.v1.ChunkEmbedConfigEntry;
import org.apache.opennlp.grpc.v1.ChunkingSpec;
import org.apache.opennlp.grpc.v1.DiagnosticSeverity;
+import org.apache.opennlp.grpc.v1.InferenceBackend;
+import org.apache.opennlp.grpc.v1.ModelBundleRef;
import org.apache.opennlp.grpc.v1.OpenNlpDocument;
import org.apache.opennlp.grpc.v1.PipelineStep;
import org.apache.opennlp.grpc.v1.SemanticChunkingConfig;
@@ -110,4 +113,62 @@ class BasicDocumentAnalyzerPolicyTest {
&& d.getSeverity() == DiagnosticSeverity.DIAGNOSTIC_SEVERITY_INFO
&& d.getMessage().contains("skipped")));
}
+
+ // A document longer than max_text_length = 4 must fail with INVALID_ARGUMENT.
+ @Test
+ void rejectsTextExceedingMaxTextLength() {
+ final BasicDocumentAnalyzer analyzer = new BasicDocumentAnalyzer(Map.of());
+
+ final AnalysisException error = assertThrows(AnalysisException.class, ()
-> analyzer.analyze(
+ AnalyzeDocumentRequest.newBuilder()
+ .setDocument(OpenNlpDocument.newBuilder().setRawText("This is too
long.").build())
+
.setOptions(AnalysisOptions.newBuilder().setMaxTextLength(4).build())
+ .build()));
+
+ assertEquals(AnalysisException.FailureType.INVALID_ARGUMENT,
error.getFailureType());
+ }
+
+ // Requesting the ONNX_RUNTIME backend must fail with UNIMPLEMENTED.
+ @Test
+ void rejectsUnsupportedInferenceBackend() {
+ final BasicDocumentAnalyzer analyzer = new BasicDocumentAnalyzer(Map.of());
+
+ final AnalysisException error = assertThrows(AnalysisException.class, ()
-> analyzer.analyze(
+ AnalyzeDocumentRequest.newBuilder()
+ .setDocument(OpenNlpDocument.newBuilder().setRawText("Hello
world.").build())
+ .setOptions(AnalysisOptions.newBuilder()
+
.setInferenceBackend(InferenceBackend.INFERENCE_BACKEND_ONNX_RUNTIME)
+ .build())
+ .build()));
+
+ assertEquals(AnalysisException.FailureType.UNIMPLEMENTED,
error.getFailureType());
+ }
+
+ // A non-blank onnx_embedding_model_id must fail with UNIMPLEMENTED.
+ @Test
+ void rejectsOnnxEmbeddingModelId() {
+ final BasicDocumentAnalyzer analyzer = new BasicDocumentAnalyzer(Map.of());
+
+ final AnalysisException error = assertThrows(AnalysisException.class, ()
-> analyzer.analyze(
+ AnalyzeDocumentRequest.newBuilder()
+ .setDocument(OpenNlpDocument.newBuilder().setRawText("Hello
world.").build())
+
.setOptions(AnalysisOptions.newBuilder().setOnnxEmbeddingModelId("minilm").build())
+ .build()));
+
+ assertEquals(AnalysisException.FailureType.UNIMPLEMENTED,
error.getFailureType());
+ }
+
+ // An inline profile naming the bundle "de-custom" must fail with NOT_FOUND.
+ @Test
+ void rejectsUnknownModelBundle() {
+ final BasicDocumentAnalyzer analyzer = new BasicDocumentAnalyzer(Map.of());
+
+ final AnalysisException error = assertThrows(AnalysisException.class, ()
-> analyzer.analyze(
+ AnalyzeDocumentRequest.newBuilder()
+ .setDocument(OpenNlpDocument.newBuilder().setRawText("Hello
world.").build())
+ .setProfile(AnalysisProfile.newBuilder()
+ .setProfileId("custom")
+ .addSteps(PipelineStep.PIPELINE_STEP_SENTENCE_DETECT)
+
.setModelBundle(ModelBundleRef.newBuilder().setBundleId("de-custom").build())
+ .build())
+ .build()));
+
+ assertEquals(AnalysisException.FailureType.NOT_FOUND,
error.getFailureType());
+ }
}
diff --git
a/opennlp-grpc/opennlp-grpc-service/src/test/java/org/apache/opennlp/grpc/processor/BasicDocumentAnalyzerTest.java
b/opennlp-grpc/opennlp-grpc-service/src/test/java/org/apache/opennlp/grpc/processor/BasicDocumentAnalyzerTest.java
index fdbc8268..aaaf9637 100644
---
a/opennlp-grpc/opennlp-grpc-service/src/test/java/org/apache/opennlp/grpc/processor/BasicDocumentAnalyzerTest.java
+++
b/opennlp-grpc/opennlp-grpc-service/src/test/java/org/apache/opennlp/grpc/processor/BasicDocumentAnalyzerTest.java
@@ -12,15 +12,19 @@
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations under the License.
+ * KIND, either express or implied. See the License for the specific
+ * language governing permissions and limitations under the License.
*/
package org.apache.opennlp.grpc.processor;
+import java.nio.charset.StandardCharsets;
import java.util.Map;
+import org.apache.opennlp.grpc.v1.AnalysisOptions;
import org.apache.opennlp.grpc.v1.AnalyzeDocumentRequest;
import org.apache.opennlp.grpc.v1.AnalyzeDocumentResponse;
+import org.apache.opennlp.grpc.v1.AnnotatedSentence;
+import org.apache.opennlp.grpc.v1.OffsetEncoding;
import org.apache.opennlp.grpc.v1.OpenNlpDocument;
import org.apache.opennlp.grpc.v1.PipelineStep;
import org.junit.jupiter.api.Test;
@@ -34,16 +38,20 @@ class BasicDocumentAnalyzerTest {
private static final String TEXT =
"The driver got badly injured by the accident. He was taken to the
hospital!";
+ private final BasicDocumentAnalyzer analyzer = new
BasicDocumentAnalyzer(Map.of());
+
+ // Test helper: analyzes text as document "doc-1" with the shared analyzer.
+ // Options are attached only when non-null, so passing null exercises the
+ // no-options path.
+ private AnalyzeDocumentResponse analyze(String text, AnalysisOptions
options) {
+ final AnalyzeDocumentRequest.Builder request =
AnalyzeDocumentRequest.newBuilder()
+
.setDocument(OpenNlpDocument.newBuilder().setDocId("doc-1").setRawText(text).build());
+ if (options != null) {
+ request.setOptions(options);
+ }
+ return analyzer.analyze(request.build());
+ }
+
@Test
void analyzesSentencesAndTokens() {
- final BasicDocumentAnalyzer analyzer = new BasicDocumentAnalyzer(Map.of());
-
- final AnalyzeDocumentResponse response =
analyzer.analyze(AnalyzeDocumentRequest.newBuilder()
- .setDocument(OpenNlpDocument.newBuilder()
- .setDocId("doc-1")
- .setRawText(TEXT)
- .build())
- .build());
+ final AnalyzeDocumentResponse response = analyze(TEXT, null);
assertEquals("doc-1", response.getDocument().getDocId());
assertEquals(2, response.getDocument().getSentencesCount());
@@ -53,4 +61,43 @@ class BasicDocumentAnalyzerTest {
assertTrue(response.getDiagnosticsList().stream()
.anyMatch(d -> d.getStep() == PipelineStep.PIPELINE_STEP_TOKENIZE));
}
+
+ // With no options, offsets must be reported as UTF-8 byte positions.
+ @Test
+ void defaultsToUtf8ByteOffsetsForMultibyteText() {
+ // One sentence containing a supplementary character (emoji = 2 UTF-16
+ // units, 4 UTF-8 bytes).
+ final String text = "Hi there 😀.";
+ final AnalyzeDocumentResponse response = analyze(text, null);
+
+ assertEquals(OffsetEncoding.OFFSET_ENCODING_UTF8_BYTE,
response.getDocument().getOffsetEncoding());
+ final AnnotatedSentence sentence = response.getDocument().getSentences(0);
+ // The sentence covers the whole text; its end offset must be the UTF-8
+ // byte length, not the (smaller) UTF-16 length, proving the conversion ran.
+ assertEquals(text.getBytes(StandardCharsets.UTF_8).length,
sentence.getSentenceSpan().getEnd());
+ assertTrue(sentence.getSentenceSpan().getEnd() > text.length());
+ }
+
+ // When UTF-16 code units are requested, the sentence end must equal
+ // String.length() and the document must report that encoding.
+ @Test
+ void honorsUtf16OffsetEncodingWhenRequested() {
+ final String text = "Hi there 😀.";
+ final AnalyzeDocumentResponse response = analyze(text,
+ AnalysisOptions.newBuilder()
+ .setOffsetEncoding(OffsetEncoding.OFFSET_ENCODING_UTF16_CODE_UNIT)
+ .build());
+
+ assertEquals(OffsetEncoding.OFFSET_ENCODING_UTF16_CODE_UNIT,
response.getDocument().getOffsetEncoding());
+ assertEquals(text.length(),
response.getDocument().getSentences(0).getSentenceSpan().getEnd());
+ }
+
+ // include_probabilities = true must set a probability on the first sentence
+ // span and its first token span; without options the sentence span has none.
+ @Test
+ void includeProbabilitiesPopulatesSpanProbability() {
+ final AnalyzeDocumentResponse withProbs = analyze(TEXT,
+ AnalysisOptions.newBuilder().setIncludeProbabilities(true).build());
+ final AnnotatedSentence sentence = withProbs.getDocument().getSentences(0);
+ assertTrue(sentence.getSentenceSpan().hasProbability());
+ assertTrue(sentence.getTokens(0).getAnnotationSpan().hasProbability());
+
+ // Default (off) leaves probabilities unset.
+ final AnnotatedSentence noProbs = analyze(TEXT,
null).getDocument().getSentences(0);
+ assertFalse(noProbs.getSentenceSpan().hasProbability());
+ }
}
diff --git
a/opennlp-grpc/opennlp-grpc-service/src/test/java/org/apache/opennlp/grpc/processor/OffsetMapperTest.java
b/opennlp-grpc/opennlp-grpc-service/src/test/java/org/apache/opennlp/grpc/processor/OffsetMapperTest.java
new file mode 100644
index 00000000..27068b97
--- /dev/null
+++
b/opennlp-grpc/opennlp-grpc-service/src/test/java/org/apache/opennlp/grpc/processor/OffsetMapperTest.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the specific
+ * language governing permissions and limitations under the License.
+ */
+package org.apache.opennlp.grpc.processor;
+
+import org.apache.opennlp.grpc.v1.OffsetEncoding;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+// Checks OffsetMapper against a short text containing one surrogate pair, once
+// per supported target encoding.
+class OffsetMapperTest {
+
+ // "Hi 😀." : H,i,space are 1 Java char each; 😀 (U+1F600) is a surrogate pair
+ // (2 Java chars, 1 code point, 4 UTF-8 bytes); '.' is 1 char. Java length =
+ // 6.
+ private static final String TEXT = "Hi 😀.";
+
+ // UNSPECIFIED must resolve to the UTF-8 byte encoding.
+ @Test
+ void unspecifiedDefaultsToUtf8Byte() {
+ final OffsetMapper mapper = OffsetMapper.forText(TEXT,
OffsetEncoding.OFFSET_ENCODING_UNSPECIFIED);
+ assertEquals(OffsetEncoding.OFFSET_ENCODING_UTF8_BYTE, mapper.encoding());
+ }
+
+ // Java indices 0, 3, 5, 6 must map to byte offsets 0, 3, 7, 8.
+ @Test
+ void utf8ByteOffsets() {
+ final OffsetMapper mapper = OffsetMapper.forText(TEXT,
OffsetEncoding.OFFSET_ENCODING_UTF8_BYTE);
+ assertEquals(0, mapper.toTarget(0)); // start
+ assertEquals(3, mapper.toTarget(3)); // before emoji: "Hi " = 3 bytes
+ assertEquals(7, mapper.toTarget(5)); // after emoji (4 bytes): 3 + 4 = 7
+ assertEquals(8, mapper.toTarget(6)); // plus '.' = 8 total bytes
+ }
+
+ // For UTF-16 the target offsets equal the Java indices.
+ @Test
+ void utf16CodeUnitOffsetsAreIdentity() {
+ final OffsetMapper mapper = OffsetMapper.forText(TEXT,
OffsetEncoding.OFFSET_ENCODING_UTF16_CODE_UNIT);
+ assertEquals(3, mapper.toTarget(3));
+ assertEquals(5, mapper.toTarget(5));
+ assertEquals(6, mapper.toTarget(6));
+ }
+
+ // For code points the emoji counts as a single unit.
+ @Test
+ void unicodeCodePointOffsets() {
+ final OffsetMapper mapper = OffsetMapper.forText(TEXT,
OffsetEncoding.OFFSET_ENCODING_UNICODE_CODE_POINT);
+ assertEquals(3, mapper.toTarget(3)); // H, i, space = 3 code points
+ assertEquals(4, mapper.toTarget(5)); // + emoji = 4 code points
+ assertEquals(5, mapper.toTarget(6)); // + '.' = 5 code points
+ }
+}
diff --git
a/opennlp-grpc/opennlp-grpc-service/src/test/java/org/apache/opennlp/grpc/profile/ProfileResolverTest.java
b/opennlp-grpc/opennlp-grpc-service/src/test/java/org/apache/opennlp/grpc/profile/ProfileResolverTest.java
index 8fb2bcff..f62b8f75 100644
---
a/opennlp-grpc/opennlp-grpc-service/src/test/java/org/apache/opennlp/grpc/profile/ProfileResolverTest.java
+++
b/opennlp-grpc/opennlp-grpc-service/src/test/java/org/apache/opennlp/grpc/profile/ProfileResolverTest.java
@@ -20,6 +20,7 @@ package org.apache.opennlp.grpc.profile;
import org.apache.opennlp.grpc.processor.AnalysisException;
import org.apache.opennlp.grpc.v1.AnalyzeDocumentRequest;
import org.apache.opennlp.grpc.v1.AnalysisProfile;
+import org.apache.opennlp.grpc.v1.ModelBundleRef;
import org.apache.opennlp.grpc.v1.OpenNlpDocument;
import org.apache.opennlp.grpc.v1.PipelineStep;
import org.junit.jupiter.api.Test;
@@ -68,6 +69,24 @@ class ProfileResolverTest {
assertEquals(PipelineStep.PIPELINE_STEP_SENTENCE_DETECT,
profile.getSteps(0));
}
+ // With profile_id "en-basic" plus an inline profile that has no steps, the
+ // resolved profile is expected to take the inline profile id and model bundle
+ // and to have two steps: SENTENCE_DETECT then TOKENIZE.
+ @Test
+ void inlineProfileMergesNonStepOverridesWhenProfileIdIsSet() {
+ final AnalysisProfile profile =
resolver.resolve(AnalyzeDocumentRequest.newBuilder()
+ .setDocument(OpenNlpDocument.newBuilder().setRawText("Hello.").build())
+ .setProfileId("en-basic")
+ .setProfile(AnalysisProfile.newBuilder()
+ .setProfileId("custom-bundle")
+
.setModelBundle(ModelBundleRef.newBuilder().setBundleId("custom").build())
+ .build())
+ .build());
+
+ assertEquals("custom-bundle", profile.getProfileId());
+ assertEquals("custom", profile.getModelBundle().getBundleId());
+ assertEquals(2, profile.getStepsCount());
+ assertEquals(PipelineStep.PIPELINE_STEP_SENTENCE_DETECT,
profile.getSteps(0));
+ assertEquals(PipelineStep.PIPELINE_STEP_TOKENIZE, profile.getSteps(1));
+ }
+
@Test
void unknownProfileIdFails() {
assertThrows(AnalysisException.class, () ->
resolver.resolve(AnalyzeDocumentRequest.newBuilder()