This is an automated email from the ASF dual-hosted git repository. apupier pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/camel.git
commit 8264237d823404a2b8c3782448285f3083a9ade1 Author: Aurélien Pupier <[email protected]> AuthorDate: Thu Jan 22 13:26:02 2026 +0100 Improve Docling metadata retrieval: pageNumber and format (mimetype) Signed-off-by: Aurélien Pupier <[email protected]> --- .../camel/component/docling/DoclingProducer.java | 262 ++------------------- .../docling/integration/MetadataExtractionIT.java | 33 +++ .../src/test/resources/multi_page.pdf | Bin 0 -> 128322 bytes 3 files changed, 51 insertions(+), 244 deletions(-) diff --git a/components/camel-ai/camel-docling/src/main/java/org/apache/camel/component/docling/DoclingProducer.java b/components/camel-ai/camel-docling/src/main/java/org/apache/camel/component/docling/DoclingProducer.java index e9449e5a31a2..7963f368a339 100644 --- a/components/camel-ai/camel-docling/src/main/java/org/apache/camel/component/docling/DoclingProducer.java +++ b/components/camel-ai/camel-docling/src/main/java/org/apache/camel/component/docling/DoclingProducer.java @@ -42,6 +42,8 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; import java.util.stream.Stream; +import ai.docling.core.DoclingDocument; +import ai.docling.core.DoclingDocument.DocumentOrigin; import ai.docling.serve.api.DoclingServeApi; import ai.docling.serve.api.convert.request.ConvertDocumentRequest; import ai.docling.serve.api.convert.request.options.ConvertDocumentOptions; @@ -70,14 +72,12 @@ public class DoclingProducer extends DefaultProducer { private static final Logger LOG = LoggerFactory.getLogger(DoclingProducer.class); - private DoclingEndpoint endpoint; private DoclingConfiguration configuration; private DoclingServeApi doclingServeApi; private ObjectMapper objectMapper; public DoclingProducer(DoclingEndpoint endpoint) { super(endpoint); - this.endpoint = endpoint; this.configuration = endpoint.getConfiguration(); this.objectMapper = new ObjectMapper(); } @@ -384,7 +384,8 @@ public class DoclingProducer extends DefaultProducer { metadata.setFilePath(inputPath); try { - JsonNode rootNode = objectMapper.readTree(jsonOutput); + // TODO: get directly the DoclingDocument instead of converting back and forth + DoclingDocument doclingDocument = objectMapper.readValue(jsonOutput, DoclingDocument.class); // Extract basic file information for file paths if (!inputPath.startsWith("http://") && !inputPath.startsWith("https://")) { @@ -395,44 +396,18 @@ public class DoclingProducer extends DefaultProducer { } } - // Try to extract metadata from the JSON structure - if (rootNode.has(DoclingMetadataFields.METADATA)) { - JsonNode metadataNode = rootNode.get(DoclingMetadataFields.METADATA); - extractMetadataFieldsFromJson(metadata, metadataNode); + if (doclingDocument.getPages() != null) { + metadata.setPageCount(doclingDocument.getPages().size()); } - // Look for document-level information - if (rootNode.has(DoclingMetadataFields.DOCUMENT)) { - JsonNode docNode = rootNode.get(DoclingMetadataFields.DOCUMENT); - if (docNode.has(DoclingMetadataFields.NAME) && metadata.getTitle() == null) { - metadata.setTitle(docNode.get(DoclingMetadataFields.NAME).asText()); - } - } - - // Extract main text to determine document type/format - if (rootNode.has(DoclingMetadataFields.MAIN_TEXT)) { - JsonNode mainTextNode = rootNode.get(DoclingMetadataFields.MAIN_TEXT); - if (mainTextNode.isArray() && mainTextNode.size() > 0) { - // Document has text content - metadata.setDocumentType("Text Document"); - } - } - - // Count pages if available - if (rootNode.has(DoclingMetadataFields.PAGES)) { - if (rootNode.get(DoclingMetadataFields.PAGES).isArray()) { - metadata.setPageCount(rootNode.get(DoclingMetadataFields.PAGES).size()); - } else if (rootNode.get(DoclingMetadataFields.PAGES).isInt()) { - metadata.setPageCount(rootNode.get(DoclingMetadataFields.PAGES).asInt()); - } - } else if (rootNode.has(DoclingMetadataFields.NUM_PAGES)) { - metadata.setPageCount(rootNode.get(DoclingMetadataFields.NUM_PAGES).asInt()); - } else if (rootNode.has(DoclingMetadataFields.PAGE_COUNT)) { - metadata.setPageCount(rootNode.get(DoclingMetadataFields.PAGE_COUNT).asInt()); + DocumentOrigin origin = doclingDocument.getOrigin(); + if (origin != null && origin.getMimetype() != null) { + metadata.setFormat(origin.getMimetype()); } // Store raw metadata if requested if (configuration.isIncludeRawMetadata()) { + JsonNode rootNode = objectMapper.readTree(jsonOutput); @SuppressWarnings("unchecked") Map<String, Object> rawMap = objectMapper.convertValue(rootNode, Map.class); metadata.setRawMetadata(rawMap); @@ -446,123 +421,6 @@ public class DoclingProducer extends DefaultProducer { return metadata; } - private void extractMetadataFieldsFromJson(DocumentMetadata metadata, JsonNode metadataNode) { - // Extract standard metadata fields - if (metadataNode.has(DoclingMetadataFields.TITLE)) { - metadata.setTitle(metadataNode.get(DoclingMetadataFields.TITLE).asText()); - } - if (metadataNode.has(DoclingMetadataFields.AUTHOR) || metadataNode.has(DoclingMetadataFields.AUTHOR_PASCAL)) { - String author = metadataNode.has(DoclingMetadataFields.AUTHOR) - ? metadataNode.get(DoclingMetadataFields.AUTHOR).asText() - : metadataNode.get(DoclingMetadataFields.AUTHOR_PASCAL).asText(); - metadata.setAuthor(author); - } - if (metadataNode.has(DoclingMetadataFields.CREATOR) || metadataNode.has(DoclingMetadataFields.CREATOR_PASCAL)) { - String creator = metadataNode.has(DoclingMetadataFields.CREATOR) - ? metadataNode.get(DoclingMetadataFields.CREATOR).asText() - : metadataNode.get(DoclingMetadataFields.CREATOR_PASCAL).asText(); - metadata.setCreator(creator); - } - if (metadataNode.has(DoclingMetadataFields.PRODUCER) || metadataNode.has(DoclingMetadataFields.PRODUCER_PASCAL)) { - String producer = metadataNode.has(DoclingMetadataFields.PRODUCER) - ? metadataNode.get(DoclingMetadataFields.PRODUCER).asText() - : metadataNode.get(DoclingMetadataFields.PRODUCER_PASCAL).asText(); - metadata.setProducer(producer); - } - if (metadataNode.has(DoclingMetadataFields.SUBJECT) || metadataNode.has(DoclingMetadataFields.SUBJECT_PASCAL)) { - String subject = metadataNode.has(DoclingMetadataFields.SUBJECT) - ? metadataNode.get(DoclingMetadataFields.SUBJECT).asText() - : metadataNode.get(DoclingMetadataFields.SUBJECT_PASCAL).asText(); - metadata.setSubject(subject); - } - if (metadataNode.has(DoclingMetadataFields.KEYWORDS) || metadataNode.has(DoclingMetadataFields.KEYWORDS_PASCAL)) { - String keywords = metadataNode.has(DoclingMetadataFields.KEYWORDS) - ? metadataNode.get(DoclingMetadataFields.KEYWORDS).asText() - : metadataNode.get(DoclingMetadataFields.KEYWORDS_PASCAL).asText(); - metadata.setKeywords(keywords); - } - if (metadataNode.has(DoclingMetadataFields.LANGUAGE) || metadataNode.has(DoclingMetadataFields.LANGUAGE_PASCAL)) { - String language = metadataNode.has(DoclingMetadataFields.LANGUAGE) - ? metadataNode.get(DoclingMetadataFields.LANGUAGE).asText() - : metadataNode.get(DoclingMetadataFields.LANGUAGE_PASCAL).asText(); - metadata.setLanguage(language); - } - - // Extract format information - if (metadataNode.has(DoclingMetadataFields.FORMAT) || metadataNode.has(DoclingMetadataFields.FORMAT_PASCAL)) { - String format = metadataNode.has(DoclingMetadataFields.FORMAT) - ? metadataNode.get(DoclingMetadataFields.FORMAT).asText() - : metadataNode.get(DoclingMetadataFields.FORMAT_PASCAL).asText(); - metadata.setFormat(format); - } - - // Extract dates - try multiple field name variations - extractDateFieldFromJson(metadata, metadataNode, DoclingMetadataFields.CREATION_DATE, - DoclingMetadataFields.CREATION_DATE_PASCAL, DoclingMetadataFields.CREATED, - DoclingMetadataFields.CREATED_PASCAL, (date) -> metadata.setCreationDate(date)); - extractDateFieldFromJson(metadata, metadataNode, DoclingMetadataFields.MODIFICATION_DATE, - DoclingMetadataFields.MODIFICATION_DATE_PASCAL, DoclingMetadataFields.MODIFIED, - DoclingMetadataFields.MODIFIED_PASCAL, (date) -> metadata.setModificationDate(date)); - - // Extract all other fields as custom metadata if requested - if (configuration.isExtractAllMetadata()) { - metadataNode.fields().forEachRemaining(entry -> { - String key = entry.getKey(); - // Skip standard fields we already extracted - if (!DoclingMetadataFields.isStandardField(key)) { - JsonNode value = entry.getValue(); - if (value.isTextual()) { - metadata.addCustomMetadata(key, value.asText()); - } else if (value.isInt()) { - metadata.addCustomMetadata(key, value.asInt()); - } else if (value.isLong()) { - metadata.addCustomMetadata(key, value.asLong()); - } else if (value.isBoolean()) { - metadata.addCustomMetadata(key, value.asBoolean()); - } else if (value.isDouble()) { - metadata.addCustomMetadata(key, value.asDouble()); - } else { - metadata.addCustomMetadata(key, value.toString()); - } - } - }); - } - } - - private void extractDateFieldFromJson( - DocumentMetadata metadata, JsonNode metadataNode, String fieldName1, - String fieldName2, String fieldName3, String fieldName4, - java.util.function.Consumer<java.time.Instant> setter) { - String dateStr = null; - - if (metadataNode.has(fieldName1)) { - dateStr = metadataNode.get(fieldName1).asText(); - } else if (metadataNode.has(fieldName2)) { - dateStr = metadataNode.get(fieldName2).asText(); - } else if (metadataNode.has(fieldName3)) { - dateStr = metadataNode.get(fieldName3).asText(); - } else if (metadataNode.has(fieldName4)) { - dateStr = metadataNode.get(fieldName4).asText(); - } - - if (dateStr != null && !dateStr.isEmpty()) { - try { - java.time.Instant instant = java.time.Instant.parse(dateStr); - setter.accept(instant); - } catch (Exception e) { - LOG.debug("Failed to parse date field {}: {}", fieldName1, dateStr); - // Try parsing as ISO local date time - try { - java.time.LocalDateTime ldt = java.time.LocalDateTime.parse(dateStr); - java.time.Instant instant = ldt.atZone(java.time.ZoneId.systemDefault()).toInstant(); - setter.accept(instant); - } catch (Exception e2) { - LOG.debug("Failed to parse date as LocalDateTime: {}", dateStr); - } - } - } - } - private DocumentMetadata extractMetadataUsingCLI(String inputPath, Exchange exchange) throws Exception { LOG.debug("Extracting metadata using Docling CLI for: {}", inputPath); @@ -570,9 +428,7 @@ public class DoclingProducer extends DefaultProducer { String jsonOutput = executeDoclingCommand(inputPath, "json", exchange); // Parse the JSON output to extract metadata - DocumentMetadata metadata = parseMetadataFromJson(jsonOutput, inputPath); - - return metadata; + return parseMetadataFromJson(jsonOutput, inputPath); } private DocumentMetadata parseMetadataFromJson(String jsonOutput, String inputPath) { @@ -580,7 +436,7 @@ public class DoclingProducer extends DefaultProducer { metadata.setFilePath(inputPath); try { - JsonNode rootNode = objectMapper.readTree(jsonOutput); + DoclingDocument doclingDocument = objectMapper.readValue(jsonOutput, DoclingDocument.class); // Extract basic file information File file = new File(inputPath); @@ -589,30 +445,18 @@ public class DoclingProducer extends DefaultProducer { metadata.setFileSizeBytes(file.length()); } - // Try to extract metadata from the JSON structure - // Docling JSON output may have different structures depending on the document - if (rootNode.has(DoclingMetadataFields.METADATA)) { - JsonNode metadataNode = rootNode.get(DoclingMetadataFields.METADATA); - extractFieldsFromJsonNode(metadata, metadataNode); + if (doclingDocument.getPages() != null) { + metadata.setPageCount(doclingDocument.getPages().size()); } - // Look for document-level information - if (rootNode.has(DoclingMetadataFields.DOCUMENT)) { - JsonNode docNode = rootNode.get(DoclingMetadataFields.DOCUMENT); - if (docNode.has(DoclingMetadataFields.NAME)) { - metadata.setTitle(docNode.get(DoclingMetadataFields.NAME).asText()); - } - } - - // Count pages if available - if (rootNode.has(DoclingMetadataFields.PAGES)) { - metadata.setPageCount(rootNode.get(DoclingMetadataFields.PAGES).size()); - } else if (rootNode.has(DoclingMetadataFields.NUM_PAGES)) { - metadata.setPageCount(rootNode.get(DoclingMetadataFields.NUM_PAGES).asInt()); + DocumentOrigin origin = doclingDocument.getOrigin(); + if (origin != null && origin.getMimetype() != null) { + metadata.setFormat(origin.getMimetype()); } // Store raw metadata if configured if (configuration.isIncludeRawMetadata()) { + JsonNode rootNode = objectMapper.readTree(jsonOutput); @SuppressWarnings("unchecked") Map<String, Object> rawMap = objectMapper.convertValue(rootNode, java.util.Map.class); metadata.setRawMetadata(rawMap); @@ -625,76 +469,6 @@ public class DoclingProducer extends DefaultProducer { return metadata; } - private void extractFieldsFromJsonNode(DocumentMetadata metadata, JsonNode metadataNode) { - // Extract common metadata fields - if (metadataNode.has(DoclingMetadataFields.TITLE)) { - metadata.setTitle(metadataNode.get(DoclingMetadataFields.TITLE).asText()); - } - if (metadataNode.has(DoclingMetadataFields.AUTHOR)) { - metadata.setAuthor(metadataNode.get(DoclingMetadataFields.AUTHOR).asText()); - } - if (metadataNode.has(DoclingMetadataFields.CREATOR)) { - metadata.setCreator(metadataNode.get(DoclingMetadataFields.CREATOR).asText()); - } - if (metadataNode.has(DoclingMetadataFields.PRODUCER)) { - metadata.setProducer(metadataNode.get(DoclingMetadataFields.PRODUCER).asText()); - } - if (metadataNode.has(DoclingMetadataFields.SUBJECT)) { - metadata.setSubject(metadataNode.get(DoclingMetadataFields.SUBJECT).asText()); - } - if (metadataNode.has(DoclingMetadataFields.KEYWORDS)) { - metadata.setKeywords(metadataNode.get(DoclingMetadataFields.KEYWORDS).asText()); - } - if (metadataNode.has(DoclingMetadataFields.LANGUAGE)) { - metadata.setLanguage(metadataNode.get(DoclingMetadataFields.LANGUAGE).asText()); - } - - // Extract dates - if (metadataNode.has(DoclingMetadataFields.CREATION_DATE) - || metadataNode.has(DoclingMetadataFields.CREATION_DATE_CAMEL)) { - String dateStr = metadataNode.has(DoclingMetadataFields.CREATION_DATE) - ? metadataNode.get(DoclingMetadataFields.CREATION_DATE).asText() - : metadataNode.get(DoclingMetadataFields.CREATION_DATE_CAMEL).asText(); - try { - metadata.setCreationDate(java.time.Instant.parse(dateStr)); - } catch (Exception e) { - LOG.debug("Failed to parse creation date: {}", dateStr); - } - } - - if (metadataNode.has(DoclingMetadataFields.MODIFICATION_DATE) - || metadataNode.has(DoclingMetadataFields.MODIFICATION_DATE_CAMEL)) { - String dateStr = metadataNode.has(DoclingMetadataFields.MODIFICATION_DATE) - ? metadataNode.get(DoclingMetadataFields.MODIFICATION_DATE).asText() - : metadataNode.get(DoclingMetadataFields.MODIFICATION_DATE_CAMEL).asText(); - try { - metadata.setModificationDate(java.time.Instant.parse(dateStr)); - } catch (Exception e) { - LOG.debug("Failed to parse modification date: {}", dateStr); - } - } - - // Extract custom metadata if extractAllMetadata is enabled - if (configuration.isExtractAllMetadata()) { - metadataNode.fields().forEachRemaining(entry -> { - String key = entry.getKey(); - // Skip standard fields we already extracted - if (!DoclingMetadataFields.isStandardField(key)) { - JsonNode value = entry.getValue(); - if (value.isTextual()) { - metadata.addCustomMetadata(key, value.asText()); - } else if (value.isNumber()) { - metadata.addCustomMetadata(key, value.asLong()); - } else if (value.isBoolean()) { - metadata.addCustomMetadata(key, value.asBoolean()); - } else { - metadata.addCustomMetadata(key, value.toString()); - } - } - }); - } - } - private void setMetadataHeaders(Exchange exchange, DocumentMetadata metadata) { if (metadata.getTitle() != null) { exchange.getIn().setHeader(DoclingHeaders.METADATA_TITLE, metadata.getTitle()); diff --git a/components/camel-ai/camel-docling/src/test/java/org/apache/camel/component/docling/integration/MetadataExtractionIT.java b/components/camel-ai/camel-docling/src/test/java/org/apache/camel/component/docling/integration/MetadataExtractionIT.java index fb7546645b25..0e9003801d99 100644 --- a/components/camel-ai/camel-docling/src/test/java/org/apache/camel/component/docling/integration/MetadataExtractionIT.java +++ b/components/camel-ai/camel-docling/src/test/java/org/apache/camel/component/docling/integration/MetadataExtractionIT.java @@ -16,8 +16,11 @@ */ package org.apache.camel.component.docling.integration; +import java.io.IOException; +import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.StandardCopyOption; import java.util.Map; import org.apache.camel.CamelContext; @@ -34,6 +37,7 @@ import org.junit.jupiter.api.extension.RegisterExtension; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -80,6 +84,35 @@ public class MetadataExtractionIT extends CamelTestSupport { LOG.info("File size: {} bytes", metadata.getFileSizeBytes()); } + @Test + void testMetadataExtractionFromPdf() throws Exception { + Path testFile = createTestPdfFile(); + + DocumentMetadata metadata = template.requestBody("direct:extract-metadata", + testFile.toString(), DocumentMetadata.class); + + assertNotNull(metadata, "Metadata should not be null"); + assertNotNull(metadata.getFileName(), "File name should be extracted"); + assertTrue(metadata.getFileSizeBytes() > 0, "File size should be greater than 0"); + assertNotNull(metadata.getFilePath(), "File path should be set"); + assertThat(metadata.getPageCount()).isEqualTo(5); + assertThat(metadata.getFormat()).isEqualTo("application/pdf"); + // TODO: assertThat(metadata.getTitle()).isEqualTo("The Evolution of the Word Processor"); + // TODO: assertThat(metadata.getDocumentType()).isEqualTo("PDF"); + + LOG.info("Successfully extracted metadata: {}", metadata); + LOG.info("File name: {}", metadata.getFileName()); + LOG.info("File size: {} bytes", metadata.getFileSizeBytes()); + } + + private Path createTestPdfFile() throws IOException { + try (InputStream is = getClass().getClassLoader().getResourceAsStream("multi_page.pdf")) { + java.nio.file.Path tempFile = Files.createTempFile("docling-test-multi_page", ".pdf"); + Files.copy(is, tempFile.toAbsolutePath(), StandardCopyOption.REPLACE_EXISTING); + return tempFile; + } + } + @Test public void testMetadataExtractionWithHeaders() throws Exception { Path testFile = createTestMarkdownFile(); diff --git a/components/camel-ai/camel-docling/src/test/resources/multi_page.pdf b/components/camel-ai/camel-docling/src/test/resources/multi_page.pdf new file mode 100644 index 000000000000..7d9eb1818d61 Binary files /dev/null and b/components/camel-ai/camel-docling/src/test/resources/multi_page.pdf differ
