This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4035 in repository https://gitbox.apache.org/repos/asf/tika.git
commit f6a607b1da4bced8bfffaf8092fd4a0a7d7b63d3 Author: tallison <talli...@apache.org> AuthorDate: Wed May 10 10:39:34 2023 -0400 TIKA-4035 -- extract file system metadata --- .../java/org/apache/tika/metadata/FileSystem.java | 30 ++++++++++++++++++ .../tika/pipes/fetcher/fs/FileSystemFetcher.java | 37 ++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/FileSystem.java b/tika-core/src/main/java/org/apache/tika/metadata/FileSystem.java new file mode 100644 index 000000000..87afab71c --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/metadata/FileSystem.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.metadata; + +/** + * A collection of metadata elements for file system level metadata + */ +public interface FileSystem { + + final String PREFIX = "fs:"; + + Property CREATED = Property.externalDate(PREFIX + "created"); + Property MODIFIED = Property.externalDate(PREFIX + "modified"); + Property ACCESSED = Property.externalDate(PREFIX + "accessed"); + +} diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java index c169aa815..70ad5ab86 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java @@ -22,6 +22,9 @@ import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.nio.file.attribute.BasicFileAttributes; +import java.nio.file.attribute.FileTime; +import java.util.Date; import java.util.Map; import org.slf4j.Logger; @@ -34,7 +37,9 @@ import org.apache.tika.config.Param; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.FileSystem; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.pipes.fetcher.AbstractFetcher; @@ -45,6 +50,8 @@ public class FileSystemFetcher extends AbstractFetcher implements Initializable //Warning! basePath can be null! private Path basePath = null; + private boolean extractFileSystemMetadata = false; + static boolean isDescendant(Path root, Path descendant) { return descendant.toAbsolutePath().normalize() .startsWith(root.toAbsolutePath().normalize()); @@ -70,6 +77,7 @@ public class FileSystemFetcher extends AbstractFetcher implements Initializable } metadata.set(TikaCoreProperties.SOURCE_PATH, fetchKey); + updateFileSystemMetadata(p, metadata); if (!Files.isRegularFile(p)) { if (basePath != null && !Files.isDirectory(basePath)) { @@ -82,6 +90,24 @@ public class FileSystemFetcher extends AbstractFetcher implements Initializable return TikaInputStream.get(p, metadata); } + private void updateFileSystemMetadata(Path p, Metadata metadata) throws IOException { + if (! extractFileSystemMetadata) { + return; + } + BasicFileAttributes attrs = Files.readAttributes(p, BasicFileAttributes.class); + updateFileTime(FileSystem.CREATED, attrs.creationTime(), metadata); + updateFileTime(FileSystem.MODIFIED, attrs.lastModifiedTime(), metadata); + updateFileTime(FileSystem.ACCESSED, attrs.lastAccessTime(), metadata); + //TODO extract owner or group? + } + + private void updateFileTime(Property property, FileTime fileTime, Metadata metadata) { + if (fileTime == null) { + return; + } + metadata.set(property, new Date(fileTime.toMillis())); + } + /** * * @return the basePath or <code>null</code> if no base path was set @@ -102,6 +128,17 @@ public class FileSystemFetcher extends AbstractFetcher implements Initializable this.basePath = Paths.get(basePath); } + /** + * Extract file system metadata (created, modified, accessed) when fetching file. + * The default is <code>false</code>. + * + * @param extractFileSystemMetadata + */ + @Field + public void setExtractFileSystemMetadata(boolean extractFileSystemMetadata) { + this.extractFileSystemMetadata = extractFileSystemMetadata; + } + @Override public void initialize(Map<String, Param> params) throws TikaConfigException { //no-op