[
https://issues.apache.org/jira/browse/NIFI-296?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14326807#comment-14326807
]
ASF GitHub Bot commented on NIFI-296:
-------------------------------------
GitHub user markap14 commented on a diff in the pull request:
https://github.com/apache/incubator-nifi/pull/27#discussion_r24956244
--- Diff:
nifi/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/IdentifyMimeType.java
---
@@ -327,148 +168,41 @@ public void process(final InputStream in) throws
IOException {
session.transfer(flowFile, REL_SUCCESS);
}
- private static interface ContentScanningMimeTypeIdentifier {
-
- boolean isEnabled(ProcessContext context);
-
- String getMimeType(InputStream in) throws IOException;
- }
-
- private static class ZipIdentifier implements
ContentScanningMimeTypeIdentifier {
-
- @Override
- public String getMimeType(final InputStream in) throws IOException
{
- final ZipInputStream zipIn = new ZipInputStream(in);
- try {
- if (zipIn.getNextEntry() != null) {
- return "application/zip";
- }
- } catch (final Exception e) {
- }
- return null;
- }
-
- @Override
- public boolean isEnabled(final ProcessContext context) {
- return context.getProperty(IDENTIFY_ZIP).asBoolean();
- }
- }
-
- private static class TarIdentifier implements
ContentScanningMimeTypeIdentifier {
-
- @Override
- public String getMimeType(final InputStream in) throws IOException
{
- try (final TarArchiveInputStream tarIn = new
TarArchiveInputStream(in)) {
- final TarArchiveEntry firstEntry = tarIn.getNextTarEntry();
- if (firstEntry != null) {
- if
(firstEntry.getName().equals(FlowFilePackagerV1.FILENAME_ATTRIBUTES)) {
- final TarArchiveEntry secondEntry =
tarIn.getNextTarEntry();
- if (secondEntry != null &&
secondEntry.getName().equals(FlowFilePackagerV1.FILENAME_CONTENT)) {
- return "application/flowfile-v1";
- }
- }
- return "application/tar";
- }
- } catch (final Exception e) {
- }
- return null;
- }
-
- @Override
- public boolean isEnabled(final ProcessContext context) {
- return context.getProperty(IDENTIFY_TAR).asBoolean();
- }
+ private Detector getFlowFileV3Detector() {
+ return new MagicDetector(FLOWFILE_V3,
FlowFilePackagerV3.MAGIC_HEADER);
}
- private static interface MagicHeader {
-
- int getRequiredBufferLength();
-
- String getMimeType();
-
- boolean matches(final byte[] header);
+ private Detector getFlowFileV1Detector() {
+ return new FlowFileV1Detector();
}
- private static class SimpleMagicHeader implements MagicHeader {
-
- private final String mimeType;
- private final int offset;
- private final byte[] byteSequence;
-
- public SimpleMagicHeader(final String mimeType, final byte[]
byteSequence) {
- this(mimeType, byteSequence, 0);
- }
-
- public SimpleMagicHeader(final String mimeType, final byte[]
byteSequence, final int offset) {
- this.mimeType = mimeType;
- this.byteSequence = byteSequence;
- this.offset = offset;
- }
-
- @Override
- public int getRequiredBufferLength() {
- return byteSequence.length + offset;
- }
-
- @Override
- public String getMimeType() {
- return mimeType;
- }
+ private class FlowFileV1Detector implements Detector {
@Override
- public boolean matches(final byte[] header) {
- if (header.length < getRequiredBufferLength()) {
- return false;
+ public MediaType detect(InputStream in, Metadata mtdt) throws
IOException {
+ // Sanity check the stream. This may not be a tarfile at all
+ in.mark(FlowFilePackagerV1.FILENAME_ATTRIBUTES.length());
--- End diff --
Wow, good call on the first 100 bytes being the filename -- I looked up the
tar format to see if that was indeed the case but found this big, verbose,
confusing explanation of the header that I didn't understand -- I should have
tried Wikipedia first. I am sorry that I doubted you :)
> Extend the capability of IdentifyMimeType and extract document metadata
> -----------------------------------------------------------------------
>
> Key: NIFI-296
> URL: https://issues.apache.org/jira/browse/NIFI-296
> Project: Apache NiFi
> Issue Type: New Feature
> Components: Extensions
> Reporter: Joseph Witt
> Priority: Minor
>
> Apache Tika is pretty awesome and can handle a large range of document types.
> It could perhaps be used to extend the capability of IdentifyMimeType and it
> could also potentially be used to automatically extract document
> metadata/data as flow file attributes to be used for data flow routing
> decisions.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)