This is an automated email from the ASF dual-hosted git repository.

vogievetsky pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/druid.git


The following commit(s) were added to refs/heads/master by this push:
     new 132a1c9  Re-order and document format detection in web console (#9887)
132a1c9 is described below

commit 132a1c9fe770209792973f52b8aee6f36f06aa3d
Author: Joseph Glanville <j...@jpg.id.au>
AuthorDate: Fri May 22 06:29:39 2020 +0700

    Re-order and document format detection in web console (#9887)
    
    The motivation for this change is to avoid inadvertently identifying binary
    formats that contain uncompressed string data as TSV or CSV.
    
    Checking for magic byte headers before applying the string heuristics
    should be more robust in general.
---
 web-console/src/utils/ingestion-spec.tsx | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/web-console/src/utils/ingestion-spec.tsx b/web-console/src/utils/ingestion-spec.tsx
index 0bc3ff3..9a2d700 100644
--- a/web-console/src/utils/ingestion-spec.tsx
+++ b/web-console/src/utils/ingestion-spec.tsx
@@ -2676,29 +2676,35 @@ function guessInputFormat(sampleData: string[]): InputFormat {
   if (sampleDatum) {
     sampleDatum = String(sampleDatum); // Really ensure it is a string
 
-    if (sampleDatum.startsWith('{') && sampleDatum.endsWith('}')) {
-      return inputFormatFromType('json');
-    }
-
-    if (sampleDatum.split('\t').length > 3) {
-      return inputFormatFromType('tsv', !/\t\d+\t/.test(sampleDatum));
-    }
-
-    if (sampleDatum.split(',').length > 3) {
-      return inputFormatFromType('csv', !/,\d+,/.test(sampleDatum));
-    }
+    // First check for magic byte sequences as they rarely yield false positives
 
+    // Parquet 4 byte magic header: https://github.com/apache/parquet-format#file-format
     if (sampleDatum.startsWith('PAR1')) {
       return inputFormatFromType('parquet');
     }
-
+    // ORC 3 byte magic header: https://orc.apache.org/specification/ORCv1/
     if (sampleDatum.startsWith('ORC')) {
       return inputFormatFromType('orc');
     }
-
+    // Avro OCF 4 byte magic header: https://avro.apache.org/docs/current/spec.html#Object+Container+Files
     if (sampleDatum.startsWith('Obj1')) {
       return inputFormatFromType('avro_ocf');
     }
+
+    // After checking for magic byte sequences perform heuristics to deduce string formats
+
+    // If the string starts and ends with curly braces assume JSON
+    if (sampleDatum.startsWith('{') && sampleDatum.endsWith('}')) {
+      return inputFormatFromType('json');
+    }
+    // Contains at least 3 tabs (i.e. splits into more than 3 fields): assume TSV
+    if (sampleDatum.split('\t').length > 3) {
+      return inputFormatFromType('tsv', !/\t\d+\t/.test(sampleDatum));
+    }
+    // Contains at least 3 commas (i.e. splits into more than 3 fields): assume CSV
+    if (sampleDatum.split(',').length > 3) {
+      return inputFormatFromType('csv', !/,\d+,/.test(sampleDatum));
+    }
   }
 
   return inputFormatFromType('regex');


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@druid.apache.org
For additional commands, e-mail: commits-h...@druid.apache.org

Reply via email to