import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.detect.Detector;
import org.apache.tika.mime.MediaType;
import org.apache.tika.io.TikaInputStream;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.BufferedInputStream;
import java.nio.file.Paths;

/**
 * Command-line example that extracts the text content and the metadata of a single file
 * with Apache Tika and prints both to standard output.
 *
 * <p>Usage: {@code java TikaExample <file-path>}
 */
public class TikaExample {
    /**
     * Echoes the command-line arguments, then parses the file named by {@code args[0]} and
     * prints its extracted text followed by every metadata entry.
     *
     * <p>If no file path is supplied, an error message is written to standard error and the
     * method returns without parsing. Any failure while opening, detecting or parsing the
     * file is reported as a stack trace on standard error.
     *
     * @param args command-line arguments; {@code args[0]} is the path of the file to parse
     */
    public static void main(String[] args) {
        System.out.println("Arguments:");
        for (String arg : args) {
            System.out.println(arg);
        }

        // Validate up front instead of throwing a generic Exception just to catch it below.
        if (args.length < 1) {
            System.err.println("Wrong arguments, missing file path");
            return;
        }

        String filePath = args[0];
        System.out.println("Reading file:" + filePath);

        // Parser that picks the concrete parser from the detected document type.
        AutoDetectParser parser = new AutoDetectParser();

        // Collects the body text. -1 disables the handler's write limit; the no-arg
        // constructor caps the buffer and aborts the parse with a SAXException once the
        // cap is reached, so large documents would print no content at all.
        BodyContentHandler handler = new BodyContentHandler(-1);

        // Receives the metadata discovered while detecting and parsing.
        Metadata metadata = new Metadata();

        ParseContext context = new ParseContext();

        // try-with-resources guarantees the TikaInputStream is closed even when detection
        // or parsing fails.
        try (TikaInputStream stream = TikaInputStream.get(Paths.get(filePath), metadata)) {
            Detector detector = parser.getDetector();
            MediaType mediaType = detector.detect(stream, metadata);

            // Internal processor used to extract Embedded data (i.e. images) from documents
            // EmbeddedDocumentExtractor customExtractor = new CustomContentProcessor();
            // context.set(EmbeddedDocumentExtractor.class, customExtractor);

            if (mediaType.toString().startsWith("message/rfc822")) {
                // TODO: configure the parser/context here so that all the email text is
                // included in the data sent to the handler (except embedded content such
                // as images). Intentionally a no-op for now.
            }

            // The same stream that was used for detection is handed to the parser.
            parser.parse(stream, handler, metadata, context);

            System.out.println("File Content:\n" + handler.toString());

            System.out.println("\nSerialized metadata:");
            for (String name : metadata.names()) {
                System.out.println(name + ": " + metadata.get(name));
            }
        } catch (Exception e) {
            // Top-level boundary of a CLI example: report the failure and end normally.
            e.printStackTrace();
        }
    }
}