apache · phrocker · May 24, 2019
diff --git a/...ain/java/org/apache/nifi/registry/bundle/extract/minificpp/HeaderLocationInputStream.java b/...ain/java/org/apache/nifi/registry/bundle/extract/minificpp/HeaderLocationInputStream.java
@@ -0,0 +1,156 @@
+package org.apache.nifi.registry.bundle.extract.minificpp;
+
+import java.io.*;
+import java.nio.channels.Channels;
+import java.nio.channels.FileChannel;
+import java.util.Objects;
+
+/**
+ *
+ * Description: Simple implementation of Knuth-Morris-Pratt. See http://www.inf.fh-flensburg.de/lang/algorithmen/pattern/kmpen.htm
+ * Since this exists within the MiNiFi CPP package we know that we expect our zip to be at the end of this binary.
+ *
+ * Purpose: Locates the byte headers presented through the constructor
+ *
+ * Justification:
+ * java.util.zip.* and hence all extensions thereof ( including JarInputStream) expect the Zip header to be at the
+ * front of the stream. This is not a requirement by any specification. Unix and windows zip utilities allow
+ * the header to be anywhere within the file. This stream will attempt to locate it.
+ *
+ * If we are uncertain the origin of the input stream we will attempt a forward to back lookup with a buffered
+ * input stream. This will improve lookup speed. If we know that the input stream is a FileInputStream we can
+ * reference the channel to determine the size, and split the file into segment walking back if we inevitably
+ * know the header will be near the end of the file. This provides a benefit and replicates the behavior of *nix
+ * unzip utilities.
+ */
+public class HeaderLocationInputStream extends InputStream {
+
+
+    /**
+     * Private copy of the magic byte sequence to locate; read() replays these bytes first.
+     */
+    private final byte[] magicNumbers;
+    /**
+     * Offset at which the magic bytes were found, counted from where the scan began; -1 until found.
+     */
+    private long headerPosition=-1;
+    /**
+     * Buffered wrapper around the caller-supplied stream; scanned for the magic bytes, then read from.
+     */
+    private final BufferedInputStream baseStream;
+    /**
+     * Index into magicNumbers of the next magic byte that read() will return.
+     */
+    private int currentPosition = 0;
+    /**
+     * Size reported by the file channel; only assigned on the reverse-lookup path.
+     */
+    private long expectedLength = 0;
+
+    /**
+     * Wraps {@code stream} and consumes it up to and including the magic bytes; an empty {@code input}
+     * performs no search.
+     *
+     * @param stream input search stream
+     * @param input magic bytes to locate; copied on construction
+     * @param reverseLookup if true and stream is a FileInputStream, search from the last tenth backwards
+     * @throws IOException if the magic bytes cannot be found
+     */
+    public HeaderLocationInputStream(InputStream stream, final byte [] input, final boolean reverseLookup) throws IOException {
+        Objects.requireNonNull(stream);
+        Objects.requireNonNull(input);
+        baseStream = new BufferedInputStream(stream);
+        magicNumbers = input.clone();
+        if (input.length > 0) {
+            if (reverseLookup && stream instanceof FileInputStream) {
+                FileChannel channel = ((FileInputStream) stream).getChannel();
+                expectedLength = channel.size();
+                // split this up into 10ths.
+                long interval = expectedLength / 10;
+
+                /*
+                 *  Knuth-Morris-Pratt doesn't work well in reverse because a generic InputStream cannot be read backwards.
+                 *  Instead the channel is positioned at the start of the last tenth and scanned forward to
+                 *  the end of the file; on a miss the start moves back one tenth and the scan is repeated,
+                 *  so later segments may be searched redundantly.
+                 *
+                 *  The walk finishes with a scan from offset 0, otherwise a header in the first tenth of
+                 *  the file would never be found. No mark is set on the buffered stream: nothing resets
+                 *  to it, and it would only make the buffer grow towards the mark limit while scanning.
+                 */
+                IOException lastFailure = null;
+                long loc = interval * 9;
+                while (headerPosition == -1 && loc >= 0) {
+                    try {
+                        channel.position(loc);
+                        seekToMagicSequence();
+                    } catch (IOException io) {
+                        // a miss is reported as an IOException; keep it as the cause and retry earlier
+                        lastFailure = io;
+                    }
+                    if (interval == 0) {
+                        break; // file shorter than ten bytes: the scan from offset 0 covered it all
+                    }
+                    loc -= interval;
+                }
+
+                if (headerPosition == -1) {
+                    throw new IOException("Could not find magic header", lastFailure);
+                }
+            } else {
+                seekToMagicSequence();
+            }
+        }
+    }
+
+    /**
+     * Scans baseStream forward (Knuth-Morris-Pratt) until the magic bytes are read; sets headerPosition.
+     * @throws IOException if the stream ends before a match, or on an error in the underlying stream.
+     */
+    private void seekToMagicSequence() throws IOException {
+        // Adapted from https://en.wikipedia.org/wiki/Knuth%E2%80%93Morris%E2%80%93Pratt_algorithm
+        final int[] pattern= new int[magicNumbers.length + 1 ];
+        int i = 0;
+        int j = -1;
+        pattern[i] = j;
+        while (i < magicNumbers.length) {
+            while (j >= 0 && magicNumbers[i] != magicNumbers[j]) {
+                j = pattern[j];
+            }
+            pattern[++i] = ++j;
+        }
+
+        // The table build leaves j at the border length of the whole pattern; matching must start at 0,
+        // otherwise a self-overlapping pattern would be treated as already partially matched.
+        j = 0;
+        long bytesConsumed = 0;
+        int myByte;
+        while ((myByte = baseStream.read()) != -1) {
+            bytesConsumed++;
+            while (j >= 0 && (byte)myByte  != magicNumbers[j]) {
+                j = pattern[j];
+            }
+            if (++j == magicNumbers.length) {
+                headerPosition = bytesConsumed - magicNumbers.length;
+                return;
+            }
+        }
+        throw new IOException("Could not find magic header");
+    }
+
+
+    /**
+     * Returns the magic bytes first (the search already consumed them from the underlying stream),
+     * then continues with the bytes that follow them in the underlying stream.
+     * @return the next byte as a value in 0-255, or -1 at end of stream
+     * @throws IOException on an error in the underlying stream
+     */
+    @Override
+    public int read() throws IOException {
+        if (currentPosition < magicNumbers.length) {
+            // mask to 0-255: a sign-extended byte >= 0x80 would be negative, and 0xFF would read as EOF
+            return magicNumbers[currentPosition++] & 0xFF;
+        }
+        return baseStream.read();
+    }
+}
diff --git a/...main/java/org/apache/nifi/registry/bundle/extract/minificpp/MiNiFiCppBundleExtractor.java b/...main/java/org/apache/nifi/registry/bundle/extract/minificpp/MiNiFiCppBundleExtractor.java
@@ -16,21 +16,60 @@
  */
 package org.apache.nifi.registry.bundle.extract.minificpp;
 
+import org.apache.nifi.registry.bundle.extract.nar.NarBundleExtractor;
 import org.apache.nifi.registry.bundle.model.BundleDetails;
-import org.apache.nifi.registry.bundle.extract.BundleExtractor;
+import org.apache.nifi.registry.extension.component.manifest.Extension;
 
+import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.text.ParseException;
 
 /**
- * ExtensionBundleExtractor for MiNiFi CPP extensions.
+ * Description: Layers a header location stream to locate the JAR entries near the end of the file, which
+ * is the expected location of the file format.
+ *
+ * This can and may be adjusted for differing binary types; however, the expectation is that we locate
+ * a zip/jar of some sort that we can extract via the base class.
+ *
+ * Purpose: Provides BundleDetails for MiNiFi CPP binaries
+ *
+ *
+ * Design:
+ *
+ * Input files are expected to have a zip archive appended at the end. As a result,
+ * the binary is still executable, but can carry a payload as needed.
+ *
  */
-public class MiNiFiCppBundleExtractor implements BundleExtractor {
+public class MiNiFiCppBundleExtractor extends NarBundleExtractor {
+
+    /**
+     * Zip local file header signature: the bytes 'P', 'K', 0x03, 0x04. Passed to HeaderLocationInputStream in extract.
+     */
+    public static final byte [] MAGIC_HEADER = new byte[] {(byte) 0x50, (byte) 0x4B,(byte) 0x03, (byte) 0x04};
+
+    /** Parses a build timestamp in the format NarBundleExtractor accepts, falling back to a plain long value. */
+    @Override
+    protected long getBuildTime(final String timeStamp) throws ParseException {
+        try {
+            // still want to support opening NARs as we will be delivering some binaries as NAR files.
+            return super.getBuildTime(timeStamp);
+        } catch (final ParseException ignored) {
+            // not in the NAR timestamp format; fall through and try a plain long
+        }
+        try {
+            return Long.parseLong(timeStamp);
+        } catch (final NumberFormatException nfe) {
+            throw (ParseException) new ParseException("Could not parse " + timeStamp + " as a valid long", 0).initCause(nfe);
+        }
+    }
 
     @Override
     public BundleDetails extract(final InputStream inputStream) throws IOException {
-        // TODO implement
-        throw new UnsupportedOperationException("Minifi CPP extensions are not yet supported");
+
+        // Reverse lookup is disabled for now to stay backwards compatible with NARs and to keep the
+        // door open for other archive types.
+        return super.extract(new HeaderLocationInputStream(inputStream,MAGIC_HEADER,false));
     }
 
 }
diff --git a/...e-utils/src/main/java/org/apache/nifi/registry/bundle/extract/nar/NarBundleExtractor.java b/...e-utils/src/main/java/org/apache/nifi/registry/bundle/extract/nar/NarBundleExtractor.java
@@ -20,20 +20,15 @@
 import org.apache.nifi.registry.bundle.extract.BundleExtractor;
 import org.apache.nifi.registry.bundle.extract.nar.docs.ExtensionManifestParser;
 import org.apache.nifi.registry.bundle.extract.nar.docs.JacksonExtensionManifestParser;
-import org.apache.nifi.registry.bundle.model.BundleIdentifier;
 import org.apache.nifi.registry.bundle.model.BundleDetails;
+import org.apache.nifi.registry.bundle.model.BundleIdentifier;
 import org.apache.nifi.registry.extension.bundle.BuildInfo;
 import org.apache.nifi.registry.extension.component.manifest.ExtensionManifest;
 
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.FilterInputStream;
-import java.io.IOException;
-import java.io.InputStream;
+import java.io.*;
 import java.nio.charset.StandardCharsets;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
-import java.util.Date;
 import java.util.jar.Attributes;
 import java.util.jar.JarEntry;
 import java.util.jar.JarInputStream;
@@ -122,6 +117,11 @@ private BundleIdentifier getDependencyBundleCoordinate(final Attributes attribut
         }
     }
 
+    /** Parses {@code timeStamp} with BUILT_TIMESTAMP_FORMAT and returns the resulting date's millisecond time value. */
+    protected long getBuildTime(final String timeStamp) throws ParseException {
+        return new SimpleDateFormat(BUILT_TIMESTAMP_FORMAT).parse(timeStamp).getTime();
+    }
+
     private BuildInfo getBuildInfo(final Attributes attributes) {
         final String buildBranch = attributes.getValue(NarManifestEntry.BUILD_BRANCH.getManifestName());
         final String buildTag = attributes.getValue(NarManifestEntry.BUILD_TAG.getManifestName());
@@ -130,16 +130,15 @@ private BuildInfo getBuildInfo(final Attributes attributes) {
         final String buildJdk = attributes.getValue(NarManifestEntry.BUILD_JDK.getManifestName());
         final String builtBy = attributes.getValue(NarManifestEntry.BUILT_BY.getManifestName());
 
-        final SimpleDateFormat simpleDateFormat = new SimpleDateFormat(BUILT_TIMESTAMP_FORMAT);
         try {
-            final Date buildDate = simpleDateFormat.parse(buildTimestamp);
+            final long buildTime = getBuildTime(buildTimestamp);
 
             final BuildInfo buildInfo = new BuildInfo();
             buildInfo.setBuildTool(isBlank(buildJdk) ? NA : buildJdk);
             buildInfo.setBuildBranch(isBlank(buildBranch) ? NA : buildBranch);
             buildInfo.setBuildTag(isBlank(buildTag) ? NA : buildTag);
             buildInfo.setBuildRevision(isBlank(buildRevision) ? NA : buildRevision);
-            buildInfo.setBuilt(buildDate.getTime());
+            buildInfo.setBuilt(buildTime);
             buildInfo.setBuiltBy(isBlank(builtBy) ? NA : builtBy);
             buildInfo.setBuildFlags(NA);
             return buildInfo;