apache
diff --git a/parquet-cli/pom.xml b/parquet-cli/pom.xml
Lines changed: 12 additions & 0 deletions
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpConstants.java b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpConstants.java
Lines changed: 5 additions & 2 deletions
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpEncoderDecoder.java b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpEncoderDecoder.java
Lines changed: 98 additions & 17 deletions
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesReaderForDouble.java b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesReaderForDouble.java
Lines changed: 19 additions & 11 deletions
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesReaderForFloat.java b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesReaderForFloat.java
Lines changed: 19 additions & 12 deletions
@@ -87,6 +87,18 @@
     </dependency>
 
     <!-- Protobuf dependencies for CLI Tests -->
+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-core</artifactId>
+      <version>4.11.0</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-inline</artifactId>
+      <version>4.11.0</version>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.parquet</groupId>
       <artifactId>parquet-protobuf</artifactId>
 
@@ -53,8 +53,11 @@ private AlpConstants() {
   static final int FLOAT_MAX_EXPONENT = 10;
   static final int DOUBLE_MAX_EXPONENT = 18;
 
-  // Preset caching: full search for the first N vectors, then lock in the top combos
-  static final int SAMPLER_SAMPLE_VECTORS = 8;
+  // Sampler constants matching C++ AlpConstants.
+  // Sample SAMPLER_SAMPLE_VECTORS_PER_ROWGROUP vectors evenly distributed across a rowgroup
+  // of SAMPLER_ROWGROUP_SIZE values, then lock in top MAX_PRESET_COMBINATIONS combos.
+  static final int SAMPLER_ROWGROUP_SIZE = 122_880;
+  static final int SAMPLER_SAMPLE_VECTORS_PER_ROWGROUP = 8;
   static final int MAX_PRESET_COMBINATIONS = 5;
 
   // Magic numbers for the fast-rounding trick (see ALP paper, Section 3.2)
 
@@ -170,26 +170,51 @@ public static class EncodingParams {
     }
   }
 
-  /** Try all (exponent, factor) combos and pick the one with fewest exceptions. */
+  /**
+   * Try all (exponent, factor) combos and pick the one with the smallest estimated compressed size.
+   *
+   * <p>Estimated size (in bits) = {@code length * bitWidth + exceptions * (Float.SIZE + Short.SIZE)},
+   * where bitWidth is the number of bits needed to represent the unsigned range of non-exception
+   * encoded values after frame-of-reference subtraction. This matches the C++ ALP cost model and
+   * produces better compression ratios than minimizing exception count alone.
+   */
   static EncodingParams findBestFloatParams(float[] values, int offset, int length) {
     int bestExponent = 0;
     int bestFactor = 0;
     int bestExceptions = length;
+    long bestEstimatedSize = Long.MAX_VALUE;
 
     for (int e = 0; e <= FLOAT_MAX_EXPONENT; e++) {
       for (int f = 0; f <= e; f++) {
         int exceptions = 0;
+        int minEncoded = Integer.MAX_VALUE;
+        int maxEncoded = Integer.MIN_VALUE;
         for (int i = 0; i < length; i++) {
-          if (isFloatException(values[offset + i], e, f)) {
+          float value = values[offset + i];
+          if (isFloatException(value, e, f)) {
             exceptions++;
+          } else {
+            int encoded = encodeFloat(value, e, f);
+            if (encoded < minEncoded) minEncoded = encoded;
+            if (encoded > maxEncoded) maxEncoded = encoded;
           }
         }
-        if (exceptions < bestExceptions) {
+        int nonExceptions = length - exceptions;
+        if (nonExceptions == 0) continue;
+        long delta = (nonExceptions < 2) ? 0 :
+            Integer.toUnsignedLong(maxEncoded) - Integer.toUnsignedLong(minEncoded);
+        int bitsPerValue = (delta == 0) ? 0 : (64 - Long.numberOfLeadingZeros(delta));
+        long estimatedSize = (long) length * bitsPerValue
+            + (long) exceptions * (Float.SIZE + Short.SIZE);
+        if (estimatedSize < bestEstimatedSize
+            || (estimatedSize == bestEstimatedSize
+                && (e > bestExponent || (e == bestExponent && f > bestFactor)))) {
+          bestEstimatedSize = estimatedSize;
           bestExponent = e;
           bestFactor = f;
           bestExceptions = exceptions;
-          if (bestExceptions == 0) {
-            return new EncodingParams(bestExponent, bestFactor, bestExceptions);
+          if (bestExceptions == 0 && bitsPerValue == 0) {
+            return new EncodingParams(bestExponent, bestFactor, 0);
           }
         }
       }
@@ -202,74 +227,130 @@ static EncodingParams findBestFloatParamsWithPresets(float[] values, int offset,
     int bestExponent = presets[0][0];
     int bestFactor = presets[0][1];
     int bestExceptions = length;
+    long bestEstimatedSize = Long.MAX_VALUE;
 
     for (int[] preset : presets) {
       int e = preset[0];
       int f = preset[1];
       int exceptions = 0;
+      int minEncoded = Integer.MAX_VALUE;
+      int maxEncoded = Integer.MIN_VALUE;
       for (int i = 0; i < length; i++) {
-        if (isFloatException(values[offset + i], e, f)) {
+        float value = values[offset + i];
+        if (isFloatException(value, e, f)) {
           exceptions++;
+        } else {
+          int encoded = encodeFloat(value, e, f);
+          if (encoded < minEncoded) minEncoded = encoded;
+          if (encoded > maxEncoded) maxEncoded = encoded;
         }
       }
-      if (exceptions < bestExceptions) {
+      int nonExceptions = length - exceptions;
+      if (nonExceptions == 0) continue;
+      long delta = (nonExceptions < 2) ? 0 :
+          Integer.toUnsignedLong(maxEncoded) - Integer.toUnsignedLong(minEncoded);
+      int bitsPerValue = (delta == 0) ? 0 : (64 - Long.numberOfLeadingZeros(delta));
+      long estimatedSize = (long) length * bitsPerValue
+          + (long) exceptions * (Float.SIZE + Short.SIZE);
+      if (estimatedSize < bestEstimatedSize
+          || (estimatedSize == bestEstimatedSize
+              && (e > bestExponent || (e == bestExponent && f > bestFactor)))) {
+        bestEstimatedSize = estimatedSize;
         bestExponent = e;
         bestFactor = f;
         bestExceptions = exceptions;
-        if (bestExceptions == 0) {
-          return new EncodingParams(bestExponent, bestFactor, bestExceptions);
+        if (bestExceptions == 0 && bitsPerValue == 0) {
+          return new EncodingParams(bestExponent, bestFactor, 0);
         }
       }
     }
     return new EncodingParams(bestExponent, bestFactor, bestExceptions);
   }
 
+  /** Try all (exponent, factor) combos and pick the one with the smallest estimated compressed size. */
   static EncodingParams findBestDoubleParams(double[] values, int offset, int length) {
     int bestExponent = 0;
     int bestFactor = 0;
     int bestExceptions = length;
+    long bestEstimatedSize = Long.MAX_VALUE;
 
     for (int e = 0; e <= DOUBLE_MAX_EXPONENT; e++) {
       for (int f = 0; f <= e; f++) {
         int exceptions = 0;
+        long minEncoded = Long.MAX_VALUE;
+        long maxEncoded = Long.MIN_VALUE;
         for (int i = 0; i < length; i++) {
-          if (isDoubleException(values[offset + i], e, f)) {
+          double value = values[offset + i];
+          if (isDoubleException(value, e, f)) {
             exceptions++;
+          } else {
+            long encoded = encodeDouble(value, e, f);
+            if (encoded < minEncoded) minEncoded = encoded;
+            if (encoded > maxEncoded) maxEncoded = encoded;
           }
         }
-        if (exceptions < bestExceptions) {
+        int nonExceptions = length - exceptions;
+        if (nonExceptions == 0) continue;
+        // Two's-complement subtraction yields the exact unsigned range even when it overflows a
+        // signed long; a wrapped (negative) delta has no leading zeros and so costs 64 bits.
+        long delta = (nonExceptions < 2) ? 0 : (maxEncoded - minEncoded);
+        int bitsPerValue = (delta == 0) ? 0 : (64 - Long.numberOfLeadingZeros(delta));
+        long estimatedSize = (long) length * bitsPerValue
+            + (long) exceptions * (Double.SIZE + Short.SIZE);
+        if (estimatedSize < bestEstimatedSize
+            || (estimatedSize == bestEstimatedSize
+                && (e > bestExponent || (e == bestExponent && f > bestFactor)))) {
+          bestEstimatedSize = estimatedSize;
           bestExponent = e;
           bestFactor = f;
           bestExceptions = exceptions;
-          if (bestExceptions == 0) {
-            return new EncodingParams(bestExponent, bestFactor, bestExceptions);
+          if (bestExceptions == 0 && bitsPerValue == 0) {
+            return new EncodingParams(bestExponent, bestFactor, 0);
           }
         }
       }
     }
     return new EncodingParams(bestExponent, bestFactor, bestExceptions);
   }
 
+  /** Same as findBestDoubleParams but only tries the cached preset combos. */
   static EncodingParams findBestDoubleParamsWithPresets(double[] values, int offset, int length, int[][] presets) {
     int bestExponent = presets[0][0];
     int bestFactor = presets[0][1];
     int bestExceptions = length;
+    long bestEstimatedSize = Long.MAX_VALUE;
 
     for (int[] preset : presets) {
       int e = preset[0];
       int f = preset[1];
       int exceptions = 0;
+      long minEncoded = Long.MAX_VALUE;
+      long maxEncoded = Long.MIN_VALUE;
       for (int i = 0; i < length; i++) {
-        if (isDoubleException(values[offset + i], e, f)) {
+        double value = values[offset + i];
+        if (isDoubleException(value, e, f)) {
           exceptions++;
+        } else {
+          long encoded = encodeDouble(value, e, f);
+          if (encoded < minEncoded) minEncoded = encoded;
+          if (encoded > maxEncoded) maxEncoded = encoded;
         }
       }
-      if (exceptions < bestExceptions) {
+      int nonExceptions = length - exceptions;
+      if (nonExceptions == 0) continue;
+      long delta = (nonExceptions < 2) ? 0 : (maxEncoded - minEncoded);
+      int bitsPerValue = (delta == 0) ? 0 : (64 - Long.numberOfLeadingZeros(delta));
+      long estimatedSize = (long) length * bitsPerValue
+          + (long) exceptions * (Double.SIZE + Short.SIZE);
+      if (estimatedSize < bestEstimatedSize
+          || (estimatedSize == bestEstimatedSize
+              && (e > bestExponent || (e == bestExponent && f > bestFactor)))) {
+        bestEstimatedSize = estimatedSize;
         bestExponent = e;
         bestFactor = f;
         bestExceptions = exceptions;
-        if (bestExceptions == 0) {
-          return new EncodingParams(bestExponent, bestFactor, bestExceptions);
+        if (bestExceptions == 0 && bitsPerValue == 0) {
+          return new EncodingParams(bestExponent, bestFactor, 0);
         }
       }
     }
 
@@ -34,6 +34,10 @@
 public class AlpValuesReaderForDouble extends AlpValuesReader {
 
   private double[] decodedValues;
+  private long[] deltasBuffer;
+  private int[] excPositionsBuffer;
+  private final long[] unpackPadBuf = new long[8];
+  private byte[] unpackByteBuf;
 
   public AlpValuesReaderForDouble() {
     super();
@@ -42,6 +46,9 @@ public AlpValuesReaderForDouble() {
   @Override
   protected void allocateDecodedBuffer(int capacity) {
     this.decodedValues = new double[capacity];
+    this.deltasBuffer = new long[capacity];
+    this.excPositionsBuffer = new int[capacity];
+    this.unpackByteBuf = new byte[Long.SIZE]; // 8 values x 64-bit max width = 64 bytes
   }
 
   @Override
@@ -69,24 +76,24 @@ protected void decodeVector(int vectorIdx) {
     int bitWidth = vectorsData.get(pos + 8) & 0xFF;
     pos += DOUBLE_FOR_INFO_SIZE;
 
-    long[] deltas = new long[vectorLen];
     if (bitWidth > 0) {
-      pos = unpackLongsWithBytePacker(vectorsData, pos, deltas, vectorLen, bitWidth);
+      pos = unpackLongsWithBytePacker(vectorsData, pos, deltasBuffer, vectorLen, bitWidth);
+    } else {
+      java.util.Arrays.fill(deltasBuffer, 0, vectorLen, 0L);
     }
 
     for (int i = 0; i < vectorLen; i++) {
-      long encoded = deltas[i] + frameOfReference;
+      long encoded = deltasBuffer[i] + frameOfReference;
       decodedValues[i] = AlpEncoderDecoder.decodeDouble(encoded, exponent, factor);
     }
 
     if (numExceptions > 0) {
-      int[] excPositions = new int[numExceptions];
       for (int e = 0; e < numExceptions; e++) {
-        excPositions[e] = getShortLE(vectorsData, pos) & 0xFFFF;
+        excPositionsBuffer[e] = getShortLE(vectorsData, pos) & 0xFFFF;
         pos += Short.BYTES;
       }
       for (int e = 0; e < numExceptions; e++) {
-        decodedValues[excPositions[e]] = getDoubleLE(vectorsData, pos);
+        decodedValues[excPositionsBuffer[e]] = getDoubleLE(vectorsData, pos);
         pos += Double.BYTES;
       }
     }
@@ -109,14 +116,15 @@ private int unpackLongsWithBytePacker(ByteBuffer buf, int pos, long[] output, in
       int alreadyRead = numFullGroups * bitWidth;
       int partialBytes = totalPackedBytes - alreadyRead;
 
-      byte[] padded = new byte[bitWidth];
       for (int i = 0; i < partialBytes; i++) {
-        padded[i] = buf.get(pos + i);
+        unpackByteBuf[i] = buf.get(pos + i);
+      }
+      for (int i = partialBytes; i < bitWidth; i++) {
+        unpackByteBuf[i] = 0;
       }
 
-      long[] temp = new long[8];
-      packer.unpack8Values(padded, 0, temp, 0);
-      System.arraycopy(temp, 0, output, numFullGroups * 8, remaining);
+      packer.unpack8Values(unpackByteBuf, 0, unpackPadBuf, 0);
+      System.arraycopy(unpackPadBuf, 0, output, numFullGroups * 8, remaining);
       pos += partialBytes;
     }
 
 
@@ -34,6 +34,10 @@
 public class AlpValuesReaderForFloat extends AlpValuesReader {
 
   private float[] decodedValues;
+  private int[] deltasBuffer;
+  private int[] excPositionsBuffer;
+  private final int[] unpackPadBuf = new int[8];
+  private byte[] unpackByteBuf;
 
   public AlpValuesReaderForFloat() {
     super();
@@ -42,6 +46,9 @@ public AlpValuesReaderForFloat() {
   @Override
   protected void allocateDecodedBuffer(int capacity) {
     this.decodedValues = new float[capacity];
+    this.deltasBuffer = new int[capacity];
+    this.excPositionsBuffer = new int[capacity];
+    this.unpackByteBuf = new byte[Integer.SIZE]; // 8 values x 32-bit max width = 32 bytes
   }
 
   @Override
@@ -69,26 +76,25 @@ protected void decodeVector(int vectorIdx) {
     int bitWidth = vectorsData.get(pos + 4) & 0xFF;
     pos += FLOAT_FOR_INFO_SIZE;
 
-    int[] deltas = new int[vectorLen];
     if (bitWidth > 0) {
-      pos = unpackIntsWithBytePacker(vectorsData, pos, deltas, vectorLen, bitWidth);
+      pos = unpackIntsWithBytePacker(vectorsData, pos, deltasBuffer, vectorLen, bitWidth);
+    } else {
+      java.util.Arrays.fill(deltasBuffer, 0, vectorLen, 0);
     }
 
-    // Reverse the frame-of-reference subtraction, then decimal-decode
     for (int i = 0; i < vectorLen; i++) {
-      int encoded = deltas[i] + frameOfReference;
+      int encoded = deltasBuffer[i] + frameOfReference;
       decodedValues[i] = AlpEncoderDecoder.decodeFloat(encoded, exponent, factor);
     }
 
     // Overwrite exception slots with their original float values
     if (numExceptions > 0) {
-      int[] excPositions = new int[numExceptions];
       for (int e = 0; e < numExceptions; e++) {
-        excPositions[e] = getShortLE(vectorsData, pos) & 0xFFFF;
+        excPositionsBuffer[e] = getShortLE(vectorsData, pos) & 0xFFFF;
         pos += Short.BYTES;
       }
       for (int e = 0; e < numExceptions; e++) {
-        decodedValues[excPositions[e]] = getFloatLE(vectorsData, pos);
+        decodedValues[excPositionsBuffer[e]] = getFloatLE(vectorsData, pos);
         pos += Float.BYTES;
       }
     }
@@ -110,14 +116,15 @@ private int unpackIntsWithBytePacker(ByteBuffer buf, int pos, int[] output, int
       int alreadyRead = numFullGroups * bitWidth;
       int partialBytes = totalPackedBytes - alreadyRead;
 
-      byte[] padded = new byte[bitWidth];
       for (int i = 0; i < partialBytes; i++) {
-        padded[i] = buf.get(pos + i);
+        unpackByteBuf[i] = buf.get(pos + i);
+      }
+      for (int i = partialBytes; i < bitWidth; i++) {
+        unpackByteBuf[i] = 0;
       }
 
-      int[] temp = new int[8];
-      packer.unpack8Values(padded, 0, temp, 0);
-      System.arraycopy(temp, 0, output, numFullGroups * 8, remaining);
+      packer.unpack8Values(unpackByteBuf, 0, unpackPadBuf, 0);
+      System.arraycopy(unpackPadBuf, 0, output, numFullGroups * 8, remaining);
       pos += partialBytes;
     }