|
| 1 | +/* |
| 2 | + * Licensed to the Apache Software Foundation (ASF) under one |
| 3 | + * or more contributor license agreements. See the NOTICE file |
| 4 | + * distributed with this work for additional information |
| 5 | + * regarding copyright ownership. The ASF licenses this file |
| 6 | + * to you under the Apache License, Version 2.0 (the |
| 7 | + * "License"); you may not use this file except in compliance |
| 8 | + * with the License. You may obtain a copy of the License at |
| 9 | + * |
| 10 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | + * |
| 12 | + * Unless required by applicable law or agreed to in writing, |
| 13 | + * software distributed under the License is distributed on an |
| 14 | + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 15 | + * KIND, either express or implied. See the License for the |
| 16 | + * specific language governing permissions and limitations |
| 17 | + * under the License. |
| 18 | + */ |
| 19 | + |
| 20 | +package org.apache.parquet.column.values.alp; |
| 21 | + |
import static org.junit.Assert.assertTrue;
import static org.junit.Assume.assumeTrue;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.junit.Test;
| 32 | + |
| 33 | +/** |
| 34 | + * Measures exception counts and compression ratios using the real Spotify and Arade datasets from |
| 35 | + * the parquet-testing repository (Pratik's alpFloatingPointDataset branch). |
| 36 | + * |
| 37 | + * <p>These are the same datasets used in the meeting discussion to compare Java vs C++ ALP |
| 38 | + * exception counts. Run from the project root with the alp-test-data/ directory present: |
| 39 | + * |
| 40 | + * <pre> |
| 41 | + * ./mvnw test -pl parquet-column -Dtest=AlpExceptionCountTest |
| 42 | + * </pre> |
| 43 | + * |
| 44 | + * <p>To use a different data directory: |
| 45 | + * |
| 46 | + * <pre> |
| 47 | + * ./mvnw test -pl parquet-column -Dtest=AlpExceptionCountTest \ |
| 48 | + * -DALP_TEST_DATA_DIR=/path/to/alp-test-data |
| 49 | + * </pre> |
| 50 | + */ |
| 51 | +public class AlpExceptionCountTest { |
| 52 | + |
| 53 | + private static final int VECTOR_SIZE = AlpConstants.DEFAULT_VECTOR_SIZE; |
| 54 | + |
| 55 | + private File getDataDir() { |
| 56 | + String dir = System.getProperty("ALP_TEST_DATA_DIR"); |
| 57 | + if (dir == null) dir = System.getenv("ALP_TEST_DATA_DIR"); |
| 58 | + if (dir != null && new File(dir).isDirectory()) return new File(dir); |
| 59 | + // Default: alp-test-data/ at project root (three levels up from module target) |
| 60 | + File candidate = new File(System.getProperty("user.dir")).getParentFile(); |
| 61 | + if (candidate != null) { |
| 62 | + File d = new File(candidate, "alp-test-data"); |
| 63 | + if (d.isDirectory()) return d; |
| 64 | + } |
| 65 | + // Also try relative to cwd |
| 66 | + File d2 = new File(System.getProperty("user.dir"), "alp-test-data"); |
| 67 | + return d2.isDirectory() ? d2 : null; |
| 68 | + } |
| 69 | + |
| 70 | + /** Reads a CSV file into per-column double arrays. Handles both ',' and '|' delimiters. */ |
| 71 | + private List<double[]> readCsvColumns(File file) throws IOException { |
| 72 | + List<List<Double>> cols = new ArrayList<>(); |
| 73 | + String delim = null; |
| 74 | + try (BufferedReader br = new BufferedReader(new FileReader(file))) { |
| 75 | + String line; |
| 76 | + boolean header = true; |
| 77 | + while ((line = br.readLine()) != null) { |
| 78 | + if (delim == null) delim = line.contains("|") ? "\\|" : ","; |
| 79 | + String[] parts = line.split(delim); |
| 80 | + if (header) { |
| 81 | + for (String ignored : parts) cols.add(new ArrayList<>()); |
| 82 | + header = false; |
| 83 | + continue; |
| 84 | + } |
| 85 | + for (int i = 0; i < parts.length && i < cols.size(); i++) { |
| 86 | + try { |
| 87 | + cols.get(i).add(Double.parseDouble(parts[i].trim())); |
| 88 | + } catch (NumberFormatException e) { |
| 89 | + cols.get(i).add(0.0); |
| 90 | + } |
| 91 | + } |
| 92 | + } |
| 93 | + } |
| 94 | + List<double[]> result = new ArrayList<>(); |
| 95 | + for (List<Double> col : cols) { |
| 96 | + double[] arr = new double[col.size()]; |
| 97 | + for (int i = 0; i < arr.length; i++) arr[i] = col.get(i); |
| 98 | + result.add(arr); |
| 99 | + } |
| 100 | + return result; |
| 101 | + } |
| 102 | + |
| 103 | + /** Reads the header row of a CSV file. */ |
| 104 | + private String[] readHeader(File file) throws IOException { |
| 105 | + try (BufferedReader br = new BufferedReader(new FileReader(file))) { |
| 106 | + String line = br.readLine(); |
| 107 | + if (line == null) return new String[0]; |
| 108 | + String delim = line.contains("|") ? "\\|" : ","; |
| 109 | + return line.split(delim); |
| 110 | + } |
| 111 | + } |
| 112 | + |
| 113 | + /** Counts total exceptions across all vectors for a double column. */ |
| 114 | + private int countExceptions(double[] values) { |
| 115 | + int totalExceptions = 0; |
| 116 | + for (int offset = 0; offset < values.length; offset += VECTOR_SIZE) { |
| 117 | + int len = Math.min(VECTOR_SIZE, values.length - offset); |
| 118 | + AlpEncoderDecoder.EncodingParams params = |
| 119 | + AlpEncoderDecoder.findBestDoubleParams(values, offset, len); |
| 120 | + totalExceptions += params.numExceptions; |
| 121 | + } |
| 122 | + return totalExceptions; |
| 123 | + } |
| 124 | + |
| 125 | + private void reportDataset(String label, File file) throws IOException { |
| 126 | + List<double[]> columns = readCsvColumns(file); |
| 127 | + String[] headers = readHeader(file); |
| 128 | + int rows = columns.isEmpty() ? 0 : columns.get(0).length; |
| 129 | + int numVectors = (int) Math.ceil((double) rows / VECTOR_SIZE); |
| 130 | + |
| 131 | + System.out.printf("%n=== %s (%d rows, %d cols, %d vectors/col) ===%n", |
| 132 | + label, rows, columns.size(), numVectors); |
| 133 | + System.out.printf(" %-20s %6s %6s %7s%n", "column", "rows", "exc", "exc%"); |
| 134 | + System.out.printf(" %-20s %6s %6s %7s%n", "------", "----", "---", "----"); |
| 135 | + |
| 136 | + int totalExc = 0; |
| 137 | + int totalRows = 0; |
| 138 | + for (int i = 0; i < columns.size(); i++) { |
| 139 | + double[] col = columns.get(i); |
| 140 | + int exc = countExceptions(col); |
| 141 | + totalExc += exc; |
| 142 | + totalRows += col.length; |
| 143 | + String name = (i < headers.length) ? headers[i].trim() : "col" + i; |
| 144 | + System.out.printf(" %-20s %6d %6d %6.2f%%%n", |
| 145 | + name, col.length, exc, 100.0 * exc / col.length); |
| 146 | + } |
| 147 | + System.out.printf(" %-20s %6d %6d %6.2f%%%n", |
| 148 | + "TOTAL", totalRows, totalExc, 100.0 * totalExc / totalRows); |
| 149 | + } |
| 150 | + |
| 151 | + @Test |
| 152 | + public void testSpotifyExceptionCounts() throws IOException { |
| 153 | + File dir = getDataDir(); |
| 154 | + assumeTrue("alp-test-data/ not found. Run from project root or set ALP_TEST_DATA_DIR", dir != null); |
| 155 | + File file = new File(dir, "floatingpoint_spotify1.csv"); |
| 156 | + assumeTrue("floatingpoint_spotify1.csv not found in " + dir, file.exists()); |
| 157 | + |
| 158 | + reportDataset("Spotify", file); |
| 159 | + |
| 160 | + // Basic sanity: Spotify data is well-behaved floats, expect < 5% exceptions overall |
| 161 | + List<double[]> columns = readCsvColumns(file); |
| 162 | + int totalExc = 0, totalRows = 0; |
| 163 | + for (double[] col : columns) { |
| 164 | + totalExc += countExceptions(col); |
| 165 | + totalRows += col.length; |
| 166 | + } |
| 167 | + double excRate = 100.0 * totalExc / totalRows; |
| 168 | + System.out.printf("%nSpotify overall exception rate: %.2f%%%n", excRate); |
| 169 | + assertTrue("Exception rate should be < 10% for Spotify data, got: " + excRate, excRate < 10.0); |
| 170 | + } |
| 171 | + |
| 172 | + @Test |
| 173 | + public void testAradeExceptionCounts() throws IOException { |
| 174 | + File dir = getDataDir(); |
| 175 | + assumeTrue("alp-test-data/ not found. Run from project root or set ALP_TEST_DATA_DIR", dir != null); |
| 176 | + File file = new File(dir, "floatingpoint_arade.csv"); |
| 177 | + assumeTrue("floatingpoint_arade.csv not found in " + dir, file.exists()); |
| 178 | + |
| 179 | + reportDataset("Arade", file); |
| 180 | + |
| 181 | + List<double[]> columns = readCsvColumns(file); |
| 182 | + int totalExc = 0, totalRows = 0; |
| 183 | + for (double[] col : columns) { |
| 184 | + totalExc += countExceptions(col); |
| 185 | + totalRows += col.length; |
| 186 | + } |
| 187 | + double excRate = 100.0 * totalExc / totalRows; |
| 188 | + System.out.printf("%nArade overall exception rate: %.2f%%%n", excRate); |
| 189 | + assertTrue("Exception rate should be < 10% for Arade data, got: " + excRate, excRate < 10.0); |
| 190 | + } |
| 191 | + |
| 192 | + @Test |
| 193 | + public void testAllDatasetsExceptionCounts() throws IOException { |
| 194 | + File dir = getDataDir(); |
| 195 | + assumeTrue("alp-test-data/ not found. Run from project root or set ALP_TEST_DATA_DIR", dir != null); |
| 196 | + |
| 197 | + File[] csvFiles = dir.listFiles((d, name) -> name.startsWith("floatingpoint_") && name.endsWith(".csv")); |
| 198 | + assumeTrue("No floatingpoint_*.csv files found in " + dir, |
| 199 | + csvFiles != null && csvFiles.length > 0); |
| 200 | + |
| 201 | + System.out.printf("%n=== All Datasets Summary ===%n"); |
| 202 | + System.out.printf(" %-30s %6s %6s %7s%n", "dataset", "rows", "exc", "exc%"); |
| 203 | + System.out.printf(" %-30s %6s %6s %7s%n", "-------", "----", "---", "----"); |
| 204 | + |
| 205 | + for (File file : csvFiles) { |
| 206 | + List<double[]> columns = readCsvColumns(file); |
| 207 | + int totalExc = 0, totalRows = 0; |
| 208 | + for (double[] col : columns) { |
| 209 | + totalExc += countExceptions(col); |
| 210 | + totalRows += col.length; |
| 211 | + } |
| 212 | + String name = file.getName().replace("floatingpoint_", "").replace(".csv", ""); |
| 213 | + System.out.printf(" %-30s %6d %6d %6.2f%%%n", |
| 214 | + name, totalRows, totalExc, 100.0 * totalExc / totalRows); |
| 215 | + } |
| 216 | + } |
| 217 | +} |
0 commit comments