Skip to content

Commit 56ce400

Browse files
Add ALP interop and exception count tests
- Rewrite TestInterOpReadAlp to use LocalInputFile instead of Hadoop FileSystem, fixing failures on Java 24+ where Subject.getSubject is removed. Tests now read C++ ALP parquet files directly without going through Hadoop security/UGI. - Add AlpExceptionCountTest with per-column exception rate reporting against the real Spotify and Arade floating-point datasets from the parquet-testing repository. Useful for comparing Java vs C++ ALP compression ratios.
1 parent 4e34a9d commit 56ce400

2 files changed

Lines changed: 372 additions & 212 deletions

File tree

Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.parquet.column.values.alp;
21+
22+
import static org.junit.Assert.assertTrue;
import static org.junit.Assume.assumeTrue;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.junit.Test;
32+
33+
/**
34+
* Measures exception counts and compression ratios using the real Spotify and Arade datasets from
35+
* the parquet-testing repository (Pratik's alpFloatingPointDataset branch).
36+
*
37+
* <p>These are the same datasets used in the meeting discussion to compare Java vs C++ ALP
38+
* exception counts. Run from the project root with the alp-test-data/ directory present:
39+
*
40+
* <pre>
41+
* ./mvnw test -pl parquet-column -Dtest=AlpExceptionCountTest
42+
* </pre>
43+
*
44+
* <p>To use a different data directory:
45+
*
46+
* <pre>
47+
* ./mvnw test -pl parquet-column -Dtest=AlpExceptionCountTest \
48+
* -DALP_TEST_DATA_DIR=/path/to/alp-test-data
49+
* </pre>
50+
*/
51+
public class AlpExceptionCountTest {
52+
53+
private static final int VECTOR_SIZE = AlpConstants.DEFAULT_VECTOR_SIZE;
54+
55+
private File getDataDir() {
56+
String dir = System.getProperty("ALP_TEST_DATA_DIR");
57+
if (dir == null) dir = System.getenv("ALP_TEST_DATA_DIR");
58+
if (dir != null && new File(dir).isDirectory()) return new File(dir);
59+
// Default: alp-test-data/ at project root (three levels up from module target)
60+
File candidate = new File(System.getProperty("user.dir")).getParentFile();
61+
if (candidate != null) {
62+
File d = new File(candidate, "alp-test-data");
63+
if (d.isDirectory()) return d;
64+
}
65+
// Also try relative to cwd
66+
File d2 = new File(System.getProperty("user.dir"), "alp-test-data");
67+
return d2.isDirectory() ? d2 : null;
68+
}
69+
70+
/** Reads a CSV file into per-column double arrays. Handles both ',' and '|' delimiters. */
71+
private List<double[]> readCsvColumns(File file) throws IOException {
72+
List<List<Double>> cols = new ArrayList<>();
73+
String delim = null;
74+
try (BufferedReader br = new BufferedReader(new FileReader(file))) {
75+
String line;
76+
boolean header = true;
77+
while ((line = br.readLine()) != null) {
78+
if (delim == null) delim = line.contains("|") ? "\\|" : ",";
79+
String[] parts = line.split(delim);
80+
if (header) {
81+
for (String ignored : parts) cols.add(new ArrayList<>());
82+
header = false;
83+
continue;
84+
}
85+
for (int i = 0; i < parts.length && i < cols.size(); i++) {
86+
try {
87+
cols.get(i).add(Double.parseDouble(parts[i].trim()));
88+
} catch (NumberFormatException e) {
89+
cols.get(i).add(0.0);
90+
}
91+
}
92+
}
93+
}
94+
List<double[]> result = new ArrayList<>();
95+
for (List<Double> col : cols) {
96+
double[] arr = new double[col.size()];
97+
for (int i = 0; i < arr.length; i++) arr[i] = col.get(i);
98+
result.add(arr);
99+
}
100+
return result;
101+
}
102+
103+
/** Reads the header row of a CSV file. */
104+
private String[] readHeader(File file) throws IOException {
105+
try (BufferedReader br = new BufferedReader(new FileReader(file))) {
106+
String line = br.readLine();
107+
if (line == null) return new String[0];
108+
String delim = line.contains("|") ? "\\|" : ",";
109+
return line.split(delim);
110+
}
111+
}
112+
113+
/** Counts total exceptions across all vectors for a double column. */
114+
private int countExceptions(double[] values) {
115+
int totalExceptions = 0;
116+
for (int offset = 0; offset < values.length; offset += VECTOR_SIZE) {
117+
int len = Math.min(VECTOR_SIZE, values.length - offset);
118+
AlpEncoderDecoder.EncodingParams params =
119+
AlpEncoderDecoder.findBestDoubleParams(values, offset, len);
120+
totalExceptions += params.numExceptions;
121+
}
122+
return totalExceptions;
123+
}
124+
125+
private void reportDataset(String label, File file) throws IOException {
126+
List<double[]> columns = readCsvColumns(file);
127+
String[] headers = readHeader(file);
128+
int rows = columns.isEmpty() ? 0 : columns.get(0).length;
129+
int numVectors = (int) Math.ceil((double) rows / VECTOR_SIZE);
130+
131+
System.out.printf("%n=== %s (%d rows, %d cols, %d vectors/col) ===%n",
132+
label, rows, columns.size(), numVectors);
133+
System.out.printf(" %-20s %6s %6s %7s%n", "column", "rows", "exc", "exc%");
134+
System.out.printf(" %-20s %6s %6s %7s%n", "------", "----", "---", "----");
135+
136+
int totalExc = 0;
137+
int totalRows = 0;
138+
for (int i = 0; i < columns.size(); i++) {
139+
double[] col = columns.get(i);
140+
int exc = countExceptions(col);
141+
totalExc += exc;
142+
totalRows += col.length;
143+
String name = (i < headers.length) ? headers[i].trim() : "col" + i;
144+
System.out.printf(" %-20s %6d %6d %6.2f%%%n",
145+
name, col.length, exc, 100.0 * exc / col.length);
146+
}
147+
System.out.printf(" %-20s %6d %6d %6.2f%%%n",
148+
"TOTAL", totalRows, totalExc, 100.0 * totalExc / totalRows);
149+
}
150+
151+
@Test
152+
public void testSpotifyExceptionCounts() throws IOException {
153+
File dir = getDataDir();
154+
assumeTrue("alp-test-data/ not found. Run from project root or set ALP_TEST_DATA_DIR", dir != null);
155+
File file = new File(dir, "floatingpoint_spotify1.csv");
156+
assumeTrue("floatingpoint_spotify1.csv not found in " + dir, file.exists());
157+
158+
reportDataset("Spotify", file);
159+
160+
// Basic sanity: Spotify data is well-behaved floats, expect < 5% exceptions overall
161+
List<double[]> columns = readCsvColumns(file);
162+
int totalExc = 0, totalRows = 0;
163+
for (double[] col : columns) {
164+
totalExc += countExceptions(col);
165+
totalRows += col.length;
166+
}
167+
double excRate = 100.0 * totalExc / totalRows;
168+
System.out.printf("%nSpotify overall exception rate: %.2f%%%n", excRate);
169+
assertTrue("Exception rate should be < 10% for Spotify data, got: " + excRate, excRate < 10.0);
170+
}
171+
172+
@Test
173+
public void testAradeExceptionCounts() throws IOException {
174+
File dir = getDataDir();
175+
assumeTrue("alp-test-data/ not found. Run from project root or set ALP_TEST_DATA_DIR", dir != null);
176+
File file = new File(dir, "floatingpoint_arade.csv");
177+
assumeTrue("floatingpoint_arade.csv not found in " + dir, file.exists());
178+
179+
reportDataset("Arade", file);
180+
181+
List<double[]> columns = readCsvColumns(file);
182+
int totalExc = 0, totalRows = 0;
183+
for (double[] col : columns) {
184+
totalExc += countExceptions(col);
185+
totalRows += col.length;
186+
}
187+
double excRate = 100.0 * totalExc / totalRows;
188+
System.out.printf("%nArade overall exception rate: %.2f%%%n", excRate);
189+
assertTrue("Exception rate should be < 10% for Arade data, got: " + excRate, excRate < 10.0);
190+
}
191+
192+
@Test
193+
public void testAllDatasetsExceptionCounts() throws IOException {
194+
File dir = getDataDir();
195+
assumeTrue("alp-test-data/ not found. Run from project root or set ALP_TEST_DATA_DIR", dir != null);
196+
197+
File[] csvFiles = dir.listFiles((d, name) -> name.startsWith("floatingpoint_") && name.endsWith(".csv"));
198+
assumeTrue("No floatingpoint_*.csv files found in " + dir,
199+
csvFiles != null && csvFiles.length > 0);
200+
201+
System.out.printf("%n=== All Datasets Summary ===%n");
202+
System.out.printf(" %-30s %6s %6s %7s%n", "dataset", "rows", "exc", "exc%");
203+
System.out.printf(" %-30s %6s %6s %7s%n", "-------", "----", "---", "----");
204+
205+
for (File file : csvFiles) {
206+
List<double[]> columns = readCsvColumns(file);
207+
int totalExc = 0, totalRows = 0;
208+
for (double[] col : columns) {
209+
totalExc += countExceptions(col);
210+
totalRows += col.length;
211+
}
212+
String name = file.getName().replace("floatingpoint_", "").replace(".csv", "");
213+
System.out.printf(" %-30s %6d %6d %6.2f%%%n",
214+
name, totalRows, totalExc, 100.0 * totalExc / totalRows);
215+
}
216+
}
217+
}

0 commit comments

Comments
 (0)