Skip to content

Commit e33b3eb

Browse files
authored
Merge pull request #1130 from ijatinydv/fix-hash-big-endian
Fix test_hash_functions failures on big-endian architectures
2 parents 4c308bf + 602a0fa commit e33b3eb

4 files changed

Lines changed: 328 additions & 6 deletions

File tree

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
name: CI - Big-Endian
2+
3+
on:
4+
# Always run on pushes to the main branch
5+
push:
6+
branches: [master, main]
7+
# For PRs, ONLY run if the hash files or this workflow change
8+
pull_request:
9+
paths:
10+
- '.github/workflows/ci_big_endian.yml'
11+
- 'src/hash/**'
12+
- 'test/hash_functions/**'
13+
# Allow manual triggers from the GitHub Actions tab
14+
workflow_dispatch:
15+
16+
env:
17+
# Intended to bound test run time, because some failures hang forever.
# NOTE(review): CTEST_TIME_TIMEOUT does not appear among the environment
# variables documented for ctest -- confirm it has any effect here, or pass
# an explicit `--timeout <seconds>` to the ctest invocation instead.
CTEST_TIME_TIMEOUT: "5" # some failures hang forever
18+
19+
jobs:
20+
test-big-endian:
21+
runs-on: ubuntu-24.04
22+
name: Test on s390x (big-endian)
23+
strategy:
24+
fail-fast: false
25+
26+
steps:
27+
- name: Checkout code
28+
uses: actions/checkout@v4
29+
30+
- name: Build and test on s390x
31+
uses: uraimo/run-on-arch-action@v3
32+
with:
33+
arch: s390x
34+
distro: ubuntu24.04
35+
36+
# Cache Docker image layer in GitHub Package Registry
37+
githubToken: ${{ github.token }}
38+
39+
# Install dependencies (cached in Docker image layer)
40+
install: |
41+
apt-get update -q -y
42+
apt-get install -q -y gfortran gcc g++ cmake python3-pip ninja-build git
43+
pip3 install --break-system-packages fypp
44+
45+
# Build and run big-endian tests
46+
run: |
47+
echo "=== Architecture Info ==="
48+
uname -m
49+
echo "Byte order: $(python3 -c 'import sys; print(sys.byteorder)')"
50+
51+
echo "=== Compiler Version ==="
52+
gfortran --version
53+
54+
echo "=== CMake Configure ==="
55+
cmake -G Ninja \
56+
-DCMAKE_BUILD_TYPE=Release \
57+
-DCMAKE_MAXIMUM_RANK:String=4 \
58+
-DFIND_BLAS:STRING=FALSE \
59+
-S . -B build
60+
61+
echo "=== Build (Targeted) ==="
62+
cmake --build build --target test_hash_functions --parallel
63+
64+
echo "=== Run Big-Endian Tests ==="
65+
ctest --test-dir build \
66+
-R hash_functions \
67+
--output-on-failure \
68+
--no-tests=error

doc/specs/stdlib_hash_procedures.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,16 @@ generating seeds for `universal_mult_hash`.
394394
All assume a two's complement sign bit, and no out of
395395
range checks.
396396

397+
**Endianness note:** Both `pengy_hash` and `spooky_hash` will produce
398+
different hash values on big-endian processors (e.g., s390x, SPARC,
399+
big-endian PowerPC) compared to little-endian processors (e.g., x86, ARM, RISC-V)
400+
for the same input bytes. The hash quality is equally good on both
401+
architectures, but the outputs are not cross-architecture portable.
402+
Do not use these hashes for cross-architecture verification (e.g.,
403+
hashing a file on an x86 system and verifying the hash on an IBM
404+
mainframe). The 32-bit hashes `nmhash32`, `nmhash32x`, and
405+
`water_hash` produce identical results on all architectures.
406+
397407
The `stdlib_hash_32bit_fnv` and `stdlib_hash_64bit_fnv`
398408
submodules each provide implementations of the FNV-1 and FNV-1A
399409
algorithms in the form of two separate overloaded functions: `FNV_1`
@@ -1366,6 +1376,9 @@ performance on long keys. It passes all the SMHasher tests, and has
13661376
no known bad seeds.
13671377
It is a *pure* function for integer arrays, and an *elemental*
13681378
function for character strings.
1379+
Note that `pengy_hash` will produce different hash values on
1380+
big-endian and little-endian processors for the same input. The hash
1381+
quality is equally good on both architectures.
13691382

13701383
##### Example
13711384

test/hash_functions/test_hash_functions.f90

Lines changed: 222 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,10 @@ subroutine collect_hash_functions(testsuite)
4141
, new_unittest("water_hash", test_water_hash) &
4242
, new_unittest("pengy_hash", test_pengy_hash) &
4343
, new_unittest("spooky_hash", test_spooky_hash) &
44+
, new_unittest("hash_determinism", test_hash_determinism) &
45+
, new_unittest("hash_distribution", test_hash_distribution) &
46+
, new_unittest("nmhash32_kat", test_nmhash32_kat) &
47+
, new_unittest("nmhash32x_kat", test_nmhash32x_kat) &
4448
]
4549

4650
end subroutine collect_hash_functions
@@ -49,10 +53,11 @@ subroutine test_little_endian(error)
4953
!> Error handling
5054
type(error_type), allocatable, intent(out) :: error
5155

52-
! Test for endianness
53-
54-
call check(error, little_endian, "The processor is not Little-Endian")
55-
if (allocated(error)) return
56+
! Skip test on big-endian systems instead of failing
57+
if (.not. little_endian) then
58+
call skip_test(error, "The processor is not Little-Endian (skipping)")
59+
return
60+
end if
5661

5762
end subroutine test_little_endian
5863

@@ -64,6 +69,15 @@ subroutine test_nmhash32(error)
6469
integer(int8) :: key_array(size_key_array)
6570
integer(int32) :: c_hash(0:size_key_array)
6671

72+
! The C reference implementation (nmhash.h) does not support
73+
! big-endian. Skip C-comparison on BE; value-correctness is
74+
! verified by the test_nmhash32_kat known-answer test instead.
75+
if (.not. little_endian) then
76+
call skip_test(error, &
77+
"NMHASH32 C-comparison skipped on Big-Endian (see KAT test)")
78+
return
79+
end if
80+
6781
call read_array("key_array.bin", key_array )
6882

6983
! Read hash array generated from key array by the C version of nmhash32
@@ -85,6 +99,15 @@ subroutine test_nmhash32x(error)
8599
integer(int8) :: key_array(size_key_array)
86100
integer(int32) :: c_hash(0:size_key_array)
87101

102+
! The C reference implementation (nmhash.h) does not support
103+
! big-endian. Skip C-comparison on BE; value-correctness is
104+
! verified by the test_nmhash32x_kat known-answer test instead.
105+
if (.not. little_endian) then
106+
call skip_test(error, &
107+
"NMHASH32X C-comparison skipped on Big-Endian (see KAT test)")
108+
return
109+
end if
110+
88111
call read_array("key_array.bin", key_array )
89112

90113
! Read hash array generated from key array by the C version of nmhash32x
@@ -162,6 +185,201 @@ subroutine test_spooky_hash(error)
162185
end subroutine test_spooky_hash
163186

164187

188+
!> Repeatability check: hashing one fixed key twice with the same seed must
!> give the same value for every hash function under test.
!> There is no byte-order guard here, so the test runs on LE and BE alike.
subroutine test_hash_determinism(error)
    !> Error handling
    type(error_type), allocatable, intent(out) :: error

    integer :: k
    integer(int8) :: key(8)
    integer(int32) :: first32, again32
    integer(int64) :: first64, again64
    integer(int64) :: first128(2), again128(2)

    ! Fixed eight-byte key holding the values 1..8
    key = [(int(k, int8), k = 1, 8)]

    ! nmhash32: two independent evaluations must agree
    first32 = nmhash32(key, nm_seed)
    again32 = nmhash32(key, nm_seed)
    call check(error, first32 == again32, "NMHASH32 not deterministic")
    if (allocated(error)) return

    ! nmhash32x
    first32 = nmhash32x(key, nm_seed)
    again32 = nmhash32x(key, nm_seed)
    call check(error, first32 == again32, "NMHASH32X not deterministic")
    if (allocated(error)) return

    ! water_hash
    first32 = water_hash(key, water_seed)
    again32 = water_hash(key, water_seed)
    call check(error, first32 == again32, "WATER_HASH not deterministic")
    if (allocated(error)) return

    ! pengy_hash (64-bit result)
    first64 = pengy_hash(key, pengy_seed)
    again64 = pengy_hash(key, pengy_seed)
    call check(error, first64 == again64, "PENGY_HASH not deterministic")
    if (allocated(error)) return

    ! spooky_hash (result held in two 64-bit words): no word may differ
    first128 = spooky_hash(key, spooky_seed)
    again128 = spooky_hash(key, spooky_seed)
    call check(error, .not. any(first128 /= again128), &
        "SPOOKY_HASH not deterministic")
    if (allocated(error)) return

end subroutine test_hash_determinism
234+
235+
!> Collision sanity check: two fixed keys that differ only in their final
!> byte must hash to different values under every hash function tested.
!> The inputs are fixed, so the outcome is the same on every run; the
!> original author estimates the chance of an accidental collision at about
!> 2^-32 for the 32-bit hashes and 2^-64 for the 64-bit hash.
!> There is no byte-order guard here, so the test runs on LE and BE alike.
subroutine test_hash_distribution(error)
    !> Error handling
    type(error_type), allocatable, intent(out) :: error

    integer :: k
    integer(int8) :: key_a(8), key_b(8)
    integer(int32) :: hash32_a, hash32_b
    integer(int64) :: hash64_a, hash64_b
    integer(int64) :: hash128_a(2), hash128_b(2)

    ! key_a holds 1..8; key_b is a copy whose last byte is 9 instead of 8
    key_a = [(int(k, int8), k = 1, 8)]
    key_b = key_a
    key_b(size(key_b)) = 9_int8

    ! nmhash32
    hash32_a = nmhash32(key_a, nm_seed)
    hash32_b = nmhash32(key_b, nm_seed)
    call check(error, hash32_a /= hash32_b, &
        "NMHASH32 same hash for different inputs")
    if (allocated(error)) return

    ! nmhash32x
    hash32_a = nmhash32x(key_a, nm_seed)
    hash32_b = nmhash32x(key_b, nm_seed)
    call check(error, hash32_a /= hash32_b, &
        "NMHASH32X same hash for different inputs")
    if (allocated(error)) return

    ! water_hash
    hash32_a = water_hash(key_a, water_seed)
    hash32_b = water_hash(key_b, water_seed)
    call check(error, hash32_a /= hash32_b, &
        "WATER_HASH same hash for different inputs")
    if (allocated(error)) return

    ! pengy_hash (64-bit result)
    hash64_a = pengy_hash(key_a, pengy_seed)
    hash64_b = pengy_hash(key_b, pengy_seed)
    call check(error, hash64_a /= hash64_b, &
        "PENGY_HASH same hash for different inputs")
    if (allocated(error)) return

    ! spooky_hash (two 64-bit words): passes unless both words match
    hash128_a = spooky_hash(key_a, spooky_seed)
    hash128_b = spooky_hash(key_b, spooky_seed)
    call check(error, .not. all(hash128_a == hash128_b), &
        "SPOOKY_HASH same hash for different inputs")
    if (allocated(error)) return

end subroutine test_hash_distribution
290+
291+
292+
!> Known-answer test (KAT) for NMHASH32.
!> Hashes prefixes of a fixed byte pattern and compares each result with a
!> stored reference value. According to the original author, the reference
!> values were computed on a little-endian platform using the upstream C
!> code, so a pass means the Fortran implementation reproduces them.
!> There is no byte-order guard here, so the test runs on LE and BE alike.
subroutine test_nmhash32_kat(error)
    !> Error handling
    type(error_type), allocatable, intent(out) :: error

    ! Number of test vectors
    integer, parameter :: num_kat = 14

    ! Key lengths to hash. The original author chose them to reach each
    ! length-dependent code path: 0, 1-4, 5-8, 9-32, 33-255 and 256+.
    integer, parameter :: kat_lengths(num_kat) = [ &
        0, 1, 2, 3, 4, 7, 8, &
        9, 32, 33, 100, 255, 256, 300 ]

    ! The key buffer only needs to hold the longest test vector
    integer, parameter :: max_len = maxval(kat_lengths)

    ! Reference NMHASH32 values, one per entry of kat_lengths (stated by the
    ! original author to use seed 0xDEADBEEF; nm_seed is defined elsewhere)
    integer(int32), parameter :: kat_expected(num_kat) = [ &
        int(z'B0D9C845', int32), int(z'D52AD23F', int32), &
        int(z'E909FDFF', int32), int(z'FF1A009C', int32), &
        int(z'097D4183', int32), int(z'55CC8BBF', int32), &
        int(z'660D67B4', int32), int(z'CB939B94', int32), &
        int(z'4CBE45F8', int32), int(z'2FD88BD0', int32), &
        int(z'83AC6B02', int32), int(z'CC0E4E26', int32), &
        int(z'567D6B58', int32), int(z'865F0BC9', int32) ]

    integer(int8) :: key(max_len)
    integer :: i, byte
    integer(int32) :: got
    character(len=64) :: msg

    ! Deterministic key: byte i has the unsigned value iand(i, 255).
    ! Values 128..255 are not representable in int8, and converting them
    ! with int(..., int8) is not portable, so map them explicitly to the
    ! two's-complement value that has the same bit pattern.
    do i = 1, max_len
        byte = iand(i, 255)
        if (byte > 127) byte = byte - 256
        key(i) = int(byte, int8)
    end do

    do i = 1, num_kat
        got = nmhash32(key(1:kat_lengths(i)), nm_seed)
        ! Name the failing vector so a mismatch can be traced to one length
        write(msg, '(a, i0)') "NMHASH32 KAT failed for key length ", kat_lengths(i)
        call check(error, got == kat_expected(i), trim(msg))
        if (allocated(error)) return
    end do

end subroutine test_nmhash32_kat
338+
339+
!> Known-answer test (KAT) for NMHASH32X.
!> Hashes prefixes of a fixed byte pattern and compares each result with a
!> stored reference value. According to the original author, the reference
!> values were computed on a little-endian platform.
!> There is no byte-order guard here, so the test runs on LE and BE alike.
subroutine test_nmhash32x_kat(error)
    !> Error handling
    type(error_type), allocatable, intent(out) :: error

    ! Number of test vectors
    integer, parameter :: num_kat = 14

    ! Key lengths to hash, chosen by the original author to reach each
    ! length-dependent code path
    integer, parameter :: kat_lengths(num_kat) = [ &
        0, 1, 2, 3, 4, 7, 8, &
        9, 32, 33, 100, 255, 256, 300 ]

    ! The key buffer only needs to hold the longest test vector
    integer, parameter :: max_len = maxval(kat_lengths)

    ! Reference NMHASH32X values, one per entry of kat_lengths (stated by the
    ! original author to use seed 0xDEADBEEF; nm_seed is defined elsewhere)
    integer(int32), parameter :: kat_expected(num_kat) = [ &
        int(z'76844735', int32), int(z'B7AE2C90', int32), &
        int(z'EE2224FD', int32), int(z'BBE39609', int32), &
        int(z'08467EE3', int32), int(z'10E572DA', int32), &
        int(z'2570CFA8', int32), int(z'1A06128A', int32), &
        int(z'EABBF1B8', int32), int(z'9B1B3428', int32), &
        int(z'F6F0233D', int32), int(z'7EB7CAFC', int32), &
        int(z'B34D6C45', int32), int(z'E89BEE9E', int32) ]

    integer(int8) :: key(max_len)
    integer :: i, byte
    integer(int32) :: got
    character(len=64) :: msg

    ! Deterministic key: byte i has the unsigned value iand(i, 255).
    ! Values 128..255 are not representable in int8, and converting them
    ! with int(..., int8) is not portable, so map them explicitly to the
    ! two's-complement value that has the same bit pattern.
    do i = 1, max_len
        byte = iand(i, 255)
        if (byte > 127) byte = byte - 256
        key(i) = int(byte, int8)
    end do

    do i = 1, num_kat
        got = nmhash32x(key(1:kat_lengths(i)), nm_seed)
        ! Name the failing vector so a mismatch can be traced to one length
        write(msg, '(a, i0)') "NMHASH32X KAT failed for key length ", kat_lengths(i)
        call check(error, got == kat_expected(i), trim(msg))
        if (allocated(error)) return
    end do

end subroutine test_nmhash32x_kat
381+
382+
165383
subroutine generate_key_array()
166384

167385
integer :: i, lun

test/hash_functions/waterhash.h

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,16 @@
1010
#include <string.h>
1111
#include <stdint.h>
1212
#include <math.h>
13+
14+
/* Endian detection: broaden beyond just __BYTE_ORDER__ to cover all major BE
15+
* toolchains (IBM XL, older GCC, etc.) — mirrors nmhash.h's approach.
* WATERHASH_BIG_ENDIAN becomes 1 when __BIG_ENDIAN__ is defined or when
* __BYTE_ORDER__ equals __ORDER_BIG_ENDIAN__, and 0 otherwise. A compiler
* that defines neither macro is therefore treated as little-endian. The
* #ifndef guard lets a build pre-define WATERHASH_BIG_ENDIAN (for example
* with -DWATERHASH_BIG_ENDIAN=1) to override the detection. */
16+
#ifndef WATERHASH_BIG_ENDIAN
17+
# if defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
18+
# define WATERHASH_BIG_ENDIAN 1
19+
# else
20+
# define WATERHASH_BIG_ENDIAN 0
21+
# endif
22+
#endif
1323
const uint64_t _waterp0 = 0xa0761d65ull, _waterp1 = 0xe7037ed1ull, _waterp2 = 0x8ebc6af1ull;
1424
const uint64_t _waterp3 = 0x589965cdull, _waterp4 = 0x1d8e4e27ull, _waterp5 = 0xeb44accbull;
1525

@@ -19,8 +29,21 @@ static inline uint64_t _watermum(const uint64_t A, const uint64_t B) {
1929
}
2030

2131
static inline uint64_t _waterr08(const uint8_t *p){ uint8_t v; memcpy(&v, p, 1); return v; }
22-
static inline uint64_t _waterr16(const uint8_t *p){ uint16_t v; memcpy(&v, p, 2); return v; }
23-
static inline uint64_t _waterr32(const uint8_t *p){ uint32_t v; memcpy(&v, p, 4); return v; }
32+
/* Read the two bytes at p as an unsigned little-endian 16-bit value.
 * The value is assembled byte by byte, so the result is the same on any host
 * byte order and does not depend on compile-time endianness detection.
 * p must point to at least 2 readable bytes; no alignment is required. */
static inline uint64_t _waterr16(const uint8_t *p){
  return (uint64_t)p[0] | ((uint64_t)p[1] << 8);
}
39+
/* Read the four bytes at p as an unsigned little-endian 32-bit value.
 * The value is assembled byte by byte, so the result is the same on any host
 * byte order and does not depend on compile-time endianness detection.
 * Each byte is widened to uint64_t before shifting so that the top byte
 * cannot be sign-extended. p must point to at least 4 readable bytes; no
 * alignment is required. */
static inline uint64_t _waterr32(const uint8_t *p){
  return  (uint64_t)p[0]
       | ((uint64_t)p[1] << 8)
       | ((uint64_t)p[2] << 16)
       | ((uint64_t)p[3] << 24);
}
2447
static inline uint32_t waterhash(const void* key, uint32_t len, uint64_t seed){
2548
const uint8_t *p = (const uint8_t*)key;
2649
uint32_t i;

0 commit comments

Comments
 (0)