diff --git a/.gitignore b/.gitignore index 7d6d944acf2..feedc3461f9 100644 --- a/.gitignore +++ b/.gitignore @@ -54,5 +54,5 @@ e2e/playwright-report e2e/test-results .aider* /tools/server/.lwjgl/ -/tools/server/.lwjgl/ .m2_repo/ +.serena/ diff --git a/AGENTS.md b/AGENTS.md index 1a43fca9b46..33fde3d827b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -59,6 +59,24 @@ It is illegal to `-q` when running tests! --- +## PIOSEE Decision Model (Adopted) + +Use PIOSEE on every task to structure thinking and execution. It complements the routines below and ties directly into the Traceability trio (Description, Evidence, Plan). + +- Problem: restate the task in one sentence, note constraints/timebox, and identify likely routine (A/B/C). +- Information: inspect modules and AGENTS.md, gather environment constraints, locate existing tests/reports, and search code to localize the work. +- Options: list 2–3 viable approaches (routine choice, test scope, fix location) and weigh them with the Proportionality Model. +- Select: choose one option and routine; update the Living Plan with exactly one `in_progress` step. +- Execute: follow the Working Loop and house rules; for Routine A add the smallest failing test first; capture an Evidence block after each grouped action. +- Evaluate: check against the Definition of Done; if gaps remain, adjust the plan or change routine; record final Evidence and a brief retrospective. + +PIOSEE → Traceability trio mapping +- P/I/O → Description +- S → Plan (one `in_progress`) +- E/E → Evidence and Verification + +For documentation‑only edits and other Routine B cases, still run PIOSEE briefly to confirm neutrality and reversibility. + ## Proportionality Model (Think before you test) Score the change on these lenses. If any are **High**, prefer **Routine A**. @@ -342,6 +360,7 @@ It is illegal to `-q` when running tests! ## Working Loop +* **PIOSEE first:** restate Problem, gather Information, list Options; then Select, Execute, Evaluate. 
* **Plan:** small, verifiable steps; keep one `in_progress`. * **Change:** minimal, surgical edits; keep style/structure consistent. * **Format:** `mvn -o -Dmaven.repo.local=.m2_repo -q -T 2C formatter:format impsort:sort xml-format:xml-format` @@ -506,6 +525,7 @@ Do **not** modify existing headers’ years. * **Files touched:** list file paths. * **Commands run:** key build/test commands. * **Verification:** which tests passed, where you checked reports. +* **PIOSEE trace (concise):** P/I/O summary, selected option/routine, key evaluate outcomes. * **Evidence:** *Routine A:* failing output (pre‑fix) and passing output (post‑fix). *Routine B:* pre‑ and post‑green snippets from the **same selection** + **Hit Proof**. diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/NativeSailStore.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/NativeSailStore.java index 91414f78c18..1c88be4e601 100644 --- a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/NativeSailStore.java +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/NativeSailStore.java @@ -37,7 +37,6 @@ import org.eclipse.rdf4j.model.Value; import org.eclipse.rdf4j.model.ValueFactory; import org.eclipse.rdf4j.query.algebra.evaluation.impl.EvaluationStatistics; -import org.eclipse.rdf4j.sail.Sail; import org.eclipse.rdf4j.sail.SailException; import org.eclipse.rdf4j.sail.base.BackingSailSource; import org.eclipse.rdf4j.sail.base.Changeset; diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/ValueStore.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/ValueStore.java index 37787ac610c..3f1e72c3d79 100644 --- a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/ValueStore.java +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/ValueStore.java @@ -35,6 +35,7 @@ import org.eclipse.rdf4j.model.vocabulary.XSD; import 
org.eclipse.rdf4j.sail.SailException; import org.eclipse.rdf4j.sail.nativerdf.datastore.DataStore; +import org.eclipse.rdf4j.sail.nativerdf.datastore.RecoveredDataException; import org.eclipse.rdf4j.sail.nativerdf.model.CorruptIRI; import org.eclipse.rdf4j.sail.nativerdf.model.CorruptIRIOrBNode; import org.eclipse.rdf4j.sail.nativerdf.model.CorruptLiteral; @@ -145,7 +146,7 @@ public ValueStore(File dataDir, boolean forceSync) throws IOException { public ValueStore(File dataDir, boolean forceSync, int valueCacheSize, int valueIDCacheSize, int namespaceCacheSize, int namespaceIDCacheSize) throws IOException { super(); - dataStore = new DataStore(dataDir, FILENAME_PREFIX, forceSync); + dataStore = new DataStore(dataDir, FILENAME_PREFIX, forceSync, this); valueCache = new ConcurrentCache<>(valueCacheSize); valueIDCache = new ConcurrentCache<>(valueIDCacheSize); @@ -194,15 +195,31 @@ public NativeValue getValue(int id) throws IOException { NativeValue resultValue = valueCache.get(cacheID); if (resultValue == null) { - // Value not in cache, fetch it from file - byte[] data = dataStore.getData(id); - - if (data != null) { - resultValue = data2value(id, data); - - if (!(resultValue instanceof CorruptValue)) { - // Store value in cache - valueCache.put(cacheID, resultValue); + try { + // Value not in cache, fetch it from file + byte[] data = dataStore.getData(id); + if (data != null) { + resultValue = data2value(id, data); + if (!(resultValue instanceof CorruptValue)) { + // Store value in cache + valueCache.put(cacheID, resultValue); + } + } + } catch (RecoveredDataException rde) { + byte[] recovered = rde.getData(); + if (recovered != null && recovered.length > 0) { + byte t = recovered[0]; + if (t == URI_VALUE) { + resultValue = new CorruptIRI(revision, id, null, recovered); + } else if (t == BNODE_VALUE) { + resultValue = new CorruptIRIOrBNode(revision, id, recovered); + } else if (t == LITERAL_VALUE) { + resultValue = new CorruptLiteral(revision, id, recovered); + } 
else { + resultValue = new CorruptUnknownValue(revision, id, recovered); + } + } else { + resultValue = new CorruptUnknownValue(revision, id, recovered); } } } @@ -434,21 +451,30 @@ public void close() throws IOException { public void checkConsistency() throws SailException, IOException { int maxID = dataStore.getMaxID(); for (int id = 1; id <= maxID; id++) { - byte[] data = dataStore.getData(id); - if (isNamespaceData(data)) { - String namespace = data2namespace(data); - try { - if (id == getNamespaceID(namespace, false) - && java.net.URI.create(namespace + "part").isAbsolute()) { - continue; + try { + byte[] data = dataStore.getData(id); + if (isNamespaceData(data)) { + String namespace = data2namespace(data); + try { + if (id == getNamespaceID(namespace, false) + && java.net.URI.create(namespace + "part").isAbsolute()) { + continue; + } + } catch (IllegalArgumentException e) { + // throw SailException + } + throw new SailException( + "Store must be manually exported and imported to fix namespaces like " + namespace); + } else { + Value value = this.data2value(id, data); + if (id != this.getID(copy(value))) { + throw new SailException( + "Store must be manually exported and imported to merge values like " + value); } - } catch (IllegalArgumentException e) { - // throw SailException } - throw new SailException( - "Store must be manually exported and imported to fix namespaces like " + namespace); - } else { - Value value = this.data2value(id, data); + } catch (RecoveredDataException rde) { + // Treat as a corrupt unknown value during consistency check + Value value = new CorruptUnknownValue(revision, id, rde.getData()); if (id != this.getID(copy(value))) { throw new SailException( "Store must be manually exported and imported to merge values like " + value); @@ -584,7 +610,8 @@ private boolean isNamespaceData(byte[] data) { return data[0] != URI_VALUE && data[0] != BNODE_VALUE && data[0] != LITERAL_VALUE; } - private NativeValue data2value(int id, byte[] data) 
throws IOException { + @InternalUseOnly + public NativeValue data2value(int id, byte[] data) throws IOException { if (data.length == 0) { if (SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES) { logger.error("Soft fail on corrupt data: Empty data array for value with id {}", id); @@ -704,8 +731,12 @@ private String getNamespace(int id) throws IOException { String namespace = namespaceCache.get(cacheID); if (namespace == null) { - byte[] namespaceData = dataStore.getData(id); - namespace = data2namespace(namespaceData); + try { + byte[] namespaceData = dataStore.getData(id); + namespace = data2namespace(namespaceData); + } catch (RecoveredDataException rde) { + namespace = data2namespace(rde.getData()); + } namespaceCache.put(cacheID, namespace); } @@ -829,13 +860,18 @@ public static void main(String[] args) throws Exception { int maxID = valueStore.dataStore.getMaxID(); for (int id = 1; id <= maxID; id++) { - byte[] data = valueStore.dataStore.getData(id); - if (valueStore.isNamespaceData(data)) { - String ns = valueStore.data2namespace(data); - System.out.println("[" + id + "] " + ns); - } else { - Value value = valueStore.data2value(id, data); - System.out.println("[" + id + "] " + value.toString()); + try { + byte[] data = valueStore.dataStore.getData(id); + if (valueStore.isNamespaceData(data)) { + String ns = valueStore.data2namespace(data); + System.out.println("[" + id + "] " + ns); + } else { + Value value = valueStore.data2value(id, data); + System.out.println("[" + id + "] " + value.toString()); + } + } catch (RecoveredDataException rde) { + System.out.println("[" + id + "] CorruptUnknownValue:" + + new CorruptUnknownValue(valueStore.revision, id, rde.getData())); } } } diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/datastore/DataFile.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/datastore/DataFile.java index 73e9c349de7..550954750a7 100644 --- 
a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/datastore/DataFile.java +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/datastore/DataFile.java @@ -117,6 +117,30 @@ public File getFile() { return nioFile.getFile(); } + /** + * Returns the current file size (after flushing any pending writes). + */ + public long getFileSize() throws IOException { + flush(); + return nioFileSize; + } + + /** + * Attempts to recover data bytes between two known entry offsets when the length field at {@code startOffset} is + * corrupt (e.g., zero). This returns up to {@code endOffset - startOffset - 4} bytes starting after the length + * field, capped to a reasonable maximum to avoid large allocations. + */ + public byte[] tryRecoverBetweenOffsets(long startOffset, long endOffset) throws IOException { + flush(); + if (endOffset <= startOffset + 4) { + return new byte[0]; + } + long available = endOffset - (startOffset + 4); + int cap = 32 * 1024 * 1024; // 32MB cap for recovery + int toRead = (int) Math.min(Math.max(available, 0), cap); + return nioFile.readBytes(startOffset + 4L, toRead); + } + /** * Stores the specified data and returns the byte-offset at which it has been stored. 
/**
 * Creates a data store in the same way as {@link #DataStore(File, String, boolean)} and additionally keeps a
 * reference to the supplied {@link ValueStore}.
 * <p>
 * The reference is consulted by {@link #getData(int)} while it recovers a corrupt entry: when it is non-null the
 * neighbouring entries are decoded through it for the log output, otherwise their raw bytes are logged as UTF-8
 * text. Passing {@code null} is therefore permitted.
 *
 * @param dataDir    directory that holds the data store files
 * @param filePrefix file name prefix shared by the data store files
 * @param forceSync  forwarded unchanged to the three-argument constructor
 * @param valueStore value store used to render neighbouring entries in recovery log messages; may be {@code null}
 * @throws IOException if the three-argument constructor fails
 */
public DataStore(File dataDir, String filePrefix, boolean forceSync, ValueStore valueStore) throws IOException {
	this(dataDir, filePrefix, forceSync);
	this.valueStore = valueStore;
}
idFile.getOffsetNoCache(id); + if (offset != offsetNoCache) { + logger.error("IDFile cache mismatch for id {}: cached={}, raw={}. Using raw.", id, offset, + offsetNoCache); + offset = offsetNoCache; + data = dataFile.getData(offset); + } + } catch (IOException e) { + // If raw read fails, keep cached offset + } + + // Attempt recovery by using neighboring offsets to infer the bounds + long startData = offset + 4; // default start if no previous valid entry + // Find previous entry end: prevOffset + 4 + prevLength + int prev = id - 1; + for (; prev >= 1; prev--) { + long po = idFile.getOffset(prev); + try { + long poRaw = idFile.getOffsetNoCache(prev); + if (po != poRaw) { + logger.error("IDFile cache mismatch for prev id {}: cached={}, raw={}. Using raw.", prev, + po, poRaw); + po = poRaw; + } + } catch (IOException e) { + // use cached po if raw read fails + } + if (po > 0L) { + try { + byte[] prevData = dataFile.getData(po); + if (prevData != null && prevData.length > 0) { + try { + if (valueStore != null && Thread.currentThread().getStackTrace().length < 512) { + NativeValue nativeValue = valueStore.data2value(prev, prevData); + logger.warn("Data in previous ID ({}) is: {}", prev, nativeValue); + } else { + logger.warn("Data in previous ID ({}) is: {}", prev, + new String(prevData, StandardCharsets.UTF_8)); + } + } catch (Exception ignored) { + } + startData = po + 4L + prevData.length; + break; + } + } catch (Exception ignored) { + } + } + } + + // Find next entry start as the end bound + long endOffset = 0L; + int maxId = idFile.getMaxID(); + int next = id + 1; + for (; next <= maxId; next++) { + long no = idFile.getOffset(next); + try { + long noRaw = idFile.getOffsetNoCache(next); + if (no != noRaw) { + logger.error("IDFile cache mismatch for next id {}: cached={}, raw={}. 
Using raw.", next, + no, noRaw); + no = noRaw; + } + } catch (IOException e) { + // use cached value if raw read fails + } + if (no > 0L) { + + try { + byte[] nextData = dataFile.getData(no); + if (nextData != null && nextData.length > 0) { + try { + if (valueStore != null && Thread.currentThread().getStackTrace().length < 512) { + NativeValue nativeValue = valueStore.data2value(next, nextData); + logger.warn("Data in next ID ({}) is: {}", next, nativeValue); + } else { + logger.warn("Data in next ID ({}) is: {}", next, + new String(nextData, StandardCharsets.UTF_8)); + } + } catch (Exception ignored) { + } + endOffset = no; + break; + } + } catch (Exception e) { + } + + } + } + if (endOffset == 0L) { + // Fallback: use current file size as end bound + endOffset = dataFile.getFileSize(); + } + if (endOffset > startData) { + // tryRecoverBetweenOffsets expects an offset to a 4-byte length, so pass (startData - 4) + byte[] recovered = dataFile.tryRecoverBetweenOffsets(Math.max(0L, startData - 4L), endOffset); + throw new RecoveredDataException(id, recovered); + } + } + return data; } return null; diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/datastore/IDFile.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/datastore/IDFile.java index 2c7814e25bd..805ce7382cf 100644 --- a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/datastore/IDFile.java +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/datastore/IDFile.java @@ -230,6 +230,19 @@ public long getOffset(int id) throws IOException { return nioFile.readLong(ITEM_SIZE * id); } + /** + * Gets the offset directly from the underlying file, bypassing any caches. Useful for validating cached results + * when diagnosing corruption. + * + * @param id The ID to get the offset for, must be larger than 0. 
+ * @return the raw offset stored for the id + * @throws IOException if an I/O error occurs + */ + public long getOffsetNoCache(int id) throws IOException { + assert id > 0 : "id must be larger than 0, is: " + id; + return nioFile.readLong(ITEM_SIZE * id); + } + /** * Discards all stored data. * diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/datastore/RecoveredDataException.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/datastore/RecoveredDataException.java new file mode 100644 index 00000000000..aaa8f97b3af --- /dev/null +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/datastore/RecoveredDataException.java @@ -0,0 +1,39 @@ +/******************************************************************************* + * Copyright (c) 2025 Eclipse RDF4J contributors. + * + * All rights reserved. This program and the accompanying materials + * are made available under the terms of the Eclipse Distribution License v1.0 + * which accompanies this distribution, and is available at + * http://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + *******************************************************************************/ +package org.eclipse.rdf4j.sail.nativerdf.datastore; + +import java.io.IOException; + +/** + * Signals that data for a given id was recovered heuristically (e.g., by inferring the length from neighboring + * offsets). Carries the recovered bytes to enable callers to construct a CorruptValue. 
/**
 * Signals that data for a given id was recovered heuristically (e.g., by inferring the length from neighboring
 * offsets). Carries the recovered bytes to enable callers to construct a CorruptValue.
 * <p>
 * The payload is copied on the way in and on the way out, so neither the code that threw the exception nor any
 * handler can alter the bytes seen by another handler.
 */
public class RecoveredDataException extends IOException {

	private static final long serialVersionUID = 1L;

	private final int id;
	private final byte[] data;

	/**
	 * @param id   the id whose entry had to be recovered
	 * @param data the recovered bytes; may be {@code null} or empty when nothing could be read
	 */
	public RecoveredDataException(int id, byte[] data) {
		super("Recovered data for id " + id + " using neighboring offsets");
		this.id = id;
		this.data = copyOrNull(data);
	}

	/**
	 * @return the id whose entry had to be recovered
	 */
	public int getId() {
		return id;
	}

	/**
	 * @return a fresh copy of the recovered bytes, or {@code null} if none were supplied
	 */
	public byte[] getData() {
		return copyOrNull(data);
	}

	/** Null-tolerant array copy used to keep the payload isolated from callers. */
	private static byte[] copyOrNull(byte[] bytes) {
		return bytes == null ? null : bytes.clone();
	}
}
present, we got a clean decode + if (utf8.indexOf('\uFFFD') < 0 && !utf8.trim().isEmpty()) { + return "CORRUPT_ID_" + getInternalID() + "_" + UrlEscapers.urlPathSegmentEscaper().escape(utf8); + } + } catch (Throwable ignored) { + // fall through to recovery strategies + } + } + + // 2) Try to narrow down to a valid UTF-8 decodable substring (avoid replacement char) + String recoveredUtf8 = null; + int bestByteLen = 0; + for (int start = 0; start < data.length; start++) { + for (int end = data.length; end > start; end--) { + int candidateLen = end - start; + if (candidateLen <= bestByteLen) { + break; // can't beat current best + } + try { + String s = new String(data, start, candidateLen, StandardCharsets.UTF_8); + if (s.indexOf('\uFFFD') < 0) { + recoveredUtf8 = s; + bestByteLen = candidateLen; + break; // no need to try smaller end for this start + } + } catch (Throwable ignored) { + // continue scanning + } + } + } + if (recoveredUtf8 != null && !recoveredUtf8.trim().isEmpty()) { + return "CORRUPT_ID_" + getInternalID() + "_" + + UrlEscapers.urlPathSegmentEscaper().escape(recoveredUtf8); + } + + // 3) Try ASCII: find the longest contiguous run of printable US-ASCII bytes and use that + int bestAsciiStart = -1; + int bestAsciiLen = 0; + int i = 0; + while (i < data.length) { + // printable ASCII range 0x20 (space) to 0x7E (~) + if (data[i] >= 0x20 && data[i] <= 0x7E) { + int runStart = i; + while (i < data.length && data[i] >= 0x20 && data[i] <= 0x7E) { + i++; + } + int runLen = i - runStart; + if (runLen > bestAsciiLen) { + bestAsciiLen = runLen; + bestAsciiStart = runStart; + } + } else { + i++; + } + } + if (bestAsciiLen > 0) { + String ascii = new String(data, bestAsciiStart, bestAsciiLen, StandardCharsets.US_ASCII); + if (!ascii.trim().isEmpty()) { + return "CORRUPT_ID_" + getInternalID() + "_" + UrlEscapers.urlPathSegmentEscaper().escape(ascii); + } + } + + // 4) Fallback: hex-encode the entire raw data + return "CORRUPT_ID_" + getInternalID() + "_HEX_" + + 
Hex.encodeHexString(Arrays.copyOfRange(data, 0, data.length)); } - return "CORRUPT"; + return "CORRUPT_ID_" + getInternalID(); } @Override diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptLiteral.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptLiteral.java index eb6b2587c25..21ef57bbbe6 100644 --- a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptLiteral.java +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptLiteral.java @@ -14,10 +14,12 @@ import java.math.BigDecimal; import java.math.BigInteger; import java.nio.charset.StandardCharsets; +import java.util.Arrays; import java.util.Optional; import javax.xml.datatype.XMLGregorianCalendar; +import org.apache.commons.codec.binary.Hex; import org.eclipse.rdf4j.model.IRI; import org.eclipse.rdf4j.model.Literal; import org.eclipse.rdf4j.model.base.CoreDatatype; @@ -42,21 +44,97 @@ public CorruptLiteral(ValueStoreRevision revision, int internalID, byte[] data) super(revision, internalID, data); } + @Override public String stringValue() { - return "CorruptLiteral_with_ID_" + getInternalID(); + return "CorruptLiteral_with_ID_" + getInternalID() + ": " + getLabel(); } @Override public String getLabel() { byte[] data = getData(); try { - if (data != null && data.length < 1024) { - return "CorruptUnknownValue with ID " + getInternalID() + " with possible data: " - + new String(data, StandardCharsets.UTF_8); + if (data != null && data.length > 0) { + // check if all bytes are zero + boolean allZero = true; + for (byte b : data) { + if (b != 0) { + allZero = false; + break; + } + } + + if (allZero) { + return "All " + data.length + " data bytes are 0x00"; + } + + String prefix = this.getClass().getSimpleName() + " with ID " + getInternalID() + + " with possible data: "; + + data = truncateData(data); + + // 1) Try full UTF-8 decode of the slice + try { + String utf8 = new 
String(data, StandardCharsets.UTF_8); + if (utf8.indexOf('\uFFFD') < 0) { + return prefix + utf8; + } + } catch (Throwable ignored) { + } + + // 2) Longest clean UTF-8 substring + String recoveredUtf8 = null; + int bestLen = 0; + for (int start = 0; start < data.length; start++) { + for (int end = data.length; end > start; end--) { + int len = end - start; + if (len <= bestLen) { + break; + } + try { + String s = new String(data, start, len, StandardCharsets.UTF_8); + if (s.indexOf('\uFFFD') < 0) { + recoveredUtf8 = s; + bestLen = len; + break; + } + } catch (Throwable ignored) { + } + } + } + if (recoveredUtf8 != null && !recoveredUtf8.isEmpty()) { + return prefix + recoveredUtf8; + } + + // 3) Longest contiguous printable ASCII run in slice + int bestAsciiStart = -1; + int bestAsciiLen = 0; + int i = 0; + while (i < data.length) { + if (data[i] >= 0x20 && data[i] <= 0x7E) { + int runStart = i; + while (i < data.length && data[i] >= 0x20 && data[i] <= 0x7E) { + i++; + } + int runLen = i - runStart; + if (runLen > bestAsciiLen) { + bestAsciiLen = runLen; + bestAsciiStart = runStart; + } + } else { + i++; + } + } + if (bestAsciiLen > 0) { + String ascii = new String(data, bestAsciiStart, bestAsciiLen, StandardCharsets.US_ASCII); + return prefix + ascii; + } + + // 4) Fallback: hex encode only up to sentinel data.length + return prefix + Hex.encodeHexString(Arrays.copyOfRange(data, 0, data.length)); } } catch (Throwable ignored) { } - return "CorruptUnknownValue_with_ID_" + getInternalID(); + return this.getClass().getSimpleName() + " with ID " + getInternalID(); } @Override diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptUnknownValue.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptUnknownValue.java index ea200b55fa5..ce7a9dde53a 100644 --- a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptUnknownValue.java +++ 
/**
 * Creates a corrupt value of unknown type and precomputes a hex rendering of its raw bytes.
 *
 * @param revision   the value store revision, passed to the superclass
 * @param internalID the internal ID, passed to the superclass
 * @param data       the raw bytes, passed to the superclass; only the first 2048 bytes are hex-encoded, and
 *                   {@code null} results in an empty hex string
 */
public CorruptUnknownValue(ValueStoreRevision revision, int internalID, byte[] data) {
	super(revision, internalID, data);
	if (data == null) {
		this.hex = "";
	} else {
		byte[] head = data.length > 2048 ? Arrays.copyOf(data, 2048) : data;
		this.hex = Hex.encodeHexString(head);
	}
}
// continue with recovery paths + } + + // 2) Longest clean UTF-8 substring (no replacement char) + String recoveredUtf8 = null; + int bestLen = 0; + for (int start = 0; start < data.length; start++) { + for (int end = data.length; end > start; end--) { + int len = end - start; + if (len <= bestLen) { + break; // can't beat best + } + try { + String s = new String(data, start, len, StandardCharsets.UTF_8); + if (s.indexOf('\uFFFD') < 0) { + recoveredUtf8 = s; + bestLen = len; + break; // shorter end won't beat this start + } + } catch (Throwable ignored) { + // keep scanning + } + } + } + if (recoveredUtf8 != null && !recoveredUtf8.trim().isEmpty()) { + return prefix + recoveredUtf8; + } + + // 3) Longest contiguous printable ASCII run + int bestAsciiStart = -1; + int bestAsciiLen = 0; + int i = 0; + while (i < data.length) { + if (data[i] >= 0x20 && data[i] <= 0x7E) { + int runStart = i; + while (i < data.length && data[i] >= 0x20 && data[i] <= 0x7E) { + i++; + } + int runLen = i - runStart; + if (runLen > bestAsciiLen) { + bestAsciiLen = runLen; + bestAsciiStart = runStart; + } + } else { + i++; + } + } + if (bestAsciiLen > 0) { + String ascii = new String(data, bestAsciiStart, bestAsciiLen, StandardCharsets.US_ASCII); + if (!ascii.trim().isEmpty()) { + return prefix + ascii; + } + } + + // 4) Fallback to hex of full data + return prefix + "COULD NOT DECODE. 
SHOWING HEX: " + + Hex.encodeHexString(Arrays.copyOfRange(data, 0, data.length)); } } catch (Throwable ignored) { } - return "CorruptUnknownValue_with_ID_" + getInternalID(); + return this.getClass().getSimpleName() + " with ID " + getInternalID(); } @Override @@ -137,4 +231,9 @@ && getValueStoreRevision().equals(otherCorruptValue.getValueStoreRevision())) { return super.equals(o); } + @Override + public String toString() { + return getLabel(); + } + } diff --git a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptValue.java b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptValue.java index 94028b5c579..db4c1834bdb 100644 --- a/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptValue.java +++ b/core/sail/nativerdf/src/main/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptValue.java @@ -86,4 +86,35 @@ public boolean equals(Object o) { return super.equals(o); } + static byte[] truncateData(byte[] data) { + int offset = data.length - 1; + int limit = data.length; + // Only consider 0x00 0x00 0x00 AFTER a non-zero byte has been seen + for (int j = 0; j < data.length; j++) { + if (data[j] != 0) { + offset = j; + break; + } + } + + for (int j = offset; j + 2 < data.length; j++) { + if (data[j] == 0x00 && data[j + 1] == 0x00 && data[j + 2] == 0x00) { + limit = j; + break; + } + } + + byte[] truncated = new byte[limit - offset]; + System.arraycopy(data, offset, truncated, 0, limit - offset); + data = truncated; + + // truncate data to first 2048 bytes + if (data.length > 2048) { + truncated = new byte[2048]; + System.arraycopy(data, 0, truncated, 0, 2048); + data = truncated; + } + return data; + } + } diff --git a/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/NativeStoreRDFStarRejectionTest.java b/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/NativeStoreRDFStarRejectionTest.java index e44546cf185..b249a0db5cf 100644 --- 
a/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/NativeStoreRDFStarRejectionTest.java
+++ b/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/NativeStoreRDFStarRejectionTest.java
@@ -16,7 +16,6 @@
 import java.io.File;
 import java.nio.charset.StandardCharsets;
 
-import org.eclipse.rdf4j.common.transaction.IsolationLevel;
 import org.eclipse.rdf4j.common.transaction.IsolationLevels;
 import org.eclipse.rdf4j.model.Statement;
 import org.eclipse.rdf4j.repository.Repository;
diff --git a/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/datastore/DataStoreRecoveryTest.java b/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/datastore/DataStoreRecoveryTest.java
new file mode 100644
index 00000000000..1c2a9ffab98
--- /dev/null
+++ b/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/datastore/DataStoreRecoveryTest.java
@@ -0,0 +1,80 @@
+/*******************************************************************************
+ * Copyright (c) 2025 Eclipse RDF4J contributors.
+ *
+ * All rights reserved. This program and the accompanying materials
+ * are made available under the terms of the Eclipse Distribution License v1.0
+ * which accompanies this distribution, and is available at
+ * http://www.eclipse.org/org/documents/edl-v10.php.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *******************************************************************************/
+package org.eclipse.rdf4j.sail.nativerdf.datastore;
+
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+import java.io.File;
+import java.io.RandomAccessFile;
+
+import org.eclipse.rdf4j.sail.nativerdf.NativeStore;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+/**
+ * Tests recovery in DataStore.getData when the stored data length is zero but neighboring ID offsets exist. The
+ * recovery uses the next ID's offset to infer the correct data length.
+ */
+public class DataStoreRecoveryTest {
+
+	@TempDir
+	File tempDir;
+
+	private boolean previousSoftFlag;
+
+	@BeforeEach
+	public void setup() {
+		previousSoftFlag = NativeStore.SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES;
+		NativeStore.SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES = true;
+	}
+
+	@AfterEach
+	public void teardown() {
+		NativeStore.SOFT_FAIL_ON_CORRUPT_DATA_AND_REPAIR_INDEXES = previousSoftFlag;
+	}
+
+	@Test
+	public void recoversDataUsingNextOffsetWhenLengthIsZero() throws Exception {
+		byte[] d1 = new byte[] { 1, 2, 3, 4, 5 };
+		byte[] d2 = new byte[] { 9, 8, 7 };
+
+		DataStore ds = new DataStore(tempDir, "values");
+		try {
+			int id1 = ds.storeData(d1);
+			// the second record supplies the "next ID" offset the recovery infers the length from
+			ds.storeData(d2);
+			ds.sync();
+
+			// Corrupt the first record's length to zero
+			long off1;
+			IDFile idFile = new IDFile(new File(tempDir, "values.id"));
+			try {
+				off1 = idFile.getOffset(id1);
+			} finally {
+				idFile.close();
+			}
+			try (RandomAccessFile raf = new RandomAccessFile(new File(tempDir, "values.dat"), "rw")) {
+				raf.seek(off1);
+				raf.write(new byte[] { 0, 0, 0, 0 });
+			}
+
+			// ds.getData(id1) must now fail, handing over the recovered bytes
+			RecoveredDataException rde = assertThrows(RecoveredDataException.class, () -> ds.getData(id1));
+			assertArrayEquals(d1, rde.getData(), "Recovered data should match original bytes");
+		} finally {
+			// release the file handles so the @TempDir can be deleted
+			ds.close();
+		}
+	}
+}
diff --git a/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptIRIOrBNodeTest.java b/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptIRIOrBNodeTest.java
new file mode 100644
index 00000000000..24e54d2121b
--- /dev/null
+++ b/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptIRIOrBNodeTest.java
@@ -0,0 +1,143 @@
+/*******************************************************************************
+ * Copyright (c) 2025 Eclipse RDF4J contributors.
+ *
+ * All rights reserved.
This program and the accompanying materials
+ * are made available under the terms of the Eclipse Distribution License v1.0
+ * which accompanies this distribution, and is available at
+ * http://www.eclipse.org/org/documents/edl-v10.php.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *******************************************************************************/
+package org.eclipse.rdf4j.sail.nativerdf.model;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.nio.charset.StandardCharsets;
+
+import org.apache.commons.codec.binary.Hex;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Unit tests for {@link CorruptIRIOrBNode#getLocalName()} recovery behavior.
+ */
+public class CorruptIRIOrBNodeTest {
+
+	private static CorruptIRIOrBNode nodeWithData(byte[] data) {
+		return new CorruptIRIOrBNode(null, 123, data);
+	}
+
+	@Test
+	public void recoversLongestValidUtf8Substring() {
+		// Prepare a byte array with 5-byte header followed by: invalid, valid ASCII/UTF-8, invalid, short valid
+		byte[] header = new byte[] { 0, 0, 0, 0, 0 };
+		byte[] invalid1 = new byte[] { (byte) 0xC3, (byte) 0x28 }; // invalid UTF-8 sequence
+		byte[] validLong = "validlong".getBytes(StandardCharsets.UTF_8);
+		byte[] invalid2 = new byte[] { (byte) 0xC0, (byte) 0xAF }; // invalid UTF-8 sequence
+		byte[] validShort = "abc".getBytes(StandardCharsets.UTF_8);
+
+		byte[] data = new byte[header.length + invalid1.length + validLong.length + invalid2.length
+				+ validShort.length];
+		int pos = 0;
+		System.arraycopy(header, 0, data, pos, header.length);
+		pos += header.length;
+		System.arraycopy(invalid1, 0, data, pos, invalid1.length);
+		pos += invalid1.length;
+		System.arraycopy(validLong, 0, data, pos, validLong.length);
+		pos += validLong.length;
+		System.arraycopy(invalid2, 0, data, pos, invalid2.length);
+		pos += invalid2.length;
+		System.arraycopy(validShort, 0, data, pos, validShort.length);
+
+		CorruptIRIOrBNode node = nodeWithData(data);
+		String localName = node.getLocalName();
+
+		// Expect a valid decodable segment to be chosen containing the core text
+		assertTrue(localName.startsWith("CORRUPT_"), "Should be prefixed with CORRUPT_");
+		assertTrue(localName.contains("validlong"), "Should recover the core decodable segment");
+	}
+
+	@Test
+	public void fallsBackToHexWhenNoDecodableSubstring() {
+		// Prepare a byte array with 5-byte header followed by bytes with no ASCII/UTF-8 decodable sequences
+		byte[] header = new byte[] { 0, 0, 0, 0, 0 };
+		byte[] body = new byte[] { (byte) 0x80, (byte) 0x81, (byte) 0xFE, (byte) 0xFF };
+
+		byte[] data = new byte[header.length + body.length];
+		System.arraycopy(header, 0, data, 0, header.length);
+		System.arraycopy(body, 0, data, header.length, body.length);
+
+		CorruptIRIOrBNode node = nodeWithData(data);
+		String expectedHex = Hex.encodeHexString(stripLeadingZeros(data));
+
+		String localName = node.getLocalName();
+		assertTrue(localName.startsWith("CORRUPT_"), "Should be prefixed with CORRUPT_");
+		assertEquals("CORRUPT_ID_" + node.getInternalID() + "_HEX_" + expectedHex, localName);
+	}
+
+	private byte[] stripLeadingZeros(byte[] data) {
+		int firstNonZero = data.length; // all-zero input strips down to an empty array
+		for (int i = 0; i < data.length; i++) {
+			if (data[i] != 0) {
+				firstNonZero = i;
+				break;
+			}
+		}
+		byte[] stripped = new byte[data.length - firstNonZero];
+		System.arraycopy(data, firstNonZero, stripped, 0, stripped.length);
+		return stripped;
+	}
+
+	@Test
+	public void stopsParsingAtTripleZeroSentinel() {
+		byte[] header = new byte[] { 0, 0, 0, 0, 0 };
+		byte[] valid = "abc".getBytes(StandardCharsets.UTF_8);
+		byte[] sentinel = new byte[] { 0, 0, 0 };
+		byte[] tail = "tail".getBytes(StandardCharsets.UTF_8);
+
+		byte[] data = new byte[header.length + valid.length + sentinel.length + tail.length];
+		int pos = 0;
+		System.arraycopy(header, 0, data, pos, header.length);
+		pos += header.length;
+		System.arraycopy(valid, 0, data, pos, valid.length);
+		pos += valid.length;
+		System.arraycopy(sentinel, 0, data, pos, sentinel.length);
+		pos += sentinel.length;
+		System.arraycopy(tail, 0, data, pos, tail.length);
+
+		CorruptIRIOrBNode node = nodeWithData(data);
+		String localName = node.getLocalName();
+
+		assertTrue(localName.startsWith("CORRUPT_"));
+		assertTrue(localName.contains("abc"), "Should recover text before sentinel");
+		assertTrue(!localName.contains("tail"), "Should not parse past sentinel");
+	}
+
+	@Test
+	public void ignoresLeadingZerosBeforeSentinel() {
+		byte[] header = new byte[] { 0, 0, 0, 0, 0 };
+		byte[] leadingZeros = new byte[] { 0, 0, 0, 0, 0, 0 };
+		byte[] valid = "abc".getBytes(StandardCharsets.UTF_8);
+		byte[] sentinel = new byte[] { 0, 0, 0 };
+		byte[] tail = "tail".getBytes(StandardCharsets.UTF_8);
+
+		byte[] data = new byte[header.length + leadingZeros.length + valid.length + sentinel.length + tail.length];
+		int pos = 0;
+		System.arraycopy(header, 0, data, pos, header.length);
+		pos += header.length;
+		System.arraycopy(leadingZeros, 0, data, pos, leadingZeros.length);
+		pos += leadingZeros.length;
+		System.arraycopy(valid, 0, data, pos, valid.length);
+		pos += valid.length;
+		System.arraycopy(sentinel, 0, data, pos, sentinel.length);
+		pos += sentinel.length;
+		System.arraycopy(tail, 0, data, pos, tail.length);
+
+		CorruptIRIOrBNode node = nodeWithData(data);
+		String localName = node.getLocalName();
+
+		assertTrue(localName.startsWith("CORRUPT_"));
+		assertTrue(localName.contains("abc"), "Should recover data after leading zeros");
+		assertTrue(!localName.contains("tail"), "Should stop at sentinel after non-zero encountered");
+	}
+}
diff --git a/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptLiteralTest.java b/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptLiteralTest.java
new file mode 100644
index 00000000000..134ffb64403
--- /dev/null
+++ 
b/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptLiteralTest.java
@@ -0,0 +1,80 @@
+/*******************************************************************************
+ * Copyright (c) 2025 Eclipse RDF4J contributors.
+ *
+ * All rights reserved. This program and the accompanying materials
+ * are made available under the terms of the Eclipse Distribution License v1.0
+ * which accompanies this distribution, and is available at
+ * http://www.eclipse.org/org/documents/edl-v10.php.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *******************************************************************************/
+package org.eclipse.rdf4j.sail.nativerdf.model;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.nio.charset.StandardCharsets;
+
+import org.apache.commons.codec.binary.Hex;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Unit tests for {@link CorruptLiteral#getLabel()} recovery behavior.
+ */
+public class CorruptLiteralTest {
+
+	private static CorruptLiteral litWithData(byte[] data) {
+		return new CorruptLiteral(null, 789, data);
+	}
+
+	@Test
+	public void recoversUtf8OrAscii() {
+		byte[] invalid1 = new byte[] { (byte) 0xC3, (byte) 0x28 };
+		byte[] valid = "validlong".getBytes(StandardCharsets.UTF_8);
+		byte[] invalid2 = new byte[] { (byte) 0xC0, (byte) 0xAF };
+		byte[] tail = "abc".getBytes(StandardCharsets.UTF_8);
+
+		byte[] data = new byte[invalid1.length + valid.length + invalid2.length + tail.length];
+		int pos = 0;
+		System.arraycopy(invalid1, 0, data, pos, invalid1.length);
+		pos += invalid1.length;
+		System.arraycopy(valid, 0, data, pos, valid.length);
+		pos += valid.length;
+		System.arraycopy(invalid2, 0, data, pos, invalid2.length);
+		pos += invalid2.length;
+		System.arraycopy(tail, 0, data, pos, tail.length);
+
+		CorruptLiteral lit = litWithData(data);
+		String label = lit.getLabel();
+
+		assertTrue(label.startsWith("CorruptLiteral with ID 789 with possible data: "));
+		assertTrue(label.contains("validlong"), "Should recover core decodable region");
+	}
+
+	@Test
+	public void fallsBackToHexWhenNoDecodable() {
+		byte[] body = new byte[] { (byte) 0x80, (byte) 0x81, (byte) 0xFE, (byte) 0xFF };
+		CorruptLiteral lit = litWithData(body);
+		String label = lit.getLabel();
+		assertTrue(label.contains(Hex.encodeHexString(body)), "Should include hex fallback");
+	}
+
+	@Test
+	public void stopsAtTripleZeroSentinel() {
+		byte[] head = "xyz".getBytes(StandardCharsets.UTF_8);
+		byte[] sentinel = new byte[] { 0, 0, 0 };
+		byte[] tail = "end".getBytes(StandardCharsets.UTF_8);
+		byte[] data = new byte[head.length + sentinel.length + tail.length];
+		int pos = 0;
+		System.arraycopy(head, 0, data, pos, head.length);
+		pos += head.length;
+		System.arraycopy(sentinel, 0, data, pos, sentinel.length);
+		pos += sentinel.length;
+		System.arraycopy(tail, 0, data, pos, tail.length);
+
+		CorruptLiteral lit = litWithData(data);
+		String label = lit.getLabel();
+		assertTrue(label.contains("xyz"), "Should include data before sentinel");
+		assertFalse(label.contains("end"), "Should not include data after sentinel");
+	}
+}
diff --git a/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptUnknownValueTest.java b/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptUnknownValueTest.java
new file mode 100644
index 00000000000..b32ee049334
--- /dev/null
+++ b/core/sail/nativerdf/src/test/java/org/eclipse/rdf4j/sail/nativerdf/model/CorruptUnknownValueTest.java
@@ -0,0 +1,87 @@
+/*******************************************************************************
+ * Copyright (c) 2025 Eclipse RDF4J contributors.
+ *
+ * All rights reserved. This program and the accompanying materials
+ * are made available under the terms of the Eclipse Distribution License v1.0
+ * which accompanies this distribution, and is available at
+ * http://www.eclipse.org/org/documents/edl-v10.php.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *******************************************************************************/
+package org.eclipse.rdf4j.sail.nativerdf.model;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.nio.charset.StandardCharsets;
+
+import org.apache.commons.codec.binary.Hex;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Unit tests for {@link CorruptUnknownValue#getLabel()} recovery behavior.
+ */
+public class CorruptUnknownValueTest {
+
+	private static CorruptUnknownValue valueWithData(byte[] data) {
+		return new CorruptUnknownValue(null, 456, data);
+	}
+
+	@Test
+	public void recoversLongestValidUtf8Substring() {
+		byte[] invalid1 = new byte[] { (byte) 0xC3, (byte) 0x28 }; // invalid UTF-8
+		byte[] validLong = "validlong".getBytes(StandardCharsets.UTF_8);
+		byte[] invalid2 = new byte[] { (byte) 0xC0, (byte) 0xAF }; // invalid UTF-8
+		byte[] validShort = "abc".getBytes(StandardCharsets.UTF_8);
+
+		byte[] data = new byte[invalid1.length + validLong.length + invalid2.length + validShort.length];
+		int pos = 0;
+		System.arraycopy(invalid1, 0, data, pos, invalid1.length);
+		pos += invalid1.length;
+		System.arraycopy(validLong, 0, data, pos, validLong.length);
+		pos += validLong.length;
+		System.arraycopy(invalid2, 0, data, pos, invalid2.length);
+		pos += invalid2.length;
+		System.arraycopy(validShort, 0, data, pos, validShort.length);
+
+		CorruptUnknownValue v = valueWithData(data);
+		String label = v.getLabel();
+
+		assertTrue(label.startsWith("CorruptUnknownValue with ID 456 with possible data: "));
+		assertTrue(label.contains("validlong"), "Should recover the core decodable segment");
+	}
+
+	@Test
+	public void fallsBackToHexWhenNoDecodableSubstring() {
+		byte[] data = new byte[] { (byte) 0x80, (byte) 0x81, (byte) 0xFE, (byte) 0xFF };
+		CorruptUnknownValue v = valueWithData(data);
+
+		String label = v.getLabel();
+		String expectedHex = Hex.encodeHexString(data);
+
+		assertTrue(label.startsWith("CorruptUnknownValue with ID 456 with possible data: "));
+		assertTrue(label.contains(expectedHex), "Should fall back to hex encoding when undecodable");
+	}
+
+	@Test
+	public void stopsParsingAtTripleZeroSentinel() {
+		byte[] valid = "xyz".getBytes(StandardCharsets.UTF_8);
+		byte[] sentinel = new byte[] { 0, 0, 0 };
+		byte[] tail = "end".getBytes(StandardCharsets.UTF_8);
+
+		byte[] data = new byte[valid.length + sentinel.length + tail.length];
+		int pos = 0;
+		System.arraycopy(valid, 0, data, pos, valid.length);
+		pos += valid.length;
+		System.arraycopy(sentinel, 0, data, pos, sentinel.length);
+		pos += sentinel.length;
+		System.arraycopy(tail, 0, data, pos, tail.length);
+
+		CorruptUnknownValue v = valueWithData(data);
+		String label = v.getLabel();
+
+		assertTrue(label.startsWith("CorruptUnknownValue with ID 456 with possible data: "));
+		assertTrue(label.contains("xyz"), "Should use data before sentinel");
+		assertFalse(label.contains("end"), "Should not parse past sentinel");
+	}
+}
diff --git a/core/sail/nativerdf/src/test/resources/logback-test.xml b/core/sail/nativerdf/src/test/resources/logback-test.xml
index 64b3764879e..4b3ea0773e9 100644
--- a/core/sail/nativerdf/src/test/resources/logback-test.xml
+++ b/core/sail/nativerdf/src/test/resources/logback-test.xml
@@ -6,7 +6,7 @@
-
+