Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions scripts/metadata-validate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,56 @@ if [ "$LENGTH_CHECK_FAILED" -ne 0 ]; then
exit 1
fi

# Content sanity checks (warnings only) on CIP-108 prose fields:
# - leading or trailing whitespace
# - very long markdown headings
# - nested CIP-108 field-name headings (e.g. "### abstract" inside the abstract body)
#
# Globals:
#   JSON_FILE           (read)    metadata file handed to jq
#   PROSE_FIELDS        (written) names of the .body fields that are inspected
#   HEADING_LEN_MAX     (written) longest heading text accepted without a warning
#   CONTENT_WARN_COUNT  (written) number of warnings raised by this section
# Outputs:
#   One print_warn call per finding; print_pass when nothing was flagged.
run_content_sanity_checks() {
  local field text heading_line heading_text len nested

  print_section "Checking content sanity"
  CONTENT_WARN_COUNT=0
  PROSE_FIELDS=(title abstract motivation rationale)
  HEADING_LEN_MAX=80

  for field in "${PROSE_FIELDS[@]}"; do
    # Command substitution deletes *every* trailing newline, which would hide the
    # most common kind of trailing whitespace. Append a sentinel character so the
    # value survives intact, then drop the sentinel and the single newline that
    # `jq -r` itself adds after the value. `&&` keeps jq's exit status visible.
    text=$(jq -r --arg f "$field" '.body[$f] // empty' "$JSON_FILE" && printf '.')
    text=${text%.}
    text=${text%$'\n'}
    [[ -n "$text" ]] || continue

    if [[ "$text" =~ ^[[:space:]] ]]; then
      print_warn "'$field' has leading whitespace."
      CONTENT_WARN_COUNT=$((CONTENT_WARN_COUNT + 1))
    fi
    if [[ "$text" =~ [[:space:]]$ ]]; then
      print_warn "'$field' has trailing whitespace."
      CONTENT_WARN_COUNT=$((CONTENT_WARN_COUNT + 1))
    fi

    # Very long ATX-style headings (line-start only, 1-6 '#' characters; a longer
    # run of '#' is not a heading). Here-strings are used instead of
    # `printf | grep` so there is no writer process that can be killed by SIGPIPE.
    while IFS= read -r heading_line; do
      heading_line=${heading_line//$'\r'/}
      # Strip the opening marker and an optional closing run of '#'.
      heading_text=$(sed -E \
        's/^[[:space:]]{0,3}#{1,6}[[:space:]]*//; s/[[:space:]]+#+[[:space:]]*$//' \
        <<<"$heading_line")
      [[ -n "$heading_text" ]] || continue
      len=${#heading_text}
      if ((len > HEADING_LEN_MAX)); then
        print_warn "'$field' has a markdown heading of $len chars (>$HEADING_LEN_MAX): \"$heading_text\""
        CONTENT_WARN_COUNT=$((CONTENT_WARN_COUNT + 1))
      fi
    done < <(grep -E '^[[:space:]]{0,3}#{1,6}[[:space:]]+' <<<"$text" || true)

    # Nested CIP-108 field-name reuse: '### abstract', '## motivation', etc., anywhere
    # in the text (line-start OR mid-string). Mid-string matches are intentional:
    # writers paste heading-style markers inline (e.g. "My amazing ### motivation")
    # and that is still a field-name reuse worth flagging.
    # With `grep -q` fed through a pipe, an early exit could SIGPIPE the writer and,
    # if the script runs with pipefail, turn a match into a silent miss.
    for nested in "${PROSE_FIELDS[@]}"; do
      if grep -qiE "(^|[^[:alnum:]_#])#{1,6}[[:space:]]+${nested}([^[:alnum:]_]|\$)" <<<"$text"; then
        print_warn "'$field' contains nested CIP-108 field heading marker for '$nested' (e.g. '### $nested'). Field content shouldn't redeclare a CIP-108 field name as a heading."
        CONTENT_WARN_COUNT=$((CONTENT_WARN_COUNT + 1))
      fi
    done
  done

  if [ "$CONTENT_WARN_COUNT" -eq 0 ]; then
    print_pass "No content sanity warnings."
  fi
}

run_content_sanity_checks

# Basic spell check on key data fields (requires 'aspell' installed)
if [ "$check_spelling" = "true" ]; then
print_section "Applying spell check"
Expand Down