Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions _includes/head/custom.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{%- comment -%}
Theme head hook: load the "See also" assets (stylesheet + deferred script)
only on wiki pages, so every other page pays zero extra bytes. The script
is `defer`red, so it runs after the DOM is parsed.
{%- endcomment -%}
{%- if page.url contains "/wiki/" -%}
<link rel="stylesheet" href="{{ '/assets/css/see-also.css' | relative_url }}">
<script defer src="{{ '/assets/js/see-also.js' | relative_url }}"></script>
{%- endif -%}
67 changes: 67 additions & 0 deletions _includes/see-also-tokenize.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
{%- comment -%}
Per-article tokenization helper for assets/see-also.html.
Parameters:
include.article — hash with .title and .url (a child or parent-as-page cat).
include.cat_title — string, the parent category's title.
Side effect: appends one `url@@@title@@@cat@@@|title_toks|@@@|body_toks|###`
record to the outer `blob` variable.
Also reads outer variables set by the caller: STOP (stop-word array) and
BODY_LEAD_CHARS (how many body characters to tokenize).
Token lists are pipe-wrapped ("|tok1|tok2|") so exact-token membership can
later be tested with `contains "|tok|"` without substring false positives.
{%- endcomment -%}
{%- assign norm_url = include.article.url | append: "/" | replace: "//", "/" -%}
{%- assign page = site.pages | where: "url", norm_url | first -%}
{%- comment -%} NOTE(review): this assign shadows Jekyll's global `page`
object for the remainder of the render — presumably harmless because the
caller (assets/see-also.html) uses layout: null, but confirm before
reusing this include from a layouted page. {%- endcomment -%}

{%- comment -%} Normalize the title: lowercase, then map punctuation to
spaces so `split: " "` yields bare word tokens. {%- endcomment -%}
{%- assign title_text = include.article.title | downcase -%}
{%- assign title_text = title_text | replace: ".", " " | replace: ",", " " | replace: ";", " " | replace: ":", " " | replace: "(", " " | replace: ")", " " | replace: "[", " " | replace: "]", " " | replace: "{", " " | replace: "}", " " | replace: "/", " " | replace: "-", " " | replace: "_", " " | replace: "&", " " | replace: "?", " " | replace: "!", " " | replace: "=", " " | replace: "+", " " | replace: "*", " " | replace: "'", " " | replace: '"', " " | replace: "\\", " " | replace: "<", " " | replace: ">", " " | replace: "#", " " -%}
{%- assign title_wrapped = "|" -%}
{%- assign title_raw = title_text | split: " " -%}
{%- for tok in title_raw -%}
{%- assign t = tok | strip -%}
{%- comment -%} Length gate: drop tokens shorter than 3 or longer than 24
characters, stop-words, and tokens starting with a digit (version
numbers, measurements). {%- endcomment -%}
{%- if t.size < 3 or t.size > 24 -%}{%- continue -%}{%- endif -%}
{%- if STOP contains t -%}{%- continue -%}{%- endif -%}
{%- assign first_char = t | slice: 0, 1 -%}
{%- if first_char == "0" or first_char == "1" or first_char == "2" or first_char == "3" or first_char == "4" or first_char == "5" or first_char == "6" or first_char == "7" or first_char == "8" or first_char == "9" -%}{%- continue -%}{%- endif -%}
{%- comment -%} Light stemming: strip trailing 's' to unify plurals
(apriltags->apriltag, sensors->sensor). Skip if "ss" (address) or
short (toss). Naive but effective on technical English. {%- endcomment -%}
{%- assign last2 = t | slice: -2, 2 -%}
{%- if t.size > 4 and last2 != "ss" -%}
{%- assign last1 = t | slice: -1, 1 -%}
{%- if last1 == "s" -%}
{%- assign sz_m = t.size | minus: 1 -%}
{%- assign t = t | slice: 0, sz_m -%}
{%- endif -%}
{%- endif -%}
{%- comment -%} Dedup within the title via exact pipe-wrapped lookup. {%- endcomment -%}
{%- assign needle = "|" | append: t | append: "|" -%}
{%- if title_wrapped contains needle -%}{%- continue -%}{%- endif -%}
{%- assign title_wrapped = title_wrapped | append: t | append: "|" -%}
{%- endfor -%}

{%- comment -%} Body tokens: same pipeline, but over the first
BODY_LEAD_CHARS of the page content, and with a stricter minimum token
length (4 here vs 3 for titles). {%- endcomment -%}
{%- assign body_wrapped = "|" -%}
{%- if page and page.content -%}
{%- assign body_text = page.content | strip_html | strip_newlines | truncate: BODY_LEAD_CHARS, "" | downcase -%}
{%- assign body_text = body_text | replace: ".", " " | replace: ",", " " | replace: ";", " " | replace: ":", " " | replace: "(", " " | replace: ")", " " | replace: "[", " " | replace: "]", " " | replace: "{", " " | replace: "}", " " | replace: "/", " " | replace: "-", " " | replace: "_", " " | replace: "&", " " | replace: "?", " " | replace: "!", " " | replace: "=", " " | replace: "+", " " | replace: "*", " " | replace: "'", " " | replace: '"', " " | replace: "\\", " " | replace: "<", " " | replace: ">", " " | replace: "#", " " -%}
{%- comment -%} Body dedup is seeded with title_wrapped so we never
double-count tokens that appeared in the title. {%- endcomment -%}
{%- assign b_seen = title_wrapped -%}
{%- assign body_raw = body_text | split: " " -%}
{%- for tok in body_raw -%}
{%- assign t = tok | strip -%}
{%- if t.size < 4 or t.size > 24 -%}{%- continue -%}{%- endif -%}
{%- if STOP contains t -%}{%- continue -%}{%- endif -%}
{%- assign first_char = t | slice: 0, 1 -%}
{%- if first_char == "0" or first_char == "1" or first_char == "2" or first_char == "3" or first_char == "4" or first_char == "5" or first_char == "6" or first_char == "7" or first_char == "8" or first_char == "9" -%}{%- continue -%}{%- endif -%}
{%- assign last2 = t | slice: -2, 2 -%}
{%- if t.size > 4 and last2 != "ss" -%}
{%- assign last1 = t | slice: -1, 1 -%}
{%- if last1 == "s" -%}
{%- assign sz_m = t.size | minus: 1 -%}
{%- assign t = t | slice: 0, sz_m -%}
{%- endif -%}
{%- endif -%}
{%- assign needle = "|" | append: t | append: "|" -%}
{%- if b_seen contains needle -%}{%- continue -%}{%- endif -%}
{%- assign b_seen = b_seen | append: t | append: "|" -%}
{%- assign body_wrapped = body_wrapped | append: t | append: "|" -%}
{%- endfor -%}
{%- endif -%}

{%- comment -%} Emit the record. Field separator @@@, record separator ###;
the scorer splits on these. {%- endcomment -%}
{%- assign blob = blob | append: norm_url | append: "@@@" | append: include.article.title | append: "@@@" | append: include.cat_title | append: "@@@" | append: title_wrapped | append: "@@@" | append: body_wrapped | append: "###" -%}
15 changes: 15 additions & 0 deletions assets/css/see-also.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/* Styles for the "See also" panel that assets/js/see-also.js appends
   to the bottom of wiki articles. */

/* Outer panel: separated from the article body by a top rule. */
.sa-panel {
  border-top: 1px solid #e5e8ec;
  margin: 2.2rem 0 1rem;
  padding-top: 1.2rem;
}

/* Panel heading ("See also"). */
.sa-heading {
  color: #2c3e50;
  font-size: 1.05rem;
  font-weight: 600;
  margin: 0 0 0.5rem;
}

/* Recommendation list. */
.sa-list {
  list-style: disc;
  margin: 0;
  padding-left: 1.4rem;
}

.sa-list li {
  font-size: 0.95rem;
  margin: 0.15rem 0;
}

/* Links: underline only on hover. */
.sa-list a {
  color: #2563aa;
  text-decoration: none;
}

.sa-list a:hover {
  text-decoration: underline;
}
48 changes: 48 additions & 0 deletions assets/js/see-also.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
(function () {
  'use strict';

  // Client-side "See also" panel: fetch the build-time recommendation
  // index (/assets/see-also.json, keyed by page path) and, if the current
  // wiki article has entries, append a linked list to the article body.

  const path = window.location.pathname;
  if (!path.startsWith('/wiki/')) return;

  const target = document.querySelector('.page__content');
  if (!target) return;

  fetch('/assets/see-also.json', { credentials: 'same-origin' })
    .then((r) => {
      if (!r.ok) throw new Error(`HTTP ${r.status}`);
      return r.json();
    })
    .then((data) => {
      const recs = data[path];
      // Guard against a missing key or a malformed index entry — render()
      // assumes an array of {url, title} objects.
      if (!Array.isArray(recs) || recs.length === 0) return;
      render(recs);
    })
    .catch((err) => {
      // Best-effort feature: log and stay silent rather than break the page.
      if (window.console && console.warn) console.warn('[see-also]', err);
    });

  /**
   * Build the panel (<section> > <h3> + <ul>) and append it to the
   * article content container.
   * @param {{url: string, title: string}[]} recs - non-empty list of links.
   */
  function render(recs) {
    const panel = document.createElement('section');
    panel.className = 'sa-panel';
    panel.setAttribute('aria-label', 'Related articles');

    const h = document.createElement('h3');
    h.className = 'sa-heading';
    h.textContent = 'See also';
    panel.appendChild(h);

    const ul = document.createElement('ul');
    ul.className = 'sa-list';
    recs.forEach((r) => {
      const li = document.createElement('li');
      const a = document.createElement('a');
      a.href = r.url;
      // textContent (not innerHTML): titles come from fetched data.
      a.textContent = r.title;
      li.appendChild(a);
      ul.appendChild(li);
    });
    panel.appendChild(ul);

    target.appendChild(panel);
  }
})();
207 changes: 207 additions & 0 deletions assets/see-also.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
---
permalink: /assets/see-also.json
layout: null
sitemap: false
---
{%- comment -%}
Per-article "See also" recommendations, derived purely from token overlap.
No human curation, no preprocessor — pure Liquid at GH-Pages build time.

Output: { "/wiki/cat/article/": [{"url": ..., "title": ...}, ...] }
Up to MAX_K entries; fewer (or zero) if no targets clear the score threshold.

Algorithm: bidirectional title matching + IDF-bucketed body overlap.
Phase 1 — tokenize title and body lead per article.
Phase 1.5 — bucket body tokens by document frequency (rare/medium/common),
dropping ones too rare or too generic to discriminate.
Phase 2 — pairwise score: title_hits × TITLE_WEIGHT + body_score, where
title_hits counts BOTH directions (source-title-in-target AND
target-title-in-source). Same-category match gets a 1.2× bonus.
Phase 3 — adaptive K: keep recs with score ≥ max(MIN_SCORE, top/2),
cap at MAX_K. Strong articles get 3–4 recs; weak get 0–2.

Hyperparameters tuned via Python ablation harness against 26 hand-curated
source articles (3–7 expected good recs + 2–3 hard-negatives each):
best config scored 92/130 on TP-2*FP, with zero hard false positives.
{%- endcomment -%}

{%- assign STOP = "the,and,for,with,this,that,from,have,has,had,can,will,would,could,should,may,might,must,does,did,doing,done,been,being,about,above,after,again,against,all,also,any,are,because,before,below,between,both,but,each,few,more,most,much,other,over,same,some,such,than,then,there,these,those,through,under,until,very,was,were,what,when,where,which,while,who,whom,whose,why,how,you,your,our,his,her,its,their,they,them,not,now,off,one,two,too,nor,yes,upon,unto,onto,into,https,http,html,com,net,org,old,new,use,used,see,seen,via,let,etc,non" | split: "," -%}
{%- comment -%}
Constants (integer ratios stand in for floats, which Liquid lacks):
REL_THRESHOLD 1/2 — keep recs scoring at least half the top rec's score;
SAME_CAT 12/10 — the 1.2× same-category bonus. The DF bounds
(MIN_BODY_DF..MAX_BODY_DF) drop body tokens too rare or too common to
discriminate; RARE_DF_MAX / MEDIUM_DF_MAX split survivors into the
5/2/1-point buckets used in Phase 2.
{%- endcomment -%}
{%- assign TITLE_WEIGHT = 7 -%}
{%- assign MIN_SCORE = 5 -%}
{%- assign MAX_K = 4 -%}
{%- assign REL_THRESHOLD_NUM = 1 -%}
{%- assign REL_THRESHOLD_DEN = 2 -%}
{%- assign SAME_CAT_NUM = 12 -%}
{%- assign SAME_CAT_DEN = 10 -%}
{%- assign MIN_BODY_DF = 2 -%}
{%- assign RARE_DF_MAX = 7 -%}
{%- assign MEDIUM_DF_MAX = 10 -%}
{%- assign MAX_BODY_DF = 30 -%}
{%- assign BODY_LEAD_CHARS = 400 -%}

{%- comment -%} ============================================================
Phase 1: tokenize title + body separately for every article in the wiki.
Per-article record: url@@@title@@@cat@@@|title_toks|@@@|body_toks|
Article separator: ###
Per-article tokenization is factored into _includes/see-also-tokenize.html
so the same logic runs for both regular `cat.children` entries and the rare
parent-as-page cat (e.g. "Robotics Project Guide" → master-guide.md).
============================================================ {%- endcomment -%}
{%- assign blob = "" -%}
{%- for cat in site.data.navigation.wiki -%}
{%- if cat.title == "Overview" -%}{%- continue -%}{%- endif -%}

{%- comment -%} Parent-as-page nav entry (e.g. Robotics Project Guide → master-guide):
include the cat as an article only when its URL has a slug after the category
(/wiki/foo/bar/), not a bare category landing (/wiki/foo/) — those resolve to
auto-generated index pages with generic titles that pollute the recommender. {%- endcomment -%}
{%- if cat.url -%}
{%- assign cat_norm_url = cat.url | append: "/" | replace: "//", "/" -%}
{%- comment -%} "slug after the category" test: strip /wiki/, turn the
remaining slashes into spaces — two-plus path segments leave a space. {%- endcomment -%}
{%- assign cat_suffix = cat_norm_url | remove_first: "/wiki/" | replace: "/", " " | strip -%}
{%- if cat_suffix contains " " -%}
{%- assign cat_page = site.pages | where: "url", cat_norm_url | first -%}
{%- if cat_page and cat_page.content -%}
{%- include see-also-tokenize.html article=cat cat_title=cat.title -%}
{%- endif -%}
{%- endif -%}
{%- endif -%}

{%- if cat.children -%}
{%- for child in cat.children -%}
{%- include see-also-tokenize.html article=child cat_title=cat.title -%}
{%- endfor -%}
{%- endif -%}
{%- endfor -%}

{%- assign all_entries = blob | split: "###" -%}

{%- comment -%}
Phase 1.5: bucket each unique body token by document frequency. Bucketing
is a Liquid-friendly stand-in for IDF weighting (no log() in Liquid). The
Lucene MoreLikeThis paper and the BM25 reproducibility study both find
binned IDF nearly indistinguishable from continuous IDF in practice.
Title tokens are not iterated here — they get a uniform TITLE_WEIGHT in
scoring. Note tok_freq counts |tok| occurrences across the whole blob
(title + body segments), so a body token whose word also appears in many
titles inherits those into its DF bucket. Hyperparameters were tuned
against this exact count, not against a body-only DF.
{%- endcomment -%}
{%- assign rare_set = "|" -%}
{%- assign medium_set = "|" -%}
{%- assign common_set = "|" -%}
{%- comment -%} global_seen dedups tokens across all articles so each
unique token's DF is computed exactly once (the split-count below walks
the whole blob). {%- endcomment -%}
{%- assign global_seen = "|" -%}
{%- for entry in all_entries -%}
{%- if entry.size == 0 -%}{%- continue -%}{%- endif -%}
{%- assign p = entry | split: "@@@" -%}
{%- assign body_tokens_arr = p[4] | split: "|" -%}
{%- for tok in body_tokens_arr -%}
{%- if tok.size == 0 -%}{%- continue -%}{%- endif -%}
{%- assign tneedle = "|" | append: tok | append: "|" -%}
{%- if global_seen contains tneedle -%}{%- continue -%}{%- endif -%}
{%- assign global_seen = global_seen | append: tok | append: "|" -%}
{%- comment -%} DF = count of non-overlapping "|tok|" occurrences in the
blob (split yields segments = occurrences + 1). {%- endcomment -%}
{%- assign tok_freq = blob | split: tneedle | size | minus: 1 -%}
{%- if tok_freq < MIN_BODY_DF or tok_freq > MAX_BODY_DF -%}{%- continue -%}{%- endif -%}
{%- if tok_freq <= RARE_DF_MAX -%}
{%- assign rare_set = rare_set | append: tok | append: "|" -%}
{%- elsif tok_freq <= MEDIUM_DF_MAX -%}
{%- assign medium_set = medium_set | append: tok | append: "|" -%}
{%- else -%}
{%- assign common_set = common_set | append: tok | append: "|" -%}
{%- endif -%}
{%- endfor -%}
{%- endfor -%}

{%- comment -%}
Phase 2 + 3: pairwise scoring (bidirectional title + IDF-bucketed body) and
adaptive top-K emit per source.
{%- endcomment -%}
{
{%- assign first_emit = true -%}
{%- for source in all_entries -%}
{%- if source.size == 0 -%}{%- continue -%}{%- endif -%}
{%- assign sp = source | split: "@@@" -%}
{%- assign s_url = sp[0] -%}
{%- assign s_category = sp[2] -%}
{%- assign s_title_tokens = sp[3] | split: "|" -%}
{%- assign s_body_tokens = sp[4] | split: "|" -%}
{%- comment -%} sp[3] ends with "|" and sp[4] starts with "|", so plain
concatenation keeps the pipe-wrapped invariant needed for exact-token
`contains "|tok|"` checks below. {%- endcomment -%}
{%- assign s_combined = sp[3] | append: sp[4] -%}

{%- assign scores = "" -%}
{%- for target in all_entries -%}
{%- if target.size == 0 -%}{%- continue -%}{%- endif -%}
{%- assign tp = target | split: "@@@" -%}
{%- if tp[0] == s_url -%}{%- continue -%}{%- endif -%}
{%- assign t_combined = tp[3] | append: tp[4] -%}

{%- comment -%} Bidirectional title matching: count source-title tokens
found in target AND target-title tokens found in source. Handles
narrow-title articles like "Pixhawk" — single largest quality lift in
ablation (score 78 -> 90 vs source-only). {%- endcomment -%}
{%- assign title_hits = 0 -%}
{%- for tok in s_title_tokens -%}
{%- if tok.size == 0 -%}{%- continue -%}{%- endif -%}
{%- assign needle = "|" | append: tok | append: "|" -%}
{%- if t_combined contains needle -%}{%- assign title_hits = title_hits | plus: 1 -%}{%- endif -%}
{%- endfor -%}
{%- assign t_title_tokens = tp[3] | split: "|" -%}
{%- for tok in t_title_tokens -%}
{%- if tok.size == 0 -%}{%- continue -%}{%- endif -%}
{%- assign needle = "|" | append: tok | append: "|" -%}
{%- if s_combined contains needle -%}{%- assign title_hits = title_hits | plus: 1 -%}{%- endif -%}
{%- endfor -%}

{%- comment -%} Body overlap: each shared body token scores by its DF
bucket (rare 5 / medium 2 / common 1); tokens in no bucket score 0. {%- endcomment -%}
{%- assign body_score = 0 -%}
{%- for tok in s_body_tokens -%}
{%- if tok.size == 0 -%}{%- continue -%}{%- endif -%}
{%- assign needle = "|" | append: tok | append: "|" -%}
{%- unless t_combined contains needle -%}{%- continue -%}{%- endunless -%}
{%- if rare_set contains needle -%}
{%- assign body_score = body_score | plus: 5 -%}
{%- elsif medium_set contains needle -%}
{%- assign body_score = body_score | plus: 2 -%}
{%- elsif common_set contains needle -%}
{%- assign body_score = body_score | plus: 1 -%}
{%- endif -%}
{%- endfor -%}

{%- assign score = title_hits | times: TITLE_WEIGHT | plus: body_score -%}
{%- if tp[2] == s_category -%}
{%- assign score = score | times: SAME_CAT_NUM | divided_by: SAME_CAT_DEN -%}
{%- endif -%}
{%- if score < MIN_SCORE -%}{%- continue -%}{%- endif -%}

{%- comment -%} Pad score to 4 digits so lexicographic sort orders numerically. {%- endcomment -%}
{%- assign padded = "0000" | append: score -%}
{%- assign padded = padded | slice: -4, 4 -%}
{%- assign scores = scores | append: padded | append: "@@@" | append: tp[0] | append: "@@@" | append: tp[1] | append: "&&&" -%}
{%- endfor -%}

{%- comment -%} String sort over "PPPP@@@url@@@title" lines: the
zero-padded score prefix makes the sort numeric; reverse gives
descending order, ties broken lexicographically by URL. {%- endcomment -%}
{%- assign score_lines = scores | split: "&&&" | sort | reverse -%}
{%- assign rel_threshold = MIN_SCORE -%}
{%- assign top_str = "" -%}
{%- for line in score_lines -%}
{%- if line.size > 0 -%}{%- assign top_str = line -%}{%- break -%}{%- endif -%}
{%- endfor -%}
{%- if top_str.size > 0 -%}
{%- comment -%} `plus: 0` coerces the padded score string back to an
integer before the ratio math. {%- endcomment -%}
{%- assign top_score = top_str | split: "@@@" | first | plus: 0 -%}
{%- assign half_top = top_score | times: REL_THRESHOLD_NUM | divided_by: REL_THRESHOLD_DEN -%}
{%- if half_top > rel_threshold -%}{%- assign rel_threshold = half_top -%}{%- endif -%}
{%- endif -%}

{%- comment -%} Emit this source's JSON entry; first_emit / emitted drive
comma placement so the output stays valid JSON. {%- endcomment -%}
{%- unless first_emit -%},{%- endunless -%}
{{ s_url | jsonify }}:[
{%- assign emitted = 0 -%}
{%- for line in score_lines -%}
{%- if line.size == 0 -%}{%- continue -%}{%- endif -%}
{%- if emitted >= MAX_K -%}{%- break -%}{%- endif -%}
{%- assign rp = line | split: "@@@" -%}
{%- assign rscore = rp[0] | plus: 0 -%}
{%- comment -%} Lines are score-descending, so the first sub-threshold
line ends the emit loop. {%- endcomment -%}
{%- if rscore < rel_threshold -%}{%- break -%}{%- endif -%}
{%- unless emitted == 0 -%},{%- endunless -%}
{"url":{{ rp[1] | jsonify }},"title":{{ rp[2] | jsonify }}}
{%- assign emitted = emitted | plus: 1 -%}
{%- endfor -%}
]
{%- assign first_emit = false -%}
{%- endfor -%}
}