-
Notifications
You must be signed in to change notification settings - Fork 164
Expand file tree
/
Copy pathsee-also-tokenize.html
More file actions
67 lines (64 loc) · 4.57 KB
/
see-also-tokenize.html
File metadata and controls
67 lines (64 loc) · 4.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
{%- comment -%}
  Per-article tokenization helper for assets/see-also.html.

  Parameters:
    include.article   — hash with .title and .url (a child or parent-as-page cat).
    include.cat_title — string, the parent category's title.

  Read from the including template's scope (not defined in this file):
    STOP            — stop-word collection, tested with `contains`.
    BODY_LEAD_CHARS — how many leading characters of the body are tokenized.
    blob            — accumulator string; this helper appends to it.

  Side effect: appends one
    `url@@@title@@@cat@@@|title_toks|@@@|body_toks|###`
  record to the outer `blob` variable.

  Token rules (title and body share them, except the minimum length):
    - lowercase, punctuation replaced by spaces, split on whitespace;
    - title tokens need 3..24 chars, body tokens 4..24 chars;
    - stop words and tokens starting with a digit are dropped;
    - a trailing "s" is stripped from tokens longer than 4 chars unless
      the token ends in "ss";
    - duplicates are dropped; body tokens already seen in the title are
      dropped as well.

  Implementation notes:
    - Includes share the caller's variable scope, so the looked-up page is
      stored in `sa_page` rather than `page`; assigning to `page` would
      shadow Jekyll's global page object for the rest of the render.
    - Liquid string literals have no escape sequences, so a single
      backslash is written as a one-character quoted string.
    - The pipe and the at-sign are stripped from tokens because they are
      the token delimiter and the field separator of the record format.
{%- endcomment -%}
{%- assign norm_url = include.article.url | append: "/" | replace: "//", "/" -%}
{%- assign sa_page = site.pages | where: "url", norm_url | first -%}

{%- comment -%} ---- Title tokens ---- {%- endcomment -%}
{%- assign title_text = include.article.title | downcase -%}
{%- assign title_text = title_text | replace: ".", " " | replace: ",", " " | replace: ";", " " | replace: ":", " " | replace: "(", " " | replace: ")", " " | replace: "[", " " | replace: "]", " " | replace: "{", " " | replace: "}", " " | replace: "/", " " | replace: "-", " " | replace: "_", " " | replace: "&", " " | replace: "?", " " | replace: "!", " " | replace: "=", " " | replace: "+", " " | replace: "*", " " | replace: "'", " " | replace: '"', " " | replace: "\", " " | replace: "<", " " | replace: ">", " " | replace: "#", " " | replace: "|", " " | replace: "@", " " -%}
{%- assign title_wrapped = "|" -%}
{%- assign title_raw = title_text | split: " " -%}
{%- for tok in title_raw -%}
  {%- assign t = tok | strip -%}
  {%- if t.size < 3 or t.size > 24 -%}{%- continue -%}{%- endif -%}
  {%- comment -%} NOTE(review): if STOP is a string rather than an array,
      `contains` is a substring test, not exact membership — confirm
      against the caller. {%- endcomment -%}
  {%- if STOP contains t -%}{%- continue -%}{%- endif -%}
  {%- comment -%} Skip tokens that start with a digit (t is never empty
      here, so first_char is exactly one character). {%- endcomment -%}
  {%- assign first_char = t | slice: 0, 1 -%}
  {%- if "0123456789" contains first_char -%}{%- continue -%}{%- endif -%}
  {%- comment -%} Light stemming: strip trailing 's' to unify plurals
      (apriltags->apriltag, sensors->sensor). Skip if "ss" (address) or
      short (toss). Naive but effective on technical English. {%- endcomment -%}
  {%- assign last2 = t | slice: -2, 2 -%}
  {%- if t.size > 4 and last2 != "ss" -%}
    {%- assign last1 = t | slice: -1, 1 -%}
    {%- if last1 == "s" -%}
      {%- assign sz_m = t.size | minus: 1 -%}
      {%- assign t = t | slice: 0, sz_m -%}
    {%- endif -%}
  {%- endif -%}
  {%- comment -%} Dedup via a delimiter-wrapped lookup so that a token only
      matches a whole earlier token, never a fragment of one. {%- endcomment -%}
  {%- assign needle = "|" | append: t | append: "|" -%}
  {%- if title_wrapped contains needle -%}{%- continue -%}{%- endif -%}
  {%- assign title_wrapped = title_wrapped | append: t | append: "|" -%}
{%- endfor -%}

{%- comment -%} ---- Body tokens ---- {%- endcomment -%}
{%- assign body_wrapped = "|" -%}
{%- if sa_page and sa_page.content -%}
  {%- comment -%} normalize_whitespace turns every whitespace run (newlines
      included) into one space. Deleting newlines outright would glue the
      last word of a line onto the first word of the next. {%- endcomment -%}
  {%- assign body_text = sa_page.content | strip_html | normalize_whitespace | truncate: BODY_LEAD_CHARS, "" | downcase -%}
  {%- assign body_text = body_text | replace: ".", " " | replace: ",", " " | replace: ";", " " | replace: ":", " " | replace: "(", " " | replace: ")", " " | replace: "[", " " | replace: "]", " " | replace: "{", " " | replace: "}", " " | replace: "/", " " | replace: "-", " " | replace: "_", " " | replace: "&", " " | replace: "?", " " | replace: "!", " " | replace: "=", " " | replace: "+", " " | replace: "*", " " | replace: "'", " " | replace: '"', " " | replace: "\", " " | replace: "<", " " | replace: ">", " " | replace: "#", " " | replace: "|", " " | replace: "@", " " -%}
  {%- comment -%} Body dedup is seeded with title_wrapped so we never
      double-count tokens that appeared in the title. {%- endcomment -%}
  {%- assign b_seen = title_wrapped -%}
  {%- assign body_raw = body_text | split: " " -%}
  {%- for tok in body_raw -%}
    {%- assign t = tok | strip -%}
    {%- if t.size < 4 or t.size > 24 -%}{%- continue -%}{%- endif -%}
    {%- if STOP contains t -%}{%- continue -%}{%- endif -%}
    {%- assign first_char = t | slice: 0, 1 -%}
    {%- if "0123456789" contains first_char -%}{%- continue -%}{%- endif -%}
    {%- comment -%} Same light plural stemming as for title tokens. {%- endcomment -%}
    {%- assign last2 = t | slice: -2, 2 -%}
    {%- if t.size > 4 and last2 != "ss" -%}
      {%- assign last1 = t | slice: -1, 1 -%}
      {%- if last1 == "s" -%}
        {%- assign sz_m = t.size | minus: 1 -%}
        {%- assign t = t | slice: 0, sz_m -%}
      {%- endif -%}
    {%- endif -%}
    {%- assign needle = "|" | append: t | append: "|" -%}
    {%- if b_seen contains needle -%}{%- continue -%}{%- endif -%}
    {%- assign b_seen = b_seen | append: t | append: "|" -%}
    {%- assign body_wrapped = body_wrapped | append: t | append: "|" -%}
  {%- endfor -%}
{%- endif -%}

{%- comment -%} Emit the record; fields are joined by "@@@" and the record
    is terminated by "###". {%- endcomment -%}
{%- assign blob = blob | append: norm_url | append: "@@@" | append: include.article.title | append: "@@@" | append: include.cat_title | append: "@@@" | append: title_wrapped | append: "@@@" | append: body_wrapped | append: "###" -%}