|
51 | 51 |
|
52 | 52 | it { is_expected.to eq("This is not {code} and not [a link]") } |
53 | 53 | end |
| 54 | + |
| 55 | + context "with invalid UTF-8 byte sequences in the input" do |
| 56 | + it "drops a stray invalid byte and keeps the surrounding text" do |
| 57 | + input = "Hello \xFF world".dup |
| 58 | + expect(input.valid_encoding?).to be(false) |
| 59 | + expect(described_class.new(input).convert).to eq("Hello ? world") |
| 60 | + end |
| 61 | + |
| 62 | + it "drops a stray continuation byte" do |
| 63 | + input = "abc \x80 def".dup |
| 64 | + expect(input.valid_encoding?).to be(false) |
| 65 | + expect(described_class.new(input).convert).to eq("abc ? def") |
| 66 | + end |
| 67 | + |
| 68 | + it "drops a truncated multi-byte sequence" do |
| 69 | + input = "pre \xC3 post".dup |
| 70 | + expect(input.valid_encoding?).to be(false) |
| 71 | + expect(described_class.new(input).convert).to eq("pre ? post") |
| 72 | + end |
| 73 | + |
| 74 | + it "preserves valid multi-byte characters while dropping only the invalid byte" do |
| 75 | + input = "héllo \xFF world".dup |
| 76 | + expect(input.valid_encoding?).to be(false) |
| 77 | + expect(described_class.new(input).convert).to eq("héllo ? world") |
| 78 | + end |
| 79 | + |
| 80 | + it "still parses formatting around invalid bytes inside delimiters" do |
| 81 | + input = "*bold\xFFtext*".dup |
| 82 | + expect(input.valid_encoding?).to be(false) |
| 83 | + expect(described_class.new(input).convert).to eq("**bold?text**") |
| 84 | + end |
| 85 | + end |
54 | 86 | end |
55 | 87 |
|
56 | 88 | describe "line ending normalization" do |
|
241 | 273 |
|
242 | 274 | it { is_expected.to eq("H<sub>2</sub>O") } |
243 | 275 | end |
| 276 | + |
| 277 | + context "with multi-byte UTF-8 characters inside formatting delimiters" do # each example checks that a delimiter pair still converts when the enclosed text is multi-byte UTF-8 |
| 278 | + it "handles bold with multi-byte characters" do |
| 279 | + expect(described_class.new("This is *héllo* text.").convert) |
| 280 | + .to eq("This is **héllo** text.") |
| 281 | + end |
| 282 | + |
| 283 | + it "handles italic with multi-byte characters" do |
| 284 | + expect(described_class.new("This is _äöü_ text.").convert) |
| 285 | + .to eq("This is *äöü* text.") |
| 286 | + end |
| 287 | + |
| 288 | + it "handles strikethrough with multi-byte characters" do |
| 289 | + expect(described_class.new("This is -déléted- text.").convert) |
| 290 | + .to eq("This is ~~déléted~~ text.") |
| 291 | + end |
| 292 | + |
| 293 | + it "handles underline with multi-byte characters" do |
| 294 | + expect(described_class.new("This is +éàü+ text.").convert) |
| 295 | + .to eq("This is <u>éàü</u> text.") |
| 296 | + end |
| 297 | + |
| 298 | + it "handles subscript with multi-byte characters" do |
| 299 | + expect(described_class.new("H~äö~O").convert) |
| 300 | + .to eq("H<sub>äö</sub>O") |
| 301 | + end |
| 302 | + |
| 303 | + it "handles multiple formatted multi-byte segments in one line" do |
| 304 | + expect(described_class.new("*éé* and _öü_").convert) |
| 305 | + .to eq("**éé** and *öü*") |
| 306 | + end |
| 307 | + |
| 308 | + it "handles Arabic inside bold" do |
| 309 | + expect(described_class.new("*مرحبا*").convert).to eq("**مرحبا**") |
| 310 | + end |
| 311 | + |
| 312 | + it "handles Chinese inside bold" do |
| 313 | + expect(described_class.new("*你好*").convert).to eq("**你好**") |
| 314 | + end |
| 315 | + |
| 316 | + it "handles Japanese inside italic" do |
| 317 | + expect(described_class.new("_日本語_").convert).to eq("*日本語*") |
| 318 | + end |
| 319 | + |
| 320 | + it "handles Cyrillic inside bold" do |
| 321 | + expect(described_class.new("*Привет*").convert).to eq("**Привет**") |
| 322 | + end |
| 323 | + |
| 324 | + it "handles Hebrew inside bold" do |
| 325 | + expect(described_class.new("*שלום*").convert).to eq("**שלום**") |
| 326 | + end |
| 327 | + |
| 328 | + it "handles 4-byte emoji inside bold" do |
| 329 | + expect(described_class.new("*🎉*").convert).to eq("**🎉**") |
| 330 | + end |
| 331 | + |
| 332 | + it "handles subscript after a macro preceded by a multi-byte character" do |
| 333 | + # Regression: scanner.string[0...scanner.pos] used a byte position as the end of a |
| 334 | + # character-indexed slice. After a multi-byte prefix ("é", "🎉") this made the parser |
| 335 | + # spuriously "see" the closing ~ in the already-scanned prefix and skip subscript parsing. |
| 336 | + expect(described_class.new("é*x*~2~").convert).to eq("é**x**<sub>2</sub>") |
| 337 | + expect(described_class.new("🎉*x*~2~").convert).to eq("🎉**x**<sub>2</sub>") |
| 338 | + end |
| 339 | + end |
244 | 340 | end |
245 | 341 |
|
246 | 342 | describe "links" do |
|
0 commit comments