Skip to content

Commit 403820b

Browse files
authored
Merge pull request #22900 from opf/bug/73736-jira-migrator-invalid-byte-sequence-in-UTF-8
[73736] Jira migrator: invalid byte sequence in UTF-8
2 parents 93a32ce + 29a4969 commit 403820b

4 files changed

Lines changed: 109 additions & 8 deletions

File tree

app/services/import/jira_wiki_markup/parser.rb

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,16 @@ class Parser
4646
)
4747

4848
def initialize(text)
49-
@text = text.dup
49+
# Normalize any input into a safe, mutable UTF-8 string: nil becomes "",
50+
# and invalid byte sequences are replaced with "?" so downstream regex/StringScanner
51+
# operations cannot raise ArgumentError on malformed input.
52+
@text = text.to_s.dup
53+
@text.scrub!("?") unless @text.valid_encoding?
5054
end
5155

5256
def parse
57+
return N::Document.new(children: []) if @text.blank?
58+
5359
preprocess
5460
blocks = parse_blocks
5561
N::Document.new(children: blocks)
@@ -515,7 +521,9 @@ def extract_delimited(scanner, delimiter)
515521
return if inner[-1] == " "
516522
return if followed_by_word?(rest, close_idx)
517523

518-
scanner.pos += 1 + close_idx + 1
524+
# StringScanner#pos is byte-based; advance by byte length, not char count,
525+
# so multi-byte UTF-8 content inside the delimiters does not land pos mid-character.
526+
scanner.pos += inner.bytesize + 2
519527
inner
520528
end
521529

@@ -527,7 +535,7 @@ def followed_by_word?(text, close_idx)
527535
def scan_subscript(scanner, buffer, nodes)
528536
return unless scanner.rest[0] == "~"
529537

530-
scanned = scanner.string[0...scanner.pos]
538+
scanned = scanner.string.byteslice(0, scanner.pos)
531539
return if (scanned + buffer).end_with?("~")
532540

533541
inner = extract_subscript_content(scanner)
@@ -549,7 +557,8 @@ def extract_subscript_content(scanner)
549557
after = rest[(close_idx + 1)..]
550558
return if after.present? && after[0] == "~"
551559

552-
scanner.pos += 1 + close_idx + 1
560+
# StringScanner#pos is byte-based; advance by byte length to stay on a char boundary.
561+
scanner.pos += inner.bytesize + 2
553562
inner
554563
end
555564

app/services/import/jira_wiki_markup_converter.rb

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,6 @@ def initialize(text)
3535
end
3636

3737
def convert
38-
return "" if @text.blank?
39-
4038
ast = JiraWikiMarkup::Parser.new(@text).parse
4139
JiraWikiMarkup::Renderer.new(ast).render
4240
end

app/workers/import/jira_fetch_and_import_projects_job.rb

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,8 +102,6 @@ def collect_changelog_user_keys(user_keys, issue)
102102
end
103103

104104
def collect_markup_mentions(text, mention_usernames)
105-
return if text.blank?
106-
107105
ast = JiraWikiMarkup::Parser.new(text).parse
108106
collect_mentions_from_node(ast, mention_usernames)
109107
end

spec/services/import/jira_wiki_markup_converter_spec.rb

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,38 @@
5151

5252
it { is_expected.to eq("This is not {code} and not [a link]") }
5353
end
54+
55+
context "with invalid UTF-8 byte sequences in the input" do
56+
it "drops a stray invalid byte and keeps the surrounding text" do
57+
input = "Hello \xFF world".dup
58+
expect(input.valid_encoding?).to be(false)
59+
expect(described_class.new(input).convert).to eq("Hello ? world")
60+
end
61+
62+
it "drops a stray continuation byte" do
63+
input = "abc \x80 def".dup
64+
expect(input.valid_encoding?).to be(false)
65+
expect(described_class.new(input).convert).to eq("abc ? def")
66+
end
67+
68+
it "drops a truncated multi-byte sequence" do
69+
input = "pre \xC3 post".dup
70+
expect(input.valid_encoding?).to be(false)
71+
expect(described_class.new(input).convert).to eq("pre ? post")
72+
end
73+
74+
it "preserves valid multi-byte characters while dropping only the invalid byte" do
75+
input = "héllo \xFF world".dup
76+
expect(input.valid_encoding?).to be(false)
77+
expect(described_class.new(input).convert).to eq("héllo ? world")
78+
end
79+
80+
it "still parses formatting around invalid bytes inside delimiters" do
81+
input = "*bold\xFFtext*".dup
82+
expect(input.valid_encoding?).to be(false)
83+
expect(described_class.new(input).convert).to eq("**bold?text**")
84+
end
85+
end
5486
end
5587

5688
describe "line ending normalization" do
@@ -241,6 +273,70 @@
241273

242274
it { is_expected.to eq("H<sub>2</sub>O") }
243275
end
276+
277+
context "with multi-byte UTF-8 characters inside formatting delimiters" do
278+
it "handles bold with multi-byte characters" do
279+
expect(described_class.new("This is *héllo* text.").convert)
280+
.to eq("This is **héllo** text.")
281+
end
282+
283+
it "handles italic with multi-byte characters" do
284+
expect(described_class.new("This is _äöü_ text.").convert)
285+
.to eq("This is *äöü* text.")
286+
end
287+
288+
it "handles strikethrough with multi-byte characters" do
289+
expect(described_class.new("This is -déléted- text.").convert)
290+
.to eq("This is ~~déléted~~ text.")
291+
end
292+
293+
it "handles underline with multi-byte characters" do
294+
expect(described_class.new("This is +éàü+ text.").convert)
295+
.to eq("This is <u>éàü</u> text.")
296+
end
297+
298+
it "handles subscript with multi-byte characters" do
299+
expect(described_class.new("H~äö~O").convert)
300+
.to eq("H<sub>äö</sub>O")
301+
end
302+
303+
it "handles multiple formatted multi-byte segments in one line" do
304+
expect(described_class.new("*éé* and _öü_").convert)
305+
.to eq("**éé** and *öü*")
306+
end
307+
308+
it "handles Arabic inside bold" do
309+
expect(described_class.new("*مرحبا*").convert).to eq("**مرحبا**")
310+
end
311+
312+
it "handles Chinese inside bold" do
313+
expect(described_class.new("*你好*").convert).to eq("**你好**")
314+
end
315+
316+
it "handles Japanese inside italic" do
317+
expect(described_class.new("_日本語_").convert).to eq("*日本語*")
318+
end
319+
320+
it "handles Cyrillic inside bold" do
321+
expect(described_class.new("*Привет*").convert).to eq("**Привет**")
322+
end
323+
324+
it "handles Hebrew inside bold" do
325+
expect(described_class.new("*שלום*").convert).to eq("**שלום**")
326+
end
327+
328+
it "handles 4-byte emoji inside bold" do
329+
expect(described_class.new("*🎉*").convert).to eq("**🎉**")
330+
end
331+
332+
it "handles subscript after a macro preceded by a multi-byte character" do
333+
# Regression: scanner.string[0...scanner.pos] mixed byte-pos with char-slicing,
334+
# causing "é*x*~2~" to spuriously "see" a not-yet-consumed ~ in the already-scanned
335+
# prefix and skip subscript parsing.
336+
expect(described_class.new("é*x*~2~").convert).to eq("é**x**<sub>2</sub>")
337+
expect(described_class.new("🎉*x*~2~").convert).to eq("🎉**x**<sub>2</sub>")
338+
end
339+
end
244340
end
245341

246342
describe "links" do

0 commit comments

Comments
 (0)