Merge pull request #22900 from opf/bug/73736-jira-migrator-invalid-byte-sequence-in-UTF-8

as-op · web-flow · commit 403820bab473 · 2026-04-23T17:27:11.000+02:00
[73736] Jira migrator: invalid byte sequence in UTF-8
diff --git a/app/services/import/jira_wiki_markup/parser.rb b/app/services/import/jira_wiki_markup/parser.rb
@@ -46,10 +46,16 @@ class Parser
       )
 
       def initialize(text)
-        @text = text.dup
+        # Normalize any input into a safe, mutable string: nil becomes "", and
+        # invalid byte sequences are replaced with "?" so downstream
+        # regex/StringScanner operations cannot raise ArgumentError on malformed input.
+        @text = text.to_s.dup
+        @text.scrub!("?") unless @text.valid_encoding?
       end
 
       def parse
+        return N::Document.new(children: []) if @text.blank?
+
         preprocess
         blocks = parse_blocks
         N::Document.new(children: blocks)
@@ -515,7 +521,9 @@ def extract_delimited(scanner, delimiter)
         return if inner[-1] == " "
         return if followed_by_word?(rest, close_idx)
 
-        scanner.pos += 1 + close_idx + 1
+        # StringScanner#pos is byte-based; advance by byte length, not char count,
+        # so multi-byte UTF-8 content inside the delimiters does not land pos mid-character.
+        scanner.pos += inner.bytesize + 2
         inner
       end
 
@@ -527,7 +535,7 @@ def followed_by_word?(text, close_idx)
       def scan_subscript(scanner, buffer, nodes)
         return unless scanner.rest[0] == "~"
 
-        scanned = scanner.string[0...scanner.pos]
+        scanned = scanner.string.byteslice(0, scanner.pos)
         return if (scanned + buffer).end_with?("~")
 
         inner = extract_subscript_content(scanner)
@@ -549,7 +557,8 @@ def extract_subscript_content(scanner)
         after = rest[(close_idx + 1)..]
         return if after.present? && after[0] == "~"
 
-        scanner.pos += 1 + close_idx + 1
+        # StringScanner#pos is byte-based; advance by byte length to stay on a char boundary.
+        scanner.pos += inner.bytesize + 2
         inner
       end
 
diff --git a/app/services/import/jira_wiki_markup_converter.rb b/app/services/import/jira_wiki_markup_converter.rb
@@ -35,8 +35,6 @@ def initialize(text)
     end
 
     def convert
-      return "" if @text.blank?
-
       ast = JiraWikiMarkup::Parser.new(@text).parse
       JiraWikiMarkup::Renderer.new(ast).render
     end
diff --git a/app/workers/import/jira_fetch_and_import_projects_job.rb b/app/workers/import/jira_fetch_and_import_projects_job.rb
@@ -102,8 +102,6 @@ def collect_changelog_user_keys(user_keys, issue)
     end
 
     def collect_markup_mentions(text, mention_usernames)
-      return if text.blank?
-
       ast = JiraWikiMarkup::Parser.new(text).parse
       collect_mentions_from_node(ast, mention_usernames)
     end
diff --git a/spec/services/import/jira_wiki_markup_converter_spec.rb b/spec/services/import/jira_wiki_markup_converter_spec.rb
@@ -51,6 +51,38 @@
 
       it { is_expected.to eq("This is not {code} and not [a link]") }
     end
+
+    context "with invalid UTF-8 byte sequences in the input" do
+      it "drops a stray invalid byte and keeps the surrounding text" do
+        input = "Hello \xFF world".dup
+        expect(input.valid_encoding?).to be(false)
+        expect(described_class.new(input).convert).to eq("Hello ? world")
+      end
+
+      it "drops a stray continuation byte" do
+        input = "abc \x80 def".dup
+        expect(input.valid_encoding?).to be(false)
+        expect(described_class.new(input).convert).to eq("abc ? def")
+      end
+
+      it "drops a truncated multi-byte sequence" do
+        input = "pre \xC3 post".dup
+        expect(input.valid_encoding?).to be(false)
+        expect(described_class.new(input).convert).to eq("pre ? post")
+      end
+
+      it "preserves valid multi-byte characters while dropping only the invalid byte" do
+        input = "héllo \xFF world".dup
+        expect(input.valid_encoding?).to be(false)
+        expect(described_class.new(input).convert).to eq("héllo ? world")
+      end
+
+      it "still parses formatting around invalid bytes inside delimiters" do
+        input = "*bold\xFFtext*".dup
+        expect(input.valid_encoding?).to be(false)
+        expect(described_class.new(input).convert).to eq("**bold?text**")
+      end
+    end
   end
 
   describe "line ending normalization" do
@@ -241,6 +273,70 @@
 
       it { is_expected.to eq("H<sub>2</sub>O") }
     end
+
+    context "with multi-byte UTF-8 characters inside formatting delimiters" do
+      it "handles bold with multi-byte characters" do
+        expect(described_class.new("This is *héllo* text.").convert)
+          .to eq("This is **héllo** text.")
+      end
+
+      it "handles italic with multi-byte characters" do
+        expect(described_class.new("This is _äöü_ text.").convert)
+          .to eq("This is *äöü* text.")
+      end
+
+      it "handles strikethrough with multi-byte characters" do
+        expect(described_class.new("This is -déléted- text.").convert)
+          .to eq("This is ~~déléted~~ text.")
+      end
+
+      it "handles underline with multi-byte characters" do
+        expect(described_class.new("This is +éàü+ text.").convert)
+          .to eq("This is <u>éàü</u> text.")
+      end
+
+      it "handles subscript with multi-byte characters" do
+        expect(described_class.new("H~äö~O").convert)
+          .to eq("H<sub>äö</sub>O")
+      end
+
+      it "handles multiple formatted multi-byte segments in one line" do
+        expect(described_class.new("*éé* and _öü_").convert)
+          .to eq("**éé** and *öü*")
+      end
+
+      it "handles Arabic inside bold" do
+        expect(described_class.new("*مرحبا*").convert).to eq("**مرحبا**")
+      end
+
+      it "handles Chinese inside bold" do
+        expect(described_class.new("*你好*").convert).to eq("**你好**")
+      end
+
+      it "handles Japanese inside italic" do
+        expect(described_class.new("_日本語_").convert).to eq("*日本語*")
+      end
+
+      it "handles Cyrillic inside bold" do
+        expect(described_class.new("*Привет*").convert).to eq("**Привет**")
+      end
+
+      it "handles Hebrew inside bold" do
+        expect(described_class.new("*שלום*").convert).to eq("**שלום**")
+      end
+
+      it "handles 4-byte emoji inside bold" do
+        expect(described_class.new("*🎉*").convert).to eq("**🎉**")
+      end
+
+      it "handles subscript after a macro preceded by a multi-byte character" do
+        # Regression: scanner.string[0...scanner.pos] mixed byte-pos with char-slicing,
+        # so after a multi-byte character the "already-scanned" prefix overran into
+        # the subscript's own "~" (e.g. in "é*x*~2~") and subscript parsing was skipped.
+        expect(described_class.new("é*x*~2~").convert).to eq("é**x**<sub>2</sub>")
+        expect(described_class.new("🎉*x*~2~").convert).to eq("🎉**x**<sub>2</sub>")
+      end
+    end
   end
 
   describe "links" do