Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions machine/corpora/place_markers_usfm_update_block_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import List, TypedDict, cast

from ..translation.word_alignment_matrix import WordAlignmentMatrix
from .segment_boundary_adjuster import SegmentBoundaryAdjuster
from .update_usfm_parser_handler import UpdateUsfmMarkerBehavior
from .usfm_token import UsfmToken, UsfmTokenType
from .usfm_update_block import UsfmUpdateBlock
Expand All @@ -21,6 +22,9 @@ class PlaceMarkersAlignmentInfo(TypedDict):


class PlaceMarkersUsfmUpdateBlockHandler(UsfmUpdateBlockHandler):
def __init__(self, *args):
    # Forward any positional arguments to the UsfmUpdateBlockHandler base class.
    super().__init__(*args)
    # Used in process_block to nudge a predicted paragraph-marker location
    # toward a more natural segment boundary.
    self._segment_boundary_adjuster = SegmentBoundaryAdjuster()

def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
elements = list(block.elements)
Expand Down Expand Up @@ -137,6 +141,12 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
for element, adj_src_tok in zip(to_place, adj_src_toks):
adj_trg_tok = self._predict_marker_location(alignment_info["alignment"], adj_src_tok, src_toks, trg_toks)

# If inserting a paragraph marker, make small adjustments to place it in a more natural location
if element.type == UsfmUpdateBlockElementType.PARAGRAPH:
adj_trg_tok = self._segment_boundary_adjuster.adjust_tokenized_segment_pair_boundaries(
adj_trg_tok, trg_toks
)

Comment on lines +144 to +149
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

if (
adj_trg_tok > 0
and element.type == UsfmUpdateBlockElementType.STYLE
Expand Down
121 changes: 121 additions & 0 deletions machine/corpora/segment_boundary_adjuster.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
from typing import List, Set, Tuple

import regex


# This class is used by SegmentBoundaryAdjuster when it is dealing with tokenized text.
class TokenRejoiner:
    """Incrementally rejoins tokens into detokenized text.

    A single space is inserted between consecutive tokens except where the
    punctuation rules below prohibit it: no space after an opening
    bracket/quote and no space before closing punctuation.
    """

    # Characters that must not be followed by a space (opening brackets/quotes).
    _NO_TRAILING_SPACE_CHARACTERS: Set[str] = {"(", "[", "{", "«", "‹", "“", "‘"}
    # Characters that must not be preceded by a space (closing punctuation).
    _NO_LEADING_SPACE_CHARACTERS: Set[str] = {",", ";", ":", ".", "!", "?", ")", "]", "}", "”", "’", "»", "›"}

    def __init__(self) -> None:
        # Text accumulated so far.
        self._joined_text = ""
        # Number of tokens appended so far; used to suppress the separator
        # before the first token.
        self._num_tokens = 0

    @classmethod
    def join_tokens(cls, tokens: List[str]) -> str:
        """Join ``tokens`` into text with normal inter-token spacing.

        A single trailing space is appended (when the final character allows
        one) to mark the segment boundary. Returns "" for an empty token list.
        """
        rejoiner = cls()
        for token in tokens:
            rejoiner.add_token_to_joined_text(token)
        # Append the boundary space once, after all tokens, rather than after
        # each token inside the loop: add_token_to_joined_text already handles
        # inter-token spacing, so appending per-token would double the spaces
        # and insert spaces before closing punctuation.
        if len(rejoiner._joined_text) > 0 and rejoiner._joined_text[-1] not in cls._NO_TRAILING_SPACE_CHARACTERS:
            rejoiner._joined_text += " "
        return rejoiner._joined_text

    def add_token_to_joined_text(self, token: str) -> str:
        """Append ``token`` to the accumulated text and return the text so far.

        A separating space is inserted unless this is the first token, the
        token is closing punctuation, or the previous character is an opening
        bracket/quote.
        """
        if self._num_tokens > 0:
            if (
                token not in self._NO_LEADING_SPACE_CHARACTERS
                and self._joined_text[-1] not in self._NO_TRAILING_SPACE_CHARACTERS
            ):
                self._joined_text += " "
        self._joined_text += token
        self._num_tokens += 1
        return self._joined_text


class SegmentBoundaryAdjuster:
    """Heuristically moves the boundary between two adjacent text segments
    (e.g. verses) to a more natural location.

    Two kinds of adjustment are applied, in order:
    * Characters that may not start a segment (whitespace, closing
      punctuation) or end a segment (opening brackets/quotes) are shifted
      across the boundary.
    * A sentence start (a capitalized word after punctuation) stranded at the
      end of a segment is moved forward into the next segment, and a sentence
      end (a few lowercase-led words plus closing punctuation) stranded at the
      start of the next segment is moved back.
    """

    # Characters that may not begin a segment: whitespace and closing punctuation.
    _PROHIBITED_VERSE_STARTING_CHARACTERS: Set[str] = {
        " ",
        ",",
        ";",
        ":",
        ".",
        "!",
        "?",
        ")",
        "]",
        "}",
        "”",
        "’",
    }
    # Characters that may not end a segment: opening brackets and quotes.
    _PROHIBITED_VERSE_ENDING_CHARACTERS: Set[str] = {"(", "[", "{", "«", "‹", "“", "‘"}
    # Matches a segment ending with punctuation (group 1) followed by a
    # capitalized word and up to two more words (group 2) — i.e. the start of
    # a new sentence left dangling at the end of the segment.
    # NOTE: requires the third-party `regex` module for \p{Lu}/\p{Ll}.
    _PUNCTUATION_AND_SENTENCE_STARTING_PATTERN = regex.compile(r".*([^\w\s]\s*)(\p{Lu}\w+(\s+\w+)?(\s+\w+)?\s*)$")
    # Matches a segment beginning with a lowercase-led word plus up to two more
    # words (group 1) followed by sentence-ending punctuation (group 4) — i.e.
    # the tail of the previous sentence.
    _WORDS_AND_SENTENCE_ENDING_PATTERN = regex.compile(r"^(\p{Ll}\w+(\s+\w+)?(\s+\w+)?)([\.,;:!\?\)\]”’]\s*[”’]*\s*)")

    def adjust_segment_boundaries(self, verses: List[str]) -> List[str]:
        """Adjust each consecutive pair of segments in ``verses``.

        The list is modified in place and also returned.
        """
        for i in range(len(verses) - 1):
            verses[i], verses[i + 1] = self.adjust_segment_pair_boundary(verses[i], verses[i + 1])
        return verses

    def adjust_segment_pair_boundary(self, segment: str, next_segment: str) -> Tuple[str, str]:
        """Return the adjusted ``(segment, next_segment)`` pair.

        Prohibited boundary characters are shifted first; the sentence-based
        adjustments are then applied on the shifted text.
        """
        # Pull prohibited leading characters of the next segment back into
        # this one (e.g. a space or a stray closing quote).
        while len(next_segment) > 0 and next_segment[0] in self._PROHIBITED_VERSE_STARTING_CHARACTERS:
            segment += next_segment[0]
            next_segment = next_segment[1:]
        # Push prohibited trailing characters of this segment forward
        # (e.g. an opening quote that belongs with the next segment's text).
        while len(segment) > 0 and segment[-1] in self._PROHIBITED_VERSE_ENDING_CHARACTERS:
            next_segment = segment[-1] + next_segment
            segment = segment[:-1]
        if self._segment_ends_with_start_of_sentence(segment):
            segment, next_segment = self._adjust_for_missed_sentence_start(segment, next_segment)
        if self._segment_starts_with_end_of_sentence(next_segment):
            segment, next_segment = self._adjust_for_late_sentence_end(segment, next_segment)
        return segment, next_segment

    def _segment_ends_with_start_of_sentence(self, segment: str) -> bool:
        """True if ``segment`` ends with punctuation followed by a capitalized word."""
        return self._PUNCTUATION_AND_SENTENCE_STARTING_PATTERN.match(segment) is not None

    def _adjust_for_missed_sentence_start(self, segment: str, next_segment: str) -> Tuple[str, str]:
        """Move a dangling sentence start from the end of ``segment`` to the front of ``next_segment``."""
        match = self._PUNCTUATION_AND_SENTENCE_STARTING_PATTERN.match(segment)
        if match is not None:
            capitalized_word = match.group(2)
            # Keep everything through the punctuation (group 1); the
            # capitalized word(s) move forward.
            segment = segment[: match.end(1)]
            # Insert a separating space unless the moved text already ends with one.
            next_segment = capitalized_word + ("" if capitalized_word[-1] == " " else " ") + next_segment
        return segment, next_segment

    def _segment_starts_with_end_of_sentence(self, segment: str) -> bool:
        """True if ``segment`` begins with lowercase-led words followed by closing punctuation."""
        return self._WORDS_AND_SENTENCE_ENDING_PATTERN.match(segment) is not None

    def _adjust_for_late_sentence_end(self, segment: str, next_segment: str) -> Tuple[str, str]:
        """Move a sentence tail from the front of ``next_segment`` onto the end of ``segment``."""
        match = self._WORDS_AND_SENTENCE_ENDING_PATTERN.match(next_segment)
        if match is not None:
            words = match.group(1)
            punctuation = match.group(4)
            segment = segment + words + punctuation
            next_segment = next_segment[match.end(0) :]
        return segment, next_segment

    def adjust_tokenized_segment_pair_boundaries(self, segment_boundary: int, tokens: List[str]) -> int:
        """Adjust a boundary expressed as a token index into ``tokens``.

        The two halves are detokenized, the boundary is adjusted on the text,
        and the adjusted first-segment length is mapped back to the closest
        token index (0..len(tokens)).
        """
        segment_text = TokenRejoiner.join_tokens(tokens[:segment_boundary])
        next_segment_text = TokenRejoiner.join_tokens(tokens[segment_boundary:])
        adjusted_segment_text = self.adjust_segment_pair_boundary(segment_text, next_segment_text)[0].strip()

        return self._find_best_boundary_from_segment_length(tokens, len(adjusted_segment_text))

    def _find_best_boundary_from_segment_length(self, tokens: List[str], target_segment_length: int) -> int:
        """Return the token count whose rejoined length is closest to ``target_segment_length``.

        Returns ``len(tokens)`` if the accumulated text never reaches the target length.
        """
        token_rejoiner = TokenRejoiner()

        for index, token in enumerate(tokens):
            accumulated_length = len(token_rejoiner.add_token_to_joined_text(token))

            if accumulated_length >= target_segment_length:
                # In the unlikely case that the adjusted boundary falls in the middle of a token
                # select the token boundary that is closest
                error_with_current_boundary = accumulated_length - target_segment_length
                error_with_previous_boundary = target_segment_length - (accumulated_length - len(token))

                if error_with_current_boundary < error_with_previous_boundary:
                    return index + 1
                else:
                    return index

        return len(tokens)
37 changes: 37 additions & 0 deletions tests/corpora/test_place_markers_usfm_update_block_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -692,6 +692,43 @@ def test_support_verse_zero():
assert_usfm_equals(target, result)


def test_adjustment_of_placed_paragraph_marker() -> None:
    """End-to-end check that alignment-placed \\p markers are boundary-adjusted.

    PlaceMarkersUsfmUpdateBlockHandler first predicts marker locations from
    the word alignment and then refines paragraph-marker locations with
    SegmentBoundaryAdjuster; the expected USFM reflects the refined positions.
    """
    source = "This is the first paragraph. This text is in English and this test is for paragraph markers."
    pretranslation = (
        "Este es el primer párrafo. Este texto está en inglés, y esta prueba es para marcadores de párrafo."
    )
    align_info = PlaceMarkersAlignmentInfo(
        source_tokens=list(TOKENIZER.tokenize(source)),
        translation_tokens=list(TOKENIZER.tokenize(pretranslation)),
        alignment=to_word_alignment_matrix(
            "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 11-11 11-12 12-13 13-14 14-15 15-16 16-19 17-17 18-20"
        ),
        paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
        style_behavior=UpdateUsfmMarkerBehavior.STRIP,
    )
    # pretranslation is already a str, so no str() conversion is needed.
    rows = [UpdateUsfmRow(scr_ref("MAT 1:1"), pretranslation, {"alignment_info": align_info})]
    usfm = r"""\id MAT
\c 1
\v 1 This is the first paragraph.
\p This text is in English
\p and this test is for paragraph markers.
"""

    target = update_usfm(
        rows,
        usfm,
        paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler()],
    )
    result = r"""\id MAT
\c 1
\v 1 Este es el primer párrafo.
\p Este texto está en inglés,
\p y esta prueba es para marcadores de párrafo.
"""
    assert_usfm_equals(target, result)


def scr_ref(*refs: str) -> List[ScriptureRef]:
    """Parse each reference string (e.g. "MAT 1:1") into a ScriptureRef."""
    return list(map(ScriptureRef.parse, refs))

Expand Down
Loading
Loading