|
| 1 | +/* |
| 2 | +* MIT License |
| 3 | +* |
| 4 | +* Copyright (c) 2024 Firelink Data |
| 5 | +* |
| 6 | +* Permission is hereby granted, free of charge, to any person obtaining a copy |
| 7 | +* of this software and associated documentation files (the "Software"), to deal |
| 8 | +* in the Software without restriction, including without limitation the rights |
| 9 | +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 10 | +* copies of the Software, and to permit persons to whom the Software is |
| 11 | +* furnished to do so, subject to the following conditions: |
| 12 | +* |
| 13 | +* The above copyright notice and this permission notice shall be included in all |
| 14 | +* copies or substantial portions of the Software. |
| 15 | +* |
| 16 | +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 17 | +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 18 | +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 19 | +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 20 | +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 21 | +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 22 | +* SOFTWARE. |
| 23 | +* |
| 24 | +* File created: 2023-12-11 |
| 25 | +* Last updated: 2024-05-08 |
| 26 | +*/ |
| 27 | + |
| 28 | +use std::default::Default; |
| 29 | +use std::slice::Iter; |
| 30 | + |
| 31 | +use crate::error::Result; |
| 32 | + |
/// Locates byte offsets (rune widths and line breaks) in utf-8 encoded byte slices.
///
/// Instances are created through [`Slicer::builder`].
#[derive(Debug)]
pub(crate) struct Slicer {
    /// Thread count requested through [`SlicerBuilder::num_threads`], or 1 when none was given.
    n_threads: usize,
    /// `true` when `n_threads` is greater than 1.
    multithreading: bool,
}

/// Construction entry point and the byte-scanning helpers.
impl Slicer {
    /// Create a [`SlicerBuilder`] with no options set.
    pub fn builder() -> SlicerBuilder {
        Default::default()
    }

    /// Count how many bytes the first `num_runes` runes (utf-8 encoded characters) of
    /// `bytes` occupy.
    ///
    /// Only the lead byte of each rune is inspected. Its high bits encode the width of the
    /// rune: `0xxxxxxx` is 1 byte, `110xxxxx` is 2, `1110xxxx` is 3 and `11110xxx` is 4.
    /// Continuation bytes are skipped without being validated.
    /// See <https://en.wikipedia.org/wiki/UTF-8> for details on the encoding.
    ///
    /// If `bytes` holds fewer than `num_runes` runes, the byte count of the runes that are
    /// present is returned. The result never exceeds `bytes.len()`: a final rune that is cut
    /// off by the end of the slice contributes only the bytes that are actually there, so the
    /// result is always safe to use as an end index into `bytes`.
    ///
    /// # Panics
    /// If a byte found in lead position is not a valid utf-8 lead byte, i.e. it is a
    /// continuation byte (`0x80..=0xbf`) or lies in `0xf8..=0xff`.
    pub fn find_num_bytes_for_num_runes(&self, bytes: &[u8], num_runes: usize) -> usize {
        let mut remaining: Iter<u8> = bytes.iter();

        for _ in 0..num_runes {
            // The iterator always rests on the lead byte of the next rune.
            let lead = match remaining.next() {
                Some(&byte) => byte,
                None => break,
            };

            let width: usize = match lead {
                0x00..=0x7f => 1,
                0xc0..=0xdf => 2,
                0xe0..=0xef => 3,
                0xf0..=0xf7 => 4,
                _ => panic!("Invalid utf-8 sequence!"),
            };

            if width > 1 {
                // `nth(k)` consumes `k + 1` items, so this steps over the `width - 1`
                // continuation bytes. On a truncated rune it simply drains the iterator.
                let _ = remaining.nth(width - 2);
            }
        }

        // Everything the iterator has handed out so far belongs to the counted runes.
        bytes.len() - remaining.len()
    }

    /// Find all line breaks in a slice of bytes representing utf-8 encoded data.
    ///
    /// This variant is compiled on every target except windows and looks for the line-feed
    /// (LF) character `\n` (0x0a). The index of every LF byte is appended to `buffer`;
    /// anything already stored in `buffer` is kept.
    ///
    /// # Panics
    /// Iff the byte slice is empty.
    #[cfg(not(target_os = "windows"))]
    pub fn find_line_breaks(&self, bytes: &[u8], buffer: &mut Vec<usize>) {
        assert!(!bytes.is_empty(), "Byte slice was empty!");

        buffer.extend(
            bytes
                .iter()
                .enumerate()
                .filter_map(|(idx, &byte)| (byte == b'\n').then_some(idx)),
        );
    }

    /// Find all line breaks in a slice of bytes representing utf-8 encoded data.
    ///
    /// This variant is compiled on windows only and looks for the two byte sequence
    /// carriage-return (CR, `\r`, 0x0d) followed by line-feed (LF, `\n`, 0x0a). For every
    /// such pair the index of the LF byte is appended to `buffer`; anything already stored
    /// in `buffer` is kept. A lone LF without a preceding CR is not reported.
    ///
    /// # Panics
    /// Iff the byte slice is empty.
    #[cfg(target_os = "windows")]
    pub fn find_line_breaks(&self, bytes: &[u8], buffer: &mut Vec<usize>) {
        assert!(!bytes.is_empty(), "Byte slice was empty!");

        buffer.extend(
            bytes
                .windows(2)
                .enumerate()
                // The window starting at `idx` covers `idx` (CR) and `idx + 1` (LF).
                .filter_map(|(idx, pair)| matches!(pair, [b'\r', b'\n']).then_some(idx + 1)),
        );
    }
}

/// Builder that collects the optional settings of a [`Slicer`].
#[derive(Debug, Default)]
pub(crate) struct SlicerBuilder {
    /// Requested thread count; `None` until [`SlicerBuilder::num_threads`] is called.
    n_threads: Option<usize>,
}

/// Setters and the final [`SlicerBuilder::build`] step.
impl SlicerBuilder {
    /// Set the number of threads the resulting [`Slicer`] is configured with.
    pub fn num_threads(mut self, n_threads: usize) -> Self {
        self.n_threads = Some(n_threads);
        self
    }

    /// Consume the builder and create the [`Slicer`].
    ///
    /// The thread count defaults to 1 when [`SlicerBuilder::num_threads`] was never called.
    /// Multithreading is flagged as enabled only when the thread count is greater than 1.
    /// A thread count of 0 is stored as given and leaves multithreading disabled.
    ///
    /// # Errors
    /// This implementation has no failure path and always returns `Ok`.
    pub fn build(self) -> Result<Slicer> {
        let n_threads = self.n_threads.unwrap_or(1);

        Ok(Slicer {
            n_threads,
            multithreading: n_threads > 1,
        })
    }
}
0 commit comments