|
25 | 25 | * Last updated: 2023-11-21 |
26 | 26 | */ |
27 | 27 |
|
28 | | -use crate::slicers::FnFindLastLineBreak; |
29 | | -use arrow::array::ArrayRef; |
| 28 | +use arrow_array::array::ArrayRef; |
30 | 29 | use parquet::format; |
31 | 30 | use std::cmp::min; |
| 31 | +use std::fs; |
| 32 | +use crate::chunked::chunked_slicer::SLICER_IN_CHUNK_SIZE; |
| 33 | + |
32 | 34 |
|
33 | 35 | // pub(crate) mod arrow2_converter; |
34 | 36 | pub(crate) mod self_converter; |
35 | 37 |
|
36 | 38 | pub mod arrow_converter; |
| 39 | +pub mod chunked_slicer; |
37 | 40 |
|
38 | 41 | pub(crate) trait Converter<'a> { |
39 | 42 | fn set_line_break_handler(&mut self, fn_line_break: FnFindLastLineBreak<'a>); |
@@ -161,3 +164,109 @@ mod tests { |
161 | 164 | assert_eq!(column_length_num_rightaligned(b"123", 3), (0, 2)); |
162 | 165 | } |
163 | 166 | } |
| 167 | + |
| 168 | + |
| 169 | +pub(crate) struct ChunkAndResidue { |
| 170 | + pub(crate) chunk: Box<[u8; SLICER_IN_CHUNK_SIZE]>, |
| 171 | +} |
| 172 | +pub(crate) trait Slicer<'a> { |
| 173 | + fn slice_and_convert( |
| 174 | + &mut self, |
| 175 | + converter: Box<dyn 'a + Converter<'a>>, |
| 176 | + infile: fs::File, |
| 177 | + n_threads: usize, |
| 178 | + ) -> Result<crate::chunked::Stats, &str>; |
| 179 | +} |
| 180 | +pub(crate) struct Stats { |
| 181 | + pub(crate) bytes_in: usize, |
| 182 | + pub(crate) bytes_out: usize, |
| 183 | + |
| 184 | + pub(crate) num_rows: i64, |
| 185 | +} |
| 186 | + |
| 187 | +pub(crate) type FnLineBreakLen = fn() -> usize; |
| 188 | +#[allow(dead_code)] |
| 189 | +pub(crate) fn line_break_len_cr() -> usize { |
| 190 | + 1 as usize |
| 191 | +} |
| 192 | +#[allow(dead_code)] |
| 193 | +pub(crate) fn line_break_len_crlf() -> usize { |
| 194 | + 2 as usize |
| 195 | +} |
| 196 | + |
| 197 | +pub(crate) type FnFindLastLineBreak<'a> = fn(bytes: &'a [u8]) -> (bool, usize); |
| 198 | +#[allow(dead_code)] |
| 199 | +pub(crate) fn find_last_nlcr(bytes: &[u8]) -> (bool, usize) { |
| 200 | + if bytes.is_empty() { |
| 201 | + return (false, 0); // TODO should report err ... |
| 202 | + } |
| 203 | + |
| 204 | + let mut p2 = bytes.len() - 1; |
| 205 | + |
| 206 | + if 0 == p2 { |
| 207 | + return (false, 0); // hmm |
| 208 | + } |
| 209 | + |
| 210 | + loop { |
| 211 | + if bytes[p2 - 1] == 0x0d && bytes[p2] == 0x0a { |
| 212 | + return (true, p2 + 1); |
| 213 | + } |
| 214 | + if 0 == p2 { |
| 215 | + return (false, 0); // indicate we didnt find nl |
| 216 | + } |
| 217 | + |
| 218 | + p2 -= 1; |
| 219 | + } |
| 220 | +} |
| 221 | + |
| 222 | +#[allow(dead_code)] |
| 223 | +pub(crate) fn find_last_nl(bytes: &[u8]) -> (bool, usize) { |
| 224 | + if bytes.is_empty() { |
| 225 | + return (false, 0); // Indicate we didnt found nl. |
| 226 | + } |
| 227 | + |
| 228 | + let mut p2 = bytes.len() - 1; |
| 229 | + |
| 230 | + if 0 == p2 { |
| 231 | + return (false, 0); // hmm |
| 232 | + } |
| 233 | + |
| 234 | + loop { |
| 235 | + if bytes[p2] == 0x0a { |
| 236 | + return (true, p2); |
| 237 | + } |
| 238 | + if 0 == p2 { |
| 239 | + return (false, 0); // indicate we didnt find nl |
| 240 | + } |
| 241 | + p2 -= 1; |
| 242 | + } |
| 243 | +} |
| 244 | +pub(crate) struct IterRevolver<'a, T> { |
| 245 | + shards: *mut T, |
| 246 | + next: usize, |
| 247 | + len: usize, |
| 248 | + phantom: std::marker::PhantomData<&'a mut [T]>, |
| 249 | +} |
| 250 | + |
| 251 | +impl<'a, T> From<&'a mut [T]> for crate::chunked::IterRevolver<'a, T> { |
| 252 | + fn from(shards: &'a mut [T]) -> crate::chunked::IterRevolver<'a, T> { |
| 253 | + IterRevolver { |
| 254 | + next: 0, |
| 255 | + len: shards.len(), |
| 256 | + shards: shards.as_mut_ptr(), |
| 257 | + phantom: std::marker::PhantomData, |
| 258 | + } |
| 259 | + } |
| 260 | +} |
| 261 | + |
| 262 | +impl<'a, T> Iterator for IterRevolver<'a, T> { |
| 263 | + type Item = &'a mut T; |
| 264 | + fn next(&mut self) -> Option<Self::Item> { |
| 265 | + if self.next < self.len { |
| 266 | + self.next += 1; |
| 267 | + } else { |
| 268 | + self.next = 1; |
| 269 | + } |
| 270 | + unsafe { Some(&mut *self.shards.offset(self.next as isize - 1)) } |
| 271 | + } |
| 272 | +} |
0 commit comments