diff --git a/src/ustr.rs b/src/ustr.rs index eca3e13..725ffaa 100644 --- a/src/ustr.rs +++ b/src/ustr.rs @@ -2,6 +2,7 @@ //! //! This module contains wide string slices and related types. +use crate::utfstr::Lines; #[cfg(feature = "alloc")] use crate::{ error::{Utf16Error, Utf32Error}, @@ -910,6 +911,59 @@ impl U16Str { pub fn char_indices_lossy(&self) -> CharIndicesLossyUtf16<'_> { CharIndicesLossyUtf16::new(self.as_slice()) } + + /// Returns an iterator over the lines of a [`U16Str`], as string slices. + /// + /// Lines are split at line endings that are either newlines (`\n`) or + /// sequences of a carriage return followed by a line feed (`\r\n`), as found by scanning [`char_indices_lossy`][Self::char_indices_lossy]. + /// + /// Line terminators are not included in the lines returned by the iterator. + /// + /// Note that any carriage return (`\r`) not immediately followed by a + /// line feed (`\n`) does not split a line. These carriage returns are + /// thereby included in the produced lines. + /// + /// The final line ending is optional. A string that ends with a final line + /// ending will return the same lines as an otherwise identical string + /// without a final line ending. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use widestring::u16str; + /// + /// let text = u16str!("foo\r\nbar\n\nbaz\r"); + /// let mut lines = text.lines_lossy(); + /// + /// assert_eq!(Some(u16str!("foo")), lines.next()); + /// assert_eq!(Some(u16str!("bar")), lines.next()); + /// assert_eq!(Some(u16str!("")), lines.next()); + /// // A trailing carriage return with no line feed after it stays in the last line + /// assert_eq!(Some(u16str!("baz\r")), lines.next()); + /// + /// assert_eq!(None, lines.next()); + /// ``` + /// + /// The final line does not require any ending: + /// + /// ``` + /// use widestring::u16str; + /// + /// let text = u16str!("foo\nbar\n\r\nbaz"); + /// let mut lines = text.lines_lossy(); + /// + /// assert_eq!(Some(u16str!("foo")), lines.next()); + /// assert_eq!(Some(u16str!("bar")), lines.next()); + /// assert_eq!(Some(u16str!("")), lines.next()); + /// assert_eq!(Some(u16str!("baz")), lines.next()); + /// + /// assert_eq!(None, lines.next()); + /// ``` + pub fn lines_lossy(&self) -> Lines<'_, Self, CharIndicesLossyUtf16<'_>> { + Lines::new(self, self.len(), self.char_indices_lossy()) + } } impl U32Str { @@ -1155,6 +1209,59 @@ impl U32Str { pub fn char_indices_lossy(&self) -> CharIndicesLossyUtf32<'_> { CharIndicesLossyUtf32::new(self.as_slice()) } + + /// Returns an iterator over the lines of a [`U32Str`], as string slices. + /// + /// Lines are split at line endings that are either newlines (`\n`) or + /// sequences of a carriage return followed by a line feed (`\r\n`), as found by scanning [`char_indices_lossy`][Self::char_indices_lossy]. + /// + /// Line terminators are not included in the lines returned by the iterator. + /// + /// Note that any carriage return (`\r`) not immediately followed by a + /// line feed (`\n`) does not split a line. These carriage returns are + /// thereby included in the produced lines. + /// + /// The final line ending is optional. 
A string that ends with a final line + /// ending will return the same lines as an otherwise identical string + /// without a final line ending. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use widestring::u32str; + /// + /// let text = u32str!("foo\r\nbar\n\nbaz\r"); + /// let mut lines = text.lines_lossy(); + /// + /// assert_eq!(Some(u32str!("foo")), lines.next()); + /// assert_eq!(Some(u32str!("bar")), lines.next()); + /// assert_eq!(Some(u32str!("")), lines.next()); + /// // A trailing carriage return with no line feed after it stays in the last line + /// assert_eq!(Some(u32str!("baz\r")), lines.next()); + /// + /// assert_eq!(None, lines.next()); + /// ``` + /// + /// The final line does not require any ending: + /// + /// ``` + /// use widestring::u32str; + /// + /// let text = u32str!("foo\nbar\n\r\nbaz"); + /// let mut lines = text.lines_lossy(); + /// + /// assert_eq!(Some(u32str!("foo")), lines.next()); + /// assert_eq!(Some(u32str!("bar")), lines.next()); + /// assert_eq!(Some(u32str!("")), lines.next()); + /// assert_eq!(Some(u32str!("baz")), lines.next()); + /// + /// assert_eq!(None, lines.next()); + /// ``` + pub fn lines_lossy(&self) -> Lines<'_, Self, CharIndicesLossyUtf32<'_>> { + Lines::new(self, self.len(), self.char_indices_lossy()) + } } impl core::fmt::Debug for U16Str { diff --git a/src/utfstr.rs b/src/utfstr.rs index f18b03b..d33bbcb 100644 --- a/src/utfstr.rs +++ b/src/utfstr.rs @@ -1422,6 +1422,59 @@ impl Utf16Str { } s } + + /// Returns an iterator over the lines of a [`Utf16Str`], as string slices. + /// + /// Lines are split at line endings that are either newlines (`\n`) or + /// sequences of a carriage return followed by a line feed (`\r\n`). + /// + /// Line terminators are not included in the lines returned by the iterator. + /// + /// Note that any carriage return (`\r`) not immediately followed by a + /// line feed (`\n`) does not split a line. These carriage returns are + /// thereby included in the produced lines. 
+ /// + /// The final line ending is optional. A string that ends with a final line + /// ending will return the same lines as an otherwise identical string + /// without a final line ending. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use widestring::utf16str; + /// + /// let text = utf16str!("foo\r\nbar\n\nbaz\r"); + /// let mut lines = text.lines(); + /// + /// assert_eq!(Some(utf16str!("foo")), lines.next()); + /// assert_eq!(Some(utf16str!("bar")), lines.next()); + /// assert_eq!(Some(utf16str!("")), lines.next()); + /// // A trailing carriage return with no line feed after it stays in the last line + /// assert_eq!(Some(utf16str!("baz\r")), lines.next()); + /// + /// assert_eq!(None, lines.next()); + /// ``` + /// + /// The final line does not require any ending: + /// + /// ``` + /// use widestring::utf16str; + /// + /// let text = utf16str!("foo\nbar\n\r\nbaz"); + /// let mut lines = text.lines(); + /// + /// assert_eq!(Some(utf16str!("foo")), lines.next()); + /// assert_eq!(Some(utf16str!("bar")), lines.next()); + /// assert_eq!(Some(utf16str!("")), lines.next()); + /// assert_eq!(Some(utf16str!("baz")), lines.next()); + /// + /// assert_eq!(None, lines.next()); + /// ``` + pub fn lines(&self) -> Lines<'_, Utf16Str, CharIndicesUtf16<'_>> { + Lines::new(self, self.len(), self.char_indices()) + } } impl Utf32Str { @@ -2027,6 +2080,59 @@ impl Utf32Str { } s } + + /// Returns an iterator over the lines of a [`Utf32Str`], as string slices. + /// + /// Lines are split at line endings that are either newlines (`\n`) or + /// sequences of a carriage return followed by a line feed (`\r\n`). + /// + /// Line terminators are not included in the lines returned by the iterator. + /// + /// Note that any carriage return (`\r`) not immediately followed by a + /// line feed (`\n`) does not split a line. These carriage returns are + /// thereby included in the produced lines. + /// + /// The final line ending is optional. 
A string that ends with a final line + /// ending will return the same lines as an otherwise identical string + /// without a final line ending. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use widestring::utf32str; + /// + /// let text = utf32str!("foo\r\nbar\n\nbaz\r"); + /// let mut lines = text.lines(); + /// + /// assert_eq!(Some(utf32str!("foo")), lines.next()); + /// assert_eq!(Some(utf32str!("bar")), lines.next()); + /// assert_eq!(Some(utf32str!("")), lines.next()); + /// // A trailing carriage return with no line feed after it stays in the last line + /// assert_eq!(Some(utf32str!("baz\r")), lines.next()); + /// + /// assert_eq!(None, lines.next()); + /// ``` + /// + /// The final line does not require any ending: + /// + /// ``` + /// use widestring::utf32str; + /// + /// let text = utf32str!("foo\nbar\n\r\nbaz"); + /// let mut lines = text.lines(); + /// + /// assert_eq!(Some(utf32str!("foo")), lines.next()); + /// assert_eq!(Some(utf32str!("bar")), lines.next()); + /// assert_eq!(Some(utf32str!("")), lines.next()); + /// assert_eq!(Some(utf32str!("baz")), lines.next()); + /// + /// assert_eq!(None, lines.next()); + /// ``` + pub fn lines(&self) -> Lines<'_, Utf32Str, CharIndicesUtf32<'_>> { + Lines::new(self, self.len(), self.char_indices()) + } } impl AsMut<[char]> for Utf32Str { diff --git a/src/utfstr/iter.rs b/src/utfstr/iter.rs index 128f737..7fcaa38 100644 --- a/src/utfstr/iter.rs +++ b/src/utfstr/iter.rs @@ -2,6 +2,9 @@ use crate::{ debug_fmt_char_iter, decode_utf16, decode_utf32, iter::{DecodeUtf16, DecodeUtf32}, }; +use core::{ + borrow::Borrow, iter::Peekable, marker::PhantomData, ops::{Index, Range}, usize +}; #[allow(unused_imports)] use core::{ fmt::Write, @@ -409,3 +412,94 @@ impl<'a> ExactSizeIterator for CodeUnits<'a> { self.iter.len() } } + +/// An iterator over the lines of a [`Utf16Str`], [`Utf32Str`], or other wide string +/// that has a `char_indices` method. Returns string slices. 
///
/// This struct is created with one of:
/// 1. The [`lines`][crate::Utf16Str::lines] method on [`Utf16Str`]
/// 2. The [`lines`][crate::Utf32Str::lines] method on [`Utf32Str`]
/// 3. The `lines_lossy` methods on `U16Str` and `U32Str`
///
/// See their documentation for more.
///
/// `Str` is the string slice type being split and `CharIndices` supplies its
/// `(offset, char)` pairs. Every yielded line is obtained by indexing the original slice
/// with an offset range, so the offsets must be valid indices into `Str`.
#[derive(Debug)]
pub struct Lines<'a, Str, CharIndices>
where
    Str: Borrow<Str> + Index<Range<usize>, Output = Str> + ?Sized,
    CharIndices: IntoIterator<Item = (usize, char)>,
{
    /// The string being split; every yielded line borrows from it.
    str: &'a Str,
    /// One past the last valid offset of `str`; used as the end of the final line.
    str_len: usize,
    /// Remaining `(offset, char)` pairs. Peekable so the end of a character can be
    /// read off the offset of the character that follows it.
    char_indices: Peekable<CharIndices::IntoIter>,
}

// Implemented by hand because `#[derive(Clone)]` would demand `Str: Clone`, which unsized
// string slices can never satisfy. Only the wrapped iterator needs to be cloned; the
// string itself is held by shared reference.
impl<'a, Str, CharIndices> Clone for Lines<'a, Str, CharIndices>
where
    Str: Borrow<Str> + Index<Range<usize>, Output = Str> + ?Sized,
    CharIndices: IntoIterator<Item = (usize, char)>,
    CharIndices::IntoIter: Clone,
{
    fn clone(&self) -> Self {
        Self {
            str: self.str,
            str_len: self.str_len,
            char_indices: self.char_indices.clone(),
        }
    }
}

impl<'a, Str, CharIndices> Lines<'a, Str, CharIndices>
where
    Str: Borrow<Str> + Index<Range<usize>, Output = Str> + ?Sized,
    CharIndices: IntoIterator<Item = (usize, char)>,
{
    /// Creates a line iterator over `str`.
    ///
    /// `str_len` must be the length of `str` in the same unit as the offsets produced by
    /// `char_indices`, and `char_indices` must yield the characters of `str` in order.
    pub(crate) fn new(str: &'a Str, str_len: usize, char_indices: CharIndices) -> Self {
        Self {
            str,
            str_len,
            char_indices: char_indices.into_iter().peekable(),
        }
    }
}

impl<'a, Str, CharIndices> Iterator for Lines<'a, Str, CharIndices>
where
    Str: Borrow<Str> + Index<Range<usize>, Output = Str> + ?Sized,
    CharIndices: IntoIterator<Item = (usize, char)>,
{
    type Item = &'a Str;

    /// Returns the next line without its `\n` or `\r\n` terminator, or `None` once the
    /// input is exhausted.
    ///
    /// # Panics
    ///
    /// May panic inside `Str`'s `Index` implementation if the offsets supplied to
    /// [`Lines::new`] are not valid for the string.
    fn next(&mut self) -> Option<Self::Item> {
        let (line_start, first) = self.char_indices.next()?;
        let mut current = (line_start, first);
        // Exclusive end of the line's content seen so far.
        let mut line_end = line_start;

        loop {
            let (offset, ch) = current;
            match ch {
                // The terminator itself is never part of the line.
                '\n' => break,
                // Tentatively leave the `\r` out: if a `\n` follows, the pair is one
                // terminator. Any other following character (or the end of input)
                // moves `line_end` past it again.
                '\r' => line_end = offset,
                // An ordinary character ends where the next one starts.
                _ => {
                    line_end = self
                        .char_indices
                        .peek()
                        .map_or(self.str_len, |&(next_offset, _)| next_offset);
                }
            }

            match self.char_indices.next() {
                Some(next) => current = next,
                None => {
                    // Input ended without a `\n`, so everything up to the end of the
                    // string, including a trailing `\r`, belongs to this line.
                    line_end = self.str_len;
                    break;
                }
            }
        }

        Some(&self.str[line_start..line_end])
    }
}

// `Lines` only reports `None` when the wrapped iterator does, and `Peekable` is fused
// exactly when the iterator it wraps is fused, so the guarantee is forwarded from there.
// `?Sized` is required for this impl to cover the unsized string slice types.
impl<'a, Str, CharIndices> FusedIterator for Lines<'a, Str, CharIndices>
where
    Str: Borrow<Str> + Index<Range<usize>, Output = Str> + ?Sized,
    CharIndices: IntoIterator<Item = (usize, char)>,
    CharIndices::IntoIter: FusedIterator,
{
}