widestring/
iter.rs

1//! Iterators for encoding and decoding slices of string data.
2
3use crate::{
4    decode_utf16_surrogate_pair,
5    error::{DecodeUtf16Error, DecodeUtf32Error},
6    is_utf16_high_surrogate, is_utf16_low_surrogate, is_utf16_surrogate,
7};
8#[allow(unused_imports)]
9use core::{
10    char,
11    iter::{DoubleEndedIterator, ExactSizeIterator, FusedIterator},
12};
13
14/// An iterator that decodes UTF-16 encoded code points from an iterator of [`u16`]s.
15///
16/// This struct is created by [`decode_utf16`][crate::decode_utf16]. See its documentation for more.
17///
18/// This struct is identical to [`char::DecodeUtf16`] except it is a [`DoubleEndedIterator`] if
19/// `I` is.
20#[derive(Debug, Clone)]
21pub struct DecodeUtf16<I>
22where
23    I: Iterator<Item = u16>,
24{
25    iter: I,
26    forward_buf: Option<u16>,
27    back_buf: Option<u16>,
28}
29
30impl<I> DecodeUtf16<I>
31where
32    I: Iterator<Item = u16>,
33{
34    pub(crate) fn new(iter: I) -> Self {
35        Self {
36            iter,
37            forward_buf: None,
38            back_buf: None,
39        }
40    }
41}
42
43impl<I> Iterator for DecodeUtf16<I>
44where
45    I: Iterator<Item = u16>,
46{
47    type Item = Result<char, DecodeUtf16Error>;
48
49    fn next(&mut self) -> Option<Self::Item> {
50        // Copied from char::DecodeUtf16
51        let u = match self.forward_buf.take() {
52            Some(buf) => buf,
53            None => self.iter.next().or_else(|| self.back_buf.take())?,
54        };
55
56        if !is_utf16_surrogate(u) {
57            // SAFETY: not a surrogate
58            Some(Ok(unsafe { char::from_u32_unchecked(u as u32) }))
59        } else if is_utf16_low_surrogate(u) {
60            // a trailing surrogate
61            Some(Err(DecodeUtf16Error::new(u)))
62        } else {
63            let u2 = match self.iter.next().or_else(|| self.back_buf.take()) {
64                Some(u2) => u2,
65                // eof
66                None => return Some(Err(DecodeUtf16Error::new(u))),
67            };
68            if !is_utf16_low_surrogate(u2) {
69                // not a trailing surrogate so we're not a valid
70                // surrogate pair, so rewind to redecode u2 next time.
71                self.forward_buf = Some(u2);
72                return Some(Err(DecodeUtf16Error::new(u)));
73            }
74
75            // all ok, so lets decode it.
76            // SAFETY: verified the surrogate pair
77            unsafe { Some(Ok(decode_utf16_surrogate_pair(u, u2))) }
78        }
79    }
80
81    #[inline]
82    fn size_hint(&self) -> (usize, Option<usize>) {
83        let (low, high) = self.iter.size_hint();
84        // we could be entirely valid surrogates (2 elements per
85        // char), or entirely non-surrogates (1 element per char)
86        (low / 2, high)
87    }
88}
89
90impl<I> DoubleEndedIterator for DecodeUtf16<I>
91where
92    I: Iterator<Item = u16> + DoubleEndedIterator,
93{
94    fn next_back(&mut self) -> Option<Self::Item> {
95        let u2 = match self.back_buf.take() {
96            Some(buf) => buf,
97            None => self.iter.next_back().or_else(|| self.forward_buf.take())?,
98        };
99
100        if !is_utf16_surrogate(u2) {
101            // SAFETY: not a surrogate
102            Some(Ok(unsafe { char::from_u32_unchecked(u2 as u32) }))
103        } else if is_utf16_high_surrogate(u2) {
104            // a leading surrogate
105            Some(Err(DecodeUtf16Error::new(u2)))
106        } else {
107            let u = match self.iter.next_back().or_else(|| self.forward_buf.take()) {
108                Some(u) => u,
109                // eof
110                None => return Some(Err(DecodeUtf16Error::new(u2))),
111            };
112            if !is_utf16_high_surrogate(u) {
113                // not a leading surrogate so we're not a valid
114                // surrogate pair, so rewind to redecode u next time.
115                self.back_buf = Some(u);
116                return Some(Err(DecodeUtf16Error::new(u2)));
117            }
118
119            // all ok, so lets decode it.
120            // SAFETY: verified the surrogate pair
121            unsafe { Some(Ok(decode_utf16_surrogate_pair(u, u2))) }
122        }
123    }
124}
125
126impl<I> FusedIterator for DecodeUtf16<I> where I: Iterator<Item = u16> + FusedIterator {}
127
128/// An iterator that lossily decodes possibly ill-formed UTF-16 encoded code points from an iterator
129/// of [`u16`]s.
130///
131/// Any unpaired UTF-16 surrogate values are replaced by
132/// [`U+FFFD REPLACEMENT_CHARACTER`][char::REPLACEMENT_CHARACTER] (�).
133#[derive(Debug, Clone)]
134pub struct DecodeUtf16Lossy<I>
135where
136    I: Iterator<Item = u16>,
137{
138    pub(crate) iter: DecodeUtf16<I>,
139}
140
141impl<I> Iterator for DecodeUtf16Lossy<I>
142where
143    I: Iterator<Item = u16>,
144{
145    type Item = char;
146
147    #[inline]
148    fn next(&mut self) -> Option<Self::Item> {
149        self.iter
150            .next()
151            .map(|res| res.unwrap_or(char::REPLACEMENT_CHARACTER))
152    }
153
154    #[inline]
155    fn size_hint(&self) -> (usize, Option<usize>) {
156        self.iter.size_hint()
157    }
158}
159
160impl<I> DoubleEndedIterator for DecodeUtf16Lossy<I>
161where
162    I: Iterator<Item = u16> + DoubleEndedIterator,
163{
164    #[inline]
165    fn next_back(&mut self) -> Option<Self::Item> {
166        self.iter
167            .next_back()
168            .map(|res| res.unwrap_or(char::REPLACEMENT_CHARACTER))
169    }
170}
171
172impl<I> FusedIterator for DecodeUtf16Lossy<I> where I: Iterator<Item = u16> + FusedIterator {}
173
174/// An iterator that decodes UTF-32 encoded code points from an iterator of `u32`s.
175#[derive(Debug, Clone)]
176pub struct DecodeUtf32<I>
177where
178    I: Iterator<Item = u32>,
179{
180    pub(crate) iter: I,
181}
182
183impl<I> Iterator for DecodeUtf32<I>
184where
185    I: Iterator<Item = u32>,
186{
187    type Item = Result<char, DecodeUtf32Error>;
188
189    #[inline]
190    fn next(&mut self) -> Option<Self::Item> {
191        self.iter
192            .next()
193            .map(|u| char::from_u32(u).ok_or_else(|| DecodeUtf32Error::new(u)))
194    }
195
196    #[inline]
197    fn size_hint(&self) -> (usize, Option<usize>) {
198        self.iter.size_hint()
199    }
200}
201
202impl<I> DoubleEndedIterator for DecodeUtf32<I>
203where
204    I: Iterator<Item = u32> + DoubleEndedIterator,
205{
206    #[inline]
207    fn next_back(&mut self) -> Option<Self::Item> {
208        self.iter
209            .next_back()
210            .map(|u| char::from_u32(u).ok_or_else(|| DecodeUtf32Error::new(u)))
211    }
212}
213
214impl<I> FusedIterator for DecodeUtf32<I> where I: Iterator<Item = u32> + FusedIterator {}
215
216impl<I> ExactSizeIterator for DecodeUtf32<I>
217where
218    I: Iterator<Item = u32> + ExactSizeIterator,
219{
220    #[inline]
221    fn len(&self) -> usize {
222        self.iter.len()
223    }
224}
225
226/// An iterator that lossily decodes possibly ill-formed UTF-32 encoded code points from an iterator
227/// of `u32`s.
228///
229/// Any invalid UTF-32 values are replaced by
230/// [`U+FFFD REPLACEMENT_CHARACTER`][core::char::REPLACEMENT_CHARACTER] (�).
231#[derive(Debug, Clone)]
232pub struct DecodeUtf32Lossy<I>
233where
234    I: Iterator<Item = u32>,
235{
236    pub(crate) iter: DecodeUtf32<I>,
237}
238
239impl<I> Iterator for DecodeUtf32Lossy<I>
240where
241    I: Iterator<Item = u32>,
242{
243    type Item = char;
244
245    #[inline]
246    fn next(&mut self) -> Option<Self::Item> {
247        self.iter
248            .next()
249            .map(|res| res.unwrap_or(core::char::REPLACEMENT_CHARACTER))
250    }
251
252    #[inline]
253    fn size_hint(&self) -> (usize, Option<usize>) {
254        self.iter.size_hint()
255    }
256}
257
258impl<I> DoubleEndedIterator for DecodeUtf32Lossy<I>
259where
260    I: Iterator<Item = u32> + DoubleEndedIterator,
261{
262    #[inline]
263    fn next_back(&mut self) -> Option<Self::Item> {
264        self.iter
265            .next_back()
266            .map(|res| res.unwrap_or(core::char::REPLACEMENT_CHARACTER))
267    }
268}
269
270impl<I> FusedIterator for DecodeUtf32Lossy<I> where I: Iterator<Item = u32> + FusedIterator {}
271
272impl<I> ExactSizeIterator for DecodeUtf32Lossy<I>
273where
274    I: Iterator<Item = u32> + ExactSizeIterator,
275{
276    #[inline]
277    fn len(&self) -> usize {
278        self.iter.len()
279    }
280}
281
/// An iterator that encodes an iterator of [`char`][prim@char]s into UTF-8 bytes.
///
/// This struct is created by [`encode_utf8`][crate::encode_utf8]. See its documentation for more.
#[derive(Debug, Clone)]
pub struct EncodeUtf8<I>
where
    I: Iterator<Item = char>,
{
    /// Source of characters to encode.
    iter: I,
    /// UTF-8 encoding of the character currently being emitted.
    buf: [u8; 4],
    /// Index in `buf` of the next byte to emit.
    idx: u8,
    /// Number of valid bytes in `buf`; `idx >= len` means the buffer is drained.
    len: u8,
}

impl<I> EncodeUtf8<I>
where
    I: Iterator<Item = char>,
{
    /// Wraps `iter` in an encoder with an empty byte buffer.
    pub(crate) fn new(iter: I) -> Self {
        Self {
            iter,
            buf: [0; 4],
            idx: 0,
            len: 0,
        }
    }
}

impl<I> Iterator for EncodeUtf8<I>
where
    I: Iterator<Item = char>,
{
    type Item = u8;

    /// Emits the next UTF-8 byte, pulling and encoding a new character once the bytes of the
    /// previous one have all been emitted. Returns `None` when the source has no more characters.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        if self.idx >= self.len {
            let c = self.iter.next()?;
            // A `char` encodes to at most 4 bytes, so the length always fits in a `u8`.
            self.len = c.encode_utf8(&mut self.buf).len() as u8;
            self.idx = 0;
        }
        let byte = self.buf[usize::from(self.idx)];
        self.idx += 1;
        Some(byte)
    }

    /// Bounds on the number of remaining bytes, including bytes of the current character that
    /// are still waiting in the buffer.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let pending = usize::from(self.len.saturating_sub(self.idx));
        let (lower, upper) = self.iter.size_hint();
        // Each remaining char produces between 1 and 4 bytes.
        let lower = lower.saturating_add(pending);
        let upper = upper
            .and_then(|chars| chars.checked_mul(4))
            .and_then(|bytes| bytes.checked_add(pending));
        (lower, upper)
    }
}

impl<I> FusedIterator for EncodeUtf8<I> where I: Iterator<Item = char> + FusedIterator {}
336
/// An iterator that encodes an iterator of [`char`][prim@char]s into UTF-16 [`u16`] code units.
///
/// This struct is created by [`encode_utf16`][crate::encode_utf16]. See its documentation for more.
#[derive(Debug, Clone)]
pub struct EncodeUtf16<I>
where
    I: Iterator<Item = char>,
{
    /// Source of characters to encode.
    iter: I,
    /// Low surrogate of the most recent character, held until the next call emits it.
    buf: Option<u16>,
}

impl<I> EncodeUtf16<I>
where
    I: Iterator<Item = char>,
{
    /// Wraps `iter` in an encoder with no buffered code unit.
    pub(crate) fn new(iter: I) -> Self {
        Self { iter, buf: None }
    }
}

impl<I> Iterator for EncodeUtf16<I>
where
    I: Iterator<Item = char>,
{
    type Item = u16;

    /// Emits the next UTF-16 code unit: the buffered low surrogate if one is waiting, otherwise
    /// the first unit of the next character (buffering its second unit, if any).
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        if let Some(low) = self.buf.take() {
            return Some(low);
        }
        let c = self.iter.next()?;
        let mut units = [0u16; 2];
        let encoded = c.encode_utf16(&mut units);
        // `encoded` has one unit for BMP characters and two for supplementary ones.
        self.buf = encoded.get(1).copied();
        Some(encoded[0])
    }

    /// Bounds on the number of remaining code units, including a buffered low surrogate.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let pending = usize::from(self.buf.is_some());
        let (lower, upper) = self.iter.size_hint();
        // Each remaining char produces 1 or 2 code units.
        let lower = lower.saturating_add(pending);
        let upper = upper
            .and_then(|chars| chars.checked_mul(2))
            .and_then(|units| units.checked_add(pending));
        (lower, upper)
    }
}

impl<I> FusedIterator for EncodeUtf16<I> where I: Iterator<Item = char> + FusedIterator {}
385
/// An iterator that encodes an iterator of [`char`][prim@char]s into UTF-32 [`u32`] values.
///
/// This struct is created by [`encode_utf32`][crate::encode_utf32]. See its documentation for more.
#[derive(Debug, Clone)]
pub struct EncodeUtf32<I>
where
    I: Iterator<Item = char>,
{
    /// Source of characters to encode.
    iter: I,
}

impl<I> EncodeUtf32<I>
where
    I: Iterator<Item = char>,
{
    /// Wraps `iter` in a UTF-32 encoder.
    pub(crate) fn new(iter: I) -> Self {
        Self { iter }
    }
}

impl<I> Iterator for EncodeUtf32<I>
where
    I: Iterator<Item = char>,
{
    type Item = u32;

    /// Emits the scalar value of the next character from the front.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        self.iter.next().map(u32::from)
    }

    /// One value per character, so the source's bounds apply unchanged.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.iter.size_hint()
    }
}

impl<I> FusedIterator for EncodeUtf32<I> where I: Iterator<Item = char> + FusedIterator {}

impl<I> ExactSizeIterator for EncodeUtf32<I>
where
    I: Iterator<Item = char> + ExactSizeIterator,
{
    /// Number of remaining values, equal to the number of remaining characters.
    #[inline]
    fn len(&self) -> usize {
        self.iter.len()
    }
}

impl<I> DoubleEndedIterator for EncodeUtf32<I>
where
    I: Iterator<Item = char> + DoubleEndedIterator,
{
    /// Emits the scalar value of the next character from the back.
    #[inline]
    fn next_back(&mut self) -> Option<Self::Item> {
        self.iter.next_back().map(u32::from)
    }
}