encoding_rs/
utf_8.rs

Help
1// Copyright Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use super::*;
11use crate::ascii::ascii_to_basic_latin;
12use crate::ascii::basic_latin_to_ascii;
13use crate::ascii::validate_ascii;
14use crate::handles::*;
15use crate::mem::convert_utf16_to_utf8_partial;
16use crate::variant::*;
17
18cfg_if! {
19    if #[cfg(feature = "simd-accel")] {
20        use ::core::intrinsics::unlikely;
21        use ::core::intrinsics::likely;
22    } else {
23        #[inline(always)]
24        fn unlikely(b: bool) -> bool {
25            b
26        }
27        #[inline(always)]
28        fn likely(b: bool) -> bool {
29            b
30        }
31    }
32}
33
/// Lookup table used to check the second byte of a three- or four-byte
/// UTF-8 sequence against its lead byte.
///
/// The validation code in this file reads `table` in two ways:
///
/// * `table[second]` (indices `0..256`) is looked up by the second byte of
///   the sequence;
/// * `table[lead + 0x80]` (indices `256..384` for lead bytes `0x80..=0xFF`)
///   is looked up by the lead byte.
///
/// The two entries are combined with bitwise AND; the callers accept the
/// second byte only when that AND contributes no bits to the value they
/// compare against the expected continuation-byte pattern.
#[repr(align(64))] // Align to cache lines
pub struct Utf8Data {
    /// Second-byte entries followed by lead-byte entries (see the type docs).
    pub table: [u8; 384],
}

// BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py

pub static UTF8_DATA: Utf8Data = Utf8Data {
    table: [
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 148, 148, 148,
        148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 164, 164, 164, 164, 164,
        164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164,
        164, 164, 164, 164, 164, 164, 164, 164, 164, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
        8, 8, 8, 8, 8, 8, 8, 16, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 32, 8, 8, 64, 8, 8, 8, 128, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    ],
};
65
66// END GENERATED CODE
67
/// Returns the number of leading bytes of `src` that this function accepts
/// as well-formed UTF-8.
///
/// The result is `src.len()` when the whole slice validates. Otherwise it is
/// the index of the lead byte of the first sequence that is malformed or cut
/// off by the end of the slice: `read` is only advanced past a sequence once
/// all of its bytes have been checked.
pub fn utf8_valid_up_to(src: &[u8]) -> usize {
    let mut read = 0;
    'outer: loop {
        // Skip a run of ASCII. `validate_ascii` is defined elsewhere; judging
        // by its use here, `None` means everything that remained was ASCII and
        // `Some((non_ascii, consumed))` hands back the first non-ASCII byte
        // plus the count of ASCII bytes before it -- confirm against
        // `crate::ascii`.
        let mut byte = {
            let src_remaining = &src[read..];
            match validate_ascii(src_remaining) {
                None => {
                    return src.len();
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    non_ascii
                }
            }
        };
        // Check for the longest sequence to avoid checking twice for the
        // multi-byte sequences. This can't overflow with 64-bit address space,
        // because full 64 bits aren't in use. In the 32-bit PAE case, for this
        // to overflow would mean that the source slice would be so large that
        // the address space of the process would not have space for any code.
        // Therefore, the slice cannot be so long that this would overflow.
        if likely(read + 4 <= src.len()) {
            // Invariant for every iteration of `'inner` (and `'three`):
            // `read + 4 <= src.len()`, so `read + 1 ..= read + 3` are in
            // bounds for the unchecked reads below. Each "Next lead" step
            // re-establishes the invariant before looping.
            'inner: loop {
                // At this point, `byte` is not included in `read`, because we
                // don't yet know that the UTF-8 sequence is valid.
                // Inspecting the lead byte directly is faster than what the
                // std lib does!
                if likely(in_inclusive_range8(byte, 0xC2, 0xDF)) {
                    // Two-byte
                    // SAFETY: `read + 1 < src.len()` by the loop invariant.
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        break 'outer;
                    }
                    read += 2;

                    // Next lead (manually inlined)
                    if likely(read + 4 <= src.len()) {
                        // SAFETY: `read < src.len()` follows from the check above.
                        byte = unsafe { *(src.get_unchecked(read)) };
                        if byte < 0x80 {
                            // Consume the ASCII byte and return to the ASCII
                            // fast path at the top of `'outer`.
                            read += 1;
                            continue 'outer;
                        }
                        continue 'inner;
                    }
                    // Fewer than four bytes remain: finish in `'tail`.
                    break 'inner;
                }
                // Everything below 0xF0 that was not a two-byte lead goes
                // through the table check. That includes bytes that are not
                // valid leads at all (0x80..=0xC1): their lead entry shares a
                // bit with every second-byte entry, so the check fails.
                if likely(byte < 0xF0) {
                    'three: loop {
                        // Three-byte
                        // SAFETY: in bounds by the loop invariant.
                        let second = unsafe { *(src.get_unchecked(read + 1)) };
                        let third = unsafe { *(src.get_unchecked(read + 2)) };
                        // The result is 2 only when the second byte is allowed
                        // for this lead (the AND is zero) and the third byte
                        // has the 0b10 continuation prefix (`third >> 6 == 2`).
                        // SAFETY (table): `byte as usize + 0x80 <= 0x17F`, which
                        // is below the table length of 384 for any `u8`.
                        if ((UTF8_DATA.table[usize::from(second)]
                            & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                            | (third >> 6))
                            != 2
                        {
                            break 'outer;
                        }
                        read += 3;

                        // Next lead (manually inlined)
                        if likely(read + 4 <= src.len()) {
                            // SAFETY: `read < src.len()` follows from the check above.
                            byte = unsafe { *(src.get_unchecked(read)) };
                            if in_inclusive_range8(byte, 0xE0, 0xEF) {
                                // Another three-byte lead: stay in the tight loop.
                                continue 'three;
                            }
                            if likely(byte < 0x80) {
                                read += 1;
                                continue 'outer;
                            }
                            continue 'inner;
                        }
                        break 'inner;
                    }
                }
                // Four-byte
                // SAFETY: in bounds by the loop invariant.
                let second = unsafe { *(src.get_unchecked(read + 1)) };
                let third = unsafe { *(src.get_unchecked(read + 2)) };
                let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                // 0x202 is reached only when the lead/second AND is zero, the
                // third byte contributes 0x002 (`third >> 6 == 2`) and the
                // fourth byte contributes 0x200 (`fourth & 0xC0 == 0x80`).
                // Lead bytes 0xF5..=0xFF fail here through their table entry.
                if (u16::from(
                    UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) },
                ) | u16::from(third >> 6)
                    | (u16::from(fourth & 0xC0) << 2))
                    != 0x202
                {
                    break 'outer;
                }
                read += 4;

                // Next lead
                if likely(read + 4 <= src.len()) {
                    // SAFETY: `read < src.len()` follows from the check above.
                    byte = unsafe { *(src.get_unchecked(read)) };
                    if byte < 0x80 {
                        read += 1;
                        continue 'outer;
                    }
                    continue 'inner;
                }
                break 'inner;
            }
        }
        // We can't have a complete 4-byte sequence, but we could still have
        // one to three shorter sequences.
        'tail: loop {
            // >= is better for bound check elision than ==
            if read >= src.len() {
                break 'outer;
            }
            // Re-read the lead with a checked index; from here on all slice
            // accesses are bounds-checked.
            byte = src[read];
            // At this point, `byte` is not included in `read`, because we
            // don't yet know that the UTF-8 sequence is valid.
            // Inspecting the lead byte directly is faster than what the
            // std lib does!
            if byte < 0x80 {
                read += 1;
                continue 'tail;
            }
            if in_inclusive_range8(byte, 0xC2, 0xDF) {
                // Two-byte
                let new_read = read + 2;
                if new_read > src.len() {
                    // Truncated sequence: report it as not valid.
                    break 'outer;
                }
                let second = src[read + 1];
                if !in_inclusive_range8(second, 0x80, 0xBF) {
                    break 'outer;
                }
                read += 2;
                continue 'tail;
            }
            // Four-byte leads have to be excluded before the table check:
            // the lead entries for 0xF0..=0xF4 would otherwise let such a lead
            // pass as if it started a three-byte sequence. A four-byte
            // sequence cannot be complete here because fewer than four bytes
            // remain, so those leads fall through to the final `break`.
            if byte < 0xF0 {
                // Three-byte
                let new_read = read + 3;
                if new_read > src.len() {
                    break 'outer;
                }
                let second = src[read + 1];
                let third = src[read + 2];
                // Same check as in the fast path above.
                // SAFETY (table): `byte as usize + 0x80 <= 0x17F < 384`.
                if ((UTF8_DATA.table[usize::from(second)]
                    & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                    | (third >> 6))
                    != 2
                {
                    break 'outer;
                }
                read += 3;
                // `'tail` handles sequences shorter than 4, so
                // there can't be another sequence after this one.
                break 'outer;
            }
            break 'outer;
        }
    }
    read
}
228
/// Converts UTF-8 from `src` into UTF-16 code units in `dst`, stopping at the
/// first malformed or truncated sequence, at the end of `src`, or when `dst`
/// has no room for the next character.
///
/// Returns `(read, written)`: the number of bytes consumed from `src` and the
/// number of `u16` units stored in `dst`. A sequence is only counted in `read`
/// once it has been validated and its output has been written.
#[cfg_attr(feature = "cargo-clippy", allow(never_loop, cyclomatic_complexity))]
pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usize, usize) {
    let mut read = 0;
    let mut written = 0;
    'outer: loop {
        // ASCII fast path over as many units as both buffers can hold.
        // `ascii_to_basic_latin` is defined elsewhere; judging by its use
        // here, `None` means all `length` units were converted and
        // `Some((non_ascii, consumed))` hands back the first non-ASCII byte
        // plus the count of units converted before it -- confirm against
        // `crate::ascii`.
        let mut byte = {
            let src_remaining = &src[read..];
            let dst_remaining = &mut dst[written..];
            let length = ::core::cmp::min(src_remaining.len(), dst_remaining.len());
            match unsafe {
                ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    // Either `src` is exhausted or `dst` is full.
                    read += length;
                    written += length;
                    break 'outer;
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    written += consumed;
                    non_ascii
                }
            }
        };
        // Check for the longest sequence to avoid checking twice for the
        // multi-byte sequences. This can't overflow with 64-bit address space,
        // because full 64 bits aren't in use. In the 32-bit PAE case, for this
        // to overflow would mean that the source slice would be so large that
        // the address space of the process would not have space for any code.
        // Therefore, the slice cannot be so long that this would overflow.
        if likely(read + 4 <= src.len()) {
            // Invariants for every iteration of `'inner` (and `'three`):
            // `read + 4 <= src.len()` and `written < dst.len()`. They make the
            // unchecked reads of `src[read + 1 ..= read + 3]` and the
            // unchecked write of one unit at `dst[written]` in-bounds. Each
            // "Next lead" step re-checks both before looping.
            'inner: loop {
                // At this point, `byte` is not included in `read`, because we
                // don't yet know that a) the UTF-8 sequence is valid and b) that there
                // is output space if it is an astral sequence.
                // We know, thanks to `ascii_to_basic_latin` that there is output
                // space for at least one UTF-16 code unit, so no need to check
                // for output space in the BMP cases.
                // Inspecting the lead byte directly is faster than what the
                // std lib does!
                if likely(in_inclusive_range8(byte, 0xC2, 0xDF)) {
                    // Two-byte
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        break 'outer;
                    }
                    // 5 payload bits from the lead, 6 from the continuation.
                    unsafe {
                        *(dst.get_unchecked_mut(written)) =
                            ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F)
                    };
                    read += 2;
                    written += 1;

                    // Next lead (manually inlined)
                    if written == dst.len() {
                        break 'outer;
                    }
                    if likely(read + 4 <= src.len()) {
                        byte = unsafe { *(src.get_unchecked(read)) };
                        if byte < 0x80 {
                            // Emit the ASCII unit and return to the ASCII
                            // fast path at the top of `'outer`.
                            unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                            read += 1;
                            written += 1;
                            continue 'outer;
                        }
                        continue 'inner;
                    }
                    // Fewer than four bytes remain: finish in `'tail`.
                    break 'inner;
                }
                // Everything below 0xF0 that was not a two-byte lead goes
                // through the table check. That includes bytes that are not
                // valid leads at all (0x80..=0xC1): their lead entry shares a
                // bit with every second-byte entry, so the check fails.
                if likely(byte < 0xF0) {
                    'three: loop {
                        // Three-byte
                        let second = unsafe { *(src.get_unchecked(read + 1)) };
                        let third = unsafe { *(src.get_unchecked(read + 2)) };
                        // The result is 2 only when the second byte is allowed
                        // for this lead (the AND is zero) and the third byte
                        // has the 0b10 continuation prefix.
                        // SAFETY (table): `byte as usize + 0x80 <= 0x17F < 384`.
                        if ((UTF8_DATA.table[usize::from(second)]
                            & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                            | (third >> 6))
                            != 2
                        {
                            break 'outer;
                        }
                        // 4 + 6 + 6 payload bits.
                        let point = ((u16::from(byte) & 0xF) << 12)
                            | ((u16::from(second) & 0x3F) << 6)
                            | (u16::from(third) & 0x3F);
                        unsafe { *(dst.get_unchecked_mut(written)) = point };
                        read += 3;
                        written += 1;

                        // Next lead (manually inlined)
                        if written == dst.len() {
                            break 'outer;
                        }
                        if likely(read + 4 <= src.len()) {
                            byte = unsafe { *(src.get_unchecked(read)) };
                            if in_inclusive_range8(byte, 0xE0, 0xEF) {
                                // Another three-byte lead: stay in the tight loop.
                                continue 'three;
                            }
                            if likely(byte < 0x80) {
                                unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                                read += 1;
                                written += 1;
                                continue 'outer;
                            }
                            continue 'inner;
                        }
                        break 'inner;
                    }
                }
                // Four-byte
                // An astral character needs a surrogate pair, i.e. two output
                // units. `written < dst.len()` already holds, so exactly one
                // free unit is the only way to be short of space.
                if written + 1 == dst.len() {
                    break 'outer;
                }
                let second = unsafe { *(src.get_unchecked(read + 1)) };
                let third = unsafe { *(src.get_unchecked(read + 2)) };
                let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                // 0x202 is reached only when the lead/second AND is zero, the
                // third byte contributes 0x002 and the fourth byte contributes
                // 0x200, i.e. both are continuation bytes.
                if (u16::from(
                    UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) },
                ) | u16::from(third >> 6)
                    | (u16::from(fourth & 0xC0) << 2))
                    != 0x202
                {
                    break 'outer;
                }
                let point = ((u32::from(byte) & 0x7) << 18)
                    | ((u32::from(second) & 0x3F) << 12)
                    | ((u32::from(third) & 0x3F) << 6)
                    | (u32::from(fourth) & 0x3F);
                // 0xD7C0 is 0xD800 - (0x10000 >> 10): adding `point >> 10`
                // yields the high surrogate without subtracting 0x10000 first.
                unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
                // Low surrogate: 0xDC00 plus the low ten bits.
                unsafe {
                    *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
                };
                read += 4;
                written += 2;

                // Next lead
                if written == dst.len() {
                    break 'outer;
                }
                if likely(read + 4 <= src.len()) {
                    byte = unsafe { *(src.get_unchecked(read)) };
                    if byte < 0x80 {
                        unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                        read += 1;
                        written += 1;
                        continue 'outer;
                    }
                    continue 'inner;
                }
                break 'inner;
            }
        }
        // We can't have a complete 4-byte sequence, but we could still have
        // one to three shorter sequences.
        'tail: loop {
            // >= is better for bound check elision than ==
            if read >= src.len() || written >= dst.len() {
                break 'outer;
            }
            // From here on all slice accesses are bounds-checked.
            byte = src[read];
            // At this point, `byte` is not included in `read`, because we
            // don't yet know that the UTF-8 sequence is valid.
            // Inspecting the lead byte directly is faster than what the
            // std lib does!
            if byte < 0x80 {
                dst[written] = u16::from(byte);
                read += 1;
                written += 1;
                continue 'tail;
            }
            if in_inclusive_range8(byte, 0xC2, 0xDF) {
                // Two-byte
                let new_read = read + 2;
                if new_read > src.len() {
                    // Truncated sequence: leave it unconsumed.
                    break 'outer;
                }
                let second = src[read + 1];
                if !in_inclusive_range8(second, 0x80, 0xBF) {
                    break 'outer;
                }
                dst[written] = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
                read += 2;
                written += 1;
                continue 'tail;
            }
            // Four-byte leads have to be excluded before the table check:
            // the lead entries for 0xF0..=0xF4 would otherwise let such a lead
            // pass as if it started a three-byte sequence. A four-byte
            // sequence cannot be complete here because fewer than four bytes
            // remain, so those leads fall through to the final `break`.
            if byte < 0xF0 {
                // Three-byte
                let new_read = read + 3;
                if new_read > src.len() {
                    break 'outer;
                }
                let second = src[read + 1];
                let third = src[read + 2];
                // Same check as in the fast path above.
                // SAFETY (table): `byte as usize + 0x80 <= 0x17F < 384`.
                if ((UTF8_DATA.table[usize::from(second)]
                    & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                    | (third >> 6))
                    != 2
                {
                    break 'outer;
                }
                let point = ((u16::from(byte) & 0xF) << 12)
                    | ((u16::from(second) & 0x3F) << 6)
                    | (u16::from(third) & 0x3F);
                dst[written] = point;
                read += 3;
                written += 1;
                // `'tail` handles sequences shorter than 4, so
                // there can't be another sequence after this one.
                break 'outer;
            }
            break 'outer;
        }
    }
    (read, written)
}
447
/// State of the byte-at-a-time UTF-8 decoder.
///
/// The fields describe a multi-byte sequence that is in progress, so that it
/// can be continued on a later call. When `bytes_needed` is 0 no sequence is
/// pending.
pub struct Utf8Decoder {
    /// Payload bits accumulated so far for the pending sequence.
    code_point: u32,
    /// Continuation bytes accepted so far for the pending sequence. Counts
    /// continuations only (not the lead byte) and is reset to 0 as soon as
    /// the sequence completes or fails.
    bytes_seen: usize,
    /// Continuation bytes required by the pending lead byte: 1, 2 or 3, or 0
    /// when no sequence is pending. Counts continuations only.
    bytes_needed: usize,
    /// Lowest value accepted for the next continuation byte. Normally 0x80;
    /// raised for the byte right after the lead bytes 0xE0 and 0xF0.
    lower_boundary: u8,
    /// Highest value accepted for the next continuation byte. Normally 0xBF;
    /// lowered for the byte right after the lead bytes 0xED and 0xF4.
    upper_boundary: u8,
}
455
456impl Utf8Decoder {
457    pub fn new_inner() -> Utf8Decoder {
458        Utf8Decoder {
459            code_point: 0,
460            bytes_seen: 0,
461            bytes_needed: 0,
462            lower_boundary: 0x80u8,
463            upper_boundary: 0xBFu8,
464        }
465    }
466
467    pub fn new() -> VariantDecoder {
468        VariantDecoder::Utf8(Utf8Decoder::new_inner())
469    }
470
471    pub fn in_neutral_state(&self) -> bool {
472        self.bytes_needed == 0
473    }
474
475    fn extra_from_state(&self) -> usize {
476        if self.bytes_needed == 0 {
477            0
478        } else {
479            self.bytes_seen + 1
480        }
481    }
482
483    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
484        byte_length.checked_add(1 + self.extra_from_state())
485    }
486
487    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
488        byte_length.checked_add(3 + self.extra_from_state())
489    }
490
491    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
492        checked_add(
493            3,
494            checked_mul(3, byte_length.checked_add(self.extra_from_state())),
495        )
496    }
497
498    decoder_functions!(
499        {},
500        {
501            // This is the fast path. The rest runs only at the
502            // start and end for partial sequences.
503            if self.bytes_needed == 0 {
504                dest.copy_utf8_up_to_invalid_from(&mut source);
505            }
506        },
507        {
508            if self.bytes_needed != 0 {
509                let bad_bytes = (self.bytes_seen + 1) as u8;
510                self.code_point = 0;
511                self.bytes_needed = 0;
512                self.bytes_seen = 0;
513                return (
514                    DecoderResult::Malformed(bad_bytes, 0),
515                    src_consumed,
516                    dest.written(),
517                );
518            }
519        },
520        {
521            if self.bytes_needed == 0 {
522                if b < 0x80u8 {
523                    destination_handle.write_ascii(b);
524                    continue;
525                }
526                if b < 0xC2u8 {
527                    return (
528                        DecoderResult::Malformed(1, 0),
529                        unread_handle.consumed(),
530                        destination_handle.written(),
531                    );
532                }
533                if b < 0xE0u8 {
534                    self.bytes_needed = 1;
535                    self.code_point = u32::from(b) & 0x1F;
536                    continue;
537                }
538                if b < 0xF0u8 {
539                    if b == 0xE0u8 {
540                        self.lower_boundary = 0xA0u8;
541                    } else if b == 0xEDu8 {
542                        self.upper_boundary = 0x9Fu8;
543                    }
544                    self.bytes_needed = 2;
545                    self.code_point = u32::from(b) & 0xF;
546                    continue;
547                }
548                if b < 0xF5u8 {
549                    if b == 0xF0u8 {
550                        self.lower_boundary = 0x90u8;
551                    } else if b == 0xF4u8 {
552                        self.upper_boundary = 0x8Fu8;
553                    }
554                    self.bytes_needed = 3;
555                    self.code_point = u32::from(b) & 0x7;
556                    continue;
557                }
558                return (
559                    DecoderResult::Malformed(1, 0),
560                    unread_handle.consumed(),
561                    destination_handle.written(),
562                );
563            }
564            // self.bytes_needed != 0
565            if !(b >= self.lower_boundary && b <= self.upper_boundary) {
566                let bad_bytes = (self.bytes_seen + 1) as u8;
567                self.code_point = 0;
568                self.bytes_needed = 0;
569                self.bytes_seen = 0;
570                self.lower_boundary = 0x80u8;
571                self.upper_boundary = 0xBFu8;
572                return (
573                    DecoderResult::Malformed(bad_bytes, 0),
574                    unread_handle.unread(),
575                    destination_handle.written(),
576                );
577            }
578            self.lower_boundary = 0x80u8;
579            self.upper_boundary = 0xBFu8;
580            self.code_point = (self.code_point << 6) | (u32::from(b) & 0x3F);
581            self.bytes_seen += 1;
582            if self.bytes_seen != self.bytes_needed {
583                continue;
584            }
585            if self.bytes_needed == 3 {
586                destination_handle.write_astral(self.code_point);
587            } else {
588                destination_handle.write_bmp_excl_ascii(self.code_point as u16);
589            }
590            self.code_point = 0;
591            self.bytes_needed = 0;
592            self.bytes_seen = 0;
593            continue;
594        },
595        self,
596        src_consumed,
597        dest,
598        source,
599        b,
600        destination_handle,
601        unread_handle,
602        check_space_astral
603    );
604}
605
606#[cfg_attr(feature = "cargo-clippy", allow(never_loop))]
607#[inline(never)]
608pub fn convert_utf16_to_utf8_partial_inner(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
609    let mut read = 0;
610    let mut written = 0;
611    'outer: loop {
612        let mut unit = {
613            let src_remaining = &src[read..];
614            let dst_remaining = &mut dst[written..];
615            let length = if dst_remaining.len() < src_remaining.len() {
616                dst_remaining.len()
617            } else {
618                src_remaining.len()
619            };
620            match unsafe {
621                basic_latin_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
622            } {
623                None => {
624                    read += length;
625                    written += length;
626                    return (read, written);
627                }
628                Some((non_ascii, consumed)) => {
629                    read += consumed;
630                    written += consumed;
631                    non_ascii
632                }
633            }
634        };
635        'inner: loop {
636            // The following loop is only broken out of as a goto forward.
637            loop {
638                // Unfortunately, this check isn't enough for the compiler to elide
639                // the bound checks on writes to dst, which is why they are manually
640                // elided, which makes a measurable difference.
641                if written.checked_add(4).unwrap() > dst.len() {
642                    return (read, written);
643                }
644                read += 1;
645                if unit < 0x800 {
646                    unsafe {
647                        *(dst.get_unchecked_mut(written)) = (unit >> 6) as u8 | 0xC0u8;
648                        written += 1;
649                        *(dst.get_unchecked_mut(written)) = (unit & 0x3F) as u8 | 0x80u8;
650                        written += 1;
651                    }
652                    break;
653                }
654                let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
655                if likely(unit_minus_surrogate_start > (0xDFFF - 0xD800)) {
656                    unsafe {
657                        *(dst.get_unchecked_mut(written)) = (unit >> 12) as u8 | 0xE0u8;
658                        written += 1;
659                        *(dst.get_unchecked_mut(written)) = ((unit & 0xFC0) >> 6) as u8 | 0x80u8;
660                        written += 1;
661                        *(dst.get_unchecked_mut(written)) = (unit & 0x3F) as u8 | 0x80u8;
662                        written += 1;
663                    }
664                    break;
665                }
666                if likely(unit_minus_surrogate_start <= (0xDBFF - 0xD800)) {
667                    // high surrogate
668                    // read > src.len() is impossible, but using
669                    // >= instead of == allows the compiler to elide a bound check.
670                    if read >= src.len() {
671                        debug_assert_eq!(read, src.len());
672                        // Unpaired surrogate at the end of the buffer.
673                        unsafe {
674                            *(dst.get_unchecked_mut(written)) = 0xEFu8;
675                            written += 1;
676                            *(dst.get_unchecked_mut(written)) = 0xBFu8;
677                            written += 1;
678                            *(dst.get_unchecked_mut(written)) = 0xBDu8;
679                            written += 1;
680                        }
681                        return (read, written);
682                    }
683                    let second = src[read];
684                    let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
685                    if likely(second_minus_low_surrogate_start <= (0xDFFF - 0xDC00)) {
686                        // The next code unit is a low surrogate. Advance position.
687                        read += 1;
688                        let astral = (u32::from(unit) << 10) + u32::from(second)
689                            - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
690                        unsafe {
691                            *(dst.get_unchecked_mut(written)) = (astral >> 18) as u8 | 0xF0u8;
692                            written += 1;
693                            *(dst.get_unchecked_mut(written)) =
694                                ((astral & 0x3F000u32) >> 12) as u8 | 0x80u8;
695                            written += 1;
696                            *(dst.get_unchecked_mut(written)) =
697                                ((astral & 0xFC0u32) >> 6) as u8 | 0x80u8;
698                            written += 1;
699                            *(dst.get_unchecked_mut(written)) = (astral & 0x3F) as u8 | 0x80u8;
700                            written += 1;
701                        }
702                        break;
703                    }
704                    // The next code unit is not a low surrogate. Don't advance
705                    // position and treat the high surrogate as unpaired.
706                    // Fall through
707                }
708                // Unpaired low surrogate
709                unsafe {
710                    *(dst.get_unchecked_mut(written)) = 0xEFu8;
711                    written += 1;
712                    *(dst.get_unchecked_mut(written)) = 0xBFu8;
713                    written += 1;
714                    *(dst.get_unchecked_mut(written)) = 0xBDu8;
715                    written += 1;
716                }
717                break;
718            }
719            // Now see if the next unit is Basic Latin
720            // read > src.len() is impossible, but using
721            // >= instead of == allows the compiler to elide a bound check.
722            if read >= src.len() {
723                debug_assert_eq!(read, src.len());
724                return (read, written);
725            }
726            unit = src[read];
727            if unlikely(unit < 0x80) {
728                // written > dst.len() is impossible, but using
729                // >= instead of == allows the compiler to elide a bound check.
730                if written >= dst.len() {
731                    debug_assert_eq!(written, dst.len());
732                    return (read, written);
733                }
734                dst[written] = unit as u8;
735                read += 1;
736                written += 1;
737                // Mysteriously, adding a punctuation check here makes
                // the expected beneficiary cases *slower*!
739                continue 'outer;
740            }
741            continue 'inner;
742        }
743    }
744}
745
746#[inline(never)]
747pub fn convert_utf16_to_utf8_partial_tail(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
748    // Everything below is cold code!
749    let mut read = 0;
750    let mut written = 0;
751    let mut unit = src[read];
752    // We now have up to 3 output slots, so an astral character
753    // will not fit.
754    if unit < 0x800 {
755        loop {
756            if unit < 0x80 {
757                if written >= dst.len() {
758                    return (read, written);
759                }
760                read += 1;
761                dst[written] = unit as u8;
762                written += 1;
763            } else if unit < 0x800 {
764                if written + 2 > dst.len() {
765                    return (read, written);
766                }
767                read += 1;
768                dst[written] = (unit >> 6) as u8 | 0xC0u8;
769                written += 1;
770                dst[written] = (unit & 0x3F) as u8 | 0x80u8;
771                written += 1;
772            } else {
773                return (read, written);
774            }
775            // read > src.len() is impossible, but using
776            // >= instead of == allows the compiler to elide a bound check.
777            if read >= src.len() {
778                debug_assert_eq!(read, src.len());
779                return (read, written);
780            }
781            unit = src[read];
782        }
783    }
784    // Could be an unpaired surrogate, but we'll need 3 output
785    // slots in any case.
786    if written + 3 > dst.len() {
787        return (read, written);
788    }
789    read += 1;
790    let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
791    if unit_minus_surrogate_start <= (0xDFFF - 0xD800) {
792        // Got surrogate
793        if unit_minus_surrogate_start <= (0xDBFF - 0xD800) {
794            // Got high surrogate
795            if read >= src.len() {
796                // Unpaired high surrogate
797                unit = 0xFFFD;
798            } else {
799                let second = src[read];
800                if in_inclusive_range16(second, 0xDC00, 0xDFFF) {
801                    // Valid surrogate pair, but we know it won't fit.
802                    read -= 1;
803                    return (read, written);
804                }
805                // Unpaired high
806                unit = 0xFFFD;
807            }
808        } else {
809            // Unpaired low
810            unit = 0xFFFD;
811        }
812    }
813    dst[written] = (unit >> 12) as u8 | 0xE0u8;
814    written += 1;
815    dst[written] = ((unit & 0xFC0) >> 6) as u8 | 0x80u8;
816    written += 1;
817    dst[written] = (unit & 0x3F) as u8 | 0x80u8;
818    written += 1;
819    debug_assert_eq!(written, dst.len());
820    (read, written)
821}
822
823pub struct Utf8Encoder;
824
825impl Utf8Encoder {
826    pub fn new(encoding: &'static Encoding) -> Encoder {
827        Encoder::new(encoding, VariantEncoder::Utf8(Utf8Encoder))
828    }
829
830    pub fn max_buffer_length_from_utf16_without_replacement(
831        &self,
832        u16_length: usize,
833    ) -> Option<usize> {
834        u16_length.checked_mul(3)
835    }
836
837    pub fn max_buffer_length_from_utf8_without_replacement(
838        &self,
839        byte_length: usize,
840    ) -> Option<usize> {
841        Some(byte_length)
842    }
843
844    pub fn encode_from_utf16_raw(
845        &mut self,
846        src: &[u16],
847        dst: &mut [u8],
848        _last: bool,
849    ) -> (EncoderResult, usize, usize) {
850        let (read, written) = convert_utf16_to_utf8_partial(src, dst);
851        (
852            if read == src.len() {
853                EncoderResult::InputEmpty
854            } else {
855                EncoderResult::OutputFull
856            },
857            read,
858            written,
859        )
860    }
861
862    pub fn encode_from_utf8_raw(
863        &mut self,
864        src: &str,
865        dst: &mut [u8],
866        _last: bool,
867    ) -> (EncoderResult, usize, usize) {
868        let bytes = src.as_bytes();
869        let mut to_write = bytes.len();
870        if to_write <= dst.len() {
871            (&mut dst[..to_write]).copy_from_slice(bytes);
872            return (EncoderResult::InputEmpty, to_write, to_write);
873        }
874        to_write = dst.len();
875        // Move back until we find a UTF-8 sequence boundary.
876        while (bytes[to_write] & 0xC0) == 0x80 {
877            to_write -= 1;
878        }
879        (&mut dst[..to_write]).copy_from_slice(&bytes[..to_write]);
880        (EncoderResult::OutputFull, to_write, to_write)
881    }
882}
883
884// Any copyright to the test code below this comment is dedicated to the
885// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
886
887#[cfg(all(test, feature = "alloc"))]
888mod tests {
889    use super::super::testing::*;
890    use super::super::*;
891
892    //    fn decode_utf8_to_utf16(bytes: &[u8], expect: &[u16]) {
893    //        decode_to_utf16_without_replacement(UTF_8, bytes, expect);
894    //    }
895
    /// Forwards `bytes` and `expect` to the shared `decode_to_utf8` test
    /// helper, fixing the encoding to `UTF_8`.
    ///
    /// NOTE(review): the helper is defined outside this file; presumably it
    /// asserts that decoding `bytes` yields `expect` — confirm there.
    fn decode_utf8_to_utf8(bytes: &[u8], expect: &str) {
        decode_to_utf8(UTF_8, bytes, expect);
    }
899
900    fn decode_valid_utf8(string: &str) {
901        decode_utf8_to_utf8(string.as_bytes(), string);
902    }
903
    /// Forwards the UTF-16 input `string` and the expected bytes `expect` to
    /// the shared `encode_from_utf16` test helper, fixing the encoding to
    /// `UTF_8`.
    ///
    /// NOTE(review): the helper is defined outside this file; presumably it
    /// asserts that encoding `string` produces `expect` — confirm there.
    fn encode_utf8_from_utf16(string: &[u16], expect: &[u8]) {
        encode_from_utf16(UTF_8, string, expect);
    }
907
    /// Forwards the UTF-8 input `string` and the expected bytes `expect` to
    /// the shared `encode_from_utf8` test helper, fixing the encoding to
    /// `UTF_8`.
    ///
    /// NOTE(review): the helper is defined outside this file; presumably it
    /// asserts that encoding `string` produces `expect` — confirm there.
    fn encode_utf8_from_utf8(string: &str, expect: &[u8]) {
        encode_from_utf8(UTF_8, string, expect);
    }
911
912    fn encode_utf8_from_utf16_with_output_limit(
913        string: &[u16],
914        expect: &str,
915        limit: usize,
916        expect_result: EncoderResult,
917    ) {
918        let mut dst = Vec::new();
919        {
920            dst.resize(limit, 0u8);
921            let mut encoder = UTF_8.new_encoder();
922            let (result, read, written) =
923                encoder.encode_from_utf16_without_replacement(string, &mut dst, false);
924            assert_eq!(result, expect_result);
925            if expect_result == EncoderResult::InputEmpty {
926                assert_eq!(read, string.len());
927            }
928            assert_eq!(&dst[..written], expect.as_bytes());
929        }
930        {
931            dst.resize(64, 0u8);
932            for (i, elem) in dst.iter_mut().enumerate() {
933                *elem = i as u8;
934            }
935            let mut encoder = UTF_8.new_encoder();
936            let (_, _, mut j) =
937                encoder.encode_from_utf16_without_replacement(string, &mut dst, false);
938            while j < dst.len() {
939                assert_eq!(usize::from(dst[j]), j);
940                j += 1;
941            }
942        }
943    }
944
945    #[test]
946    fn test_utf8_decode() {
947        // Empty
948        decode_valid_utf8("");
949        // ASCII
950        decode_valid_utf8("ab");
951        // Low BMP
952        decode_valid_utf8("a\u{E4}Z");
953        // High BMP
954        decode_valid_utf8("a\u{2603}Z");
955        // Astral
956        decode_valid_utf8("a\u{1F4A9}Z");
957        // Low BMP with last byte missing
958        decode_utf8_to_utf8(b"a\xC3Z", "a\u{FFFD}Z");
959        decode_utf8_to_utf8(b"a\xC3", "a\u{FFFD}");
960        // High BMP with last byte missing
961        decode_utf8_to_utf8(b"a\xE2\x98Z", "a\u{FFFD}Z");
962        decode_utf8_to_utf8(b"a\xE2\x98", "a\u{FFFD}");
963        // Astral with last byte missing
964        decode_utf8_to_utf8(b"a\xF0\x9F\x92Z", "a\u{FFFD}Z");
965        decode_utf8_to_utf8(b"a\xF0\x9F\x92", "a\u{FFFD}");
966        // Lone highest continuation
967        decode_utf8_to_utf8(b"a\xBFZ", "a\u{FFFD}Z");
968        decode_utf8_to_utf8(b"a\xBF", "a\u{FFFD}");
969        // Two lone highest continuations
970        decode_utf8_to_utf8(b"a\xBF\xBFZ", "a\u{FFFD}\u{FFFD}Z");
971        decode_utf8_to_utf8(b"a\xBF\xBF", "a\u{FFFD}\u{FFFD}");
972        // Low BMP followed by lowest lone continuation
973        decode_utf8_to_utf8(b"a\xC3\xA4\x80Z", "a\u{E4}\u{FFFD}Z");
974        decode_utf8_to_utf8(b"a\xC3\xA4\x80", "a\u{E4}\u{FFFD}");
975        // Low BMP followed by highest lone continuation
976        decode_utf8_to_utf8(b"a\xC3\xA4\xBFZ", "a\u{E4}\u{FFFD}Z");
977        decode_utf8_to_utf8(b"a\xC3\xA4\xBF", "a\u{E4}\u{FFFD}");
978        // High BMP followed by lowest lone continuation
979        decode_utf8_to_utf8(b"a\xE2\x98\x83\x80Z", "a\u{2603}\u{FFFD}Z");
980        decode_utf8_to_utf8(b"a\xE2\x98\x83\x80", "a\u{2603}\u{FFFD}");
981        // High BMP followed by highest lone continuation
982        decode_utf8_to_utf8(b"a\xE2\x98\x83\xBFZ", "a\u{2603}\u{FFFD}Z");
983        decode_utf8_to_utf8(b"a\xE2\x98\x83\xBF", "a\u{2603}\u{FFFD}");
984        // Astral followed by lowest lone continuation
985        decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\x80Z", "a\u{1F4A9}\u{FFFD}Z");
986        decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\x80", "a\u{1F4A9}\u{FFFD}");
987        // Astral followed by highest lone continuation
988        decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\xBFZ", "a\u{1F4A9}\u{FFFD}Z");
989        decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\xBF", "a\u{1F4A9}\u{FFFD}");
990
991        // Boundary conditions
992        // Lowest single-byte
993        decode_valid_utf8("Z\x00");
994        decode_valid_utf8("Z\x00Z");
995        // Lowest single-byte as two-byte overlong sequence
996        decode_utf8_to_utf8(b"a\xC0\x80", "a\u{FFFD}\u{FFFD}");
997        decode_utf8_to_utf8(b"a\xC0\x80Z", "a\u{FFFD}\u{FFFD}Z");
998        // Lowest single-byte as three-byte overlong sequence
999        decode_utf8_to_utf8(b"a\xE0\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1000        decode_utf8_to_utf8(b"a\xE0\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1001        // Lowest single-byte as four-byte overlong sequence
1002        decode_utf8_to_utf8(b"a\xF0\x80\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1003        decode_utf8_to_utf8(b"a\xF0\x80\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1004        // One below lowest single-byte
1005        decode_utf8_to_utf8(b"a\xFF", "a\u{FFFD}");
1006        decode_utf8_to_utf8(b"a\xFFZ", "a\u{FFFD}Z");
1007        // Highest single-byte
1008        decode_valid_utf8("a\x7F");
1009        decode_valid_utf8("a\x7FZ");
1010        // Highest single-byte as two-byte overlong sequence
1011        decode_utf8_to_utf8(b"a\xC1\xBF", "a\u{FFFD}\u{FFFD}");
1012        decode_utf8_to_utf8(b"a\xC1\xBFZ", "a\u{FFFD}\u{FFFD}Z");
1013        // Highest single-byte as three-byte overlong sequence
1014        decode_utf8_to_utf8(b"a\xE0\x81\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1015        decode_utf8_to_utf8(b"a\xE0\x81\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1016        // Highest single-byte as four-byte overlong sequence
1017        decode_utf8_to_utf8(b"a\xF0\x80\x81\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1018        decode_utf8_to_utf8(b"a\xF0\x80\x81\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1019        // One past highest single byte (also lone continuation)
1020        decode_utf8_to_utf8(b"a\x80Z", "a\u{FFFD}Z");
1021        decode_utf8_to_utf8(b"a\x80", "a\u{FFFD}");
1022        // Two lone continuations
1023        decode_utf8_to_utf8(b"a\x80\x80Z", "a\u{FFFD}\u{FFFD}Z");
1024        decode_utf8_to_utf8(b"a\x80\x80", "a\u{FFFD}\u{FFFD}");
1025        // Three lone continuations
1026        decode_utf8_to_utf8(b"a\x80\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1027        decode_utf8_to_utf8(b"a\x80\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1028        // Four lone continuations
1029        decode_utf8_to_utf8(b"a\x80\x80\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1030        decode_utf8_to_utf8(b"a\x80\x80\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1031        // Lowest two-byte
1032        decode_utf8_to_utf8(b"a\xC2\x80", "a\u{0080}");
1033        decode_utf8_to_utf8(b"a\xC2\x80Z", "a\u{0080}Z");
1034        // Lowest two-byte as three-byte overlong sequence
1035        decode_utf8_to_utf8(b"a\xE0\x82\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1036        decode_utf8_to_utf8(b"a\xE0\x82\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1037        // Lowest two-byte as four-byte overlong sequence
1038        decode_utf8_to_utf8(b"a\xF0\x80\x82\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1039        decode_utf8_to_utf8(b"a\xF0\x80\x82\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1040        // Lead one below lowest two-byte
1041        decode_utf8_to_utf8(b"a\xC1\x80", "a\u{FFFD}\u{FFFD}");
1042        decode_utf8_to_utf8(b"a\xC1\x80Z", "a\u{FFFD}\u{FFFD}Z");
1043        // Trail one below lowest two-byte
1044        decode_utf8_to_utf8(b"a\xC2\x7F", "a\u{FFFD}\u{007F}");
1045        decode_utf8_to_utf8(b"a\xC2\x7FZ", "a\u{FFFD}\u{007F}Z");
1046        // Highest two-byte
1047        decode_utf8_to_utf8(b"a\xDF\xBF", "a\u{07FF}");
1048        decode_utf8_to_utf8(b"a\xDF\xBFZ", "a\u{07FF}Z");
1049        // Highest two-byte as three-byte overlong sequence
1050        decode_utf8_to_utf8(b"a\xE0\x9F\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1051        decode_utf8_to_utf8(b"a\xE0\x9F\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1052        // Highest two-byte as four-byte overlong sequence
1053        decode_utf8_to_utf8(b"a\xF0\x80\x9F\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1054        decode_utf8_to_utf8(b"a\xF0\x80\x9F\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1055        // Lowest three-byte
1056        decode_utf8_to_utf8(b"a\xE0\xA0\x80", "a\u{0800}");
1057        decode_utf8_to_utf8(b"a\xE0\xA0\x80Z", "a\u{0800}Z");
1058        // Lowest three-byte as four-byte overlong sequence
1059        decode_utf8_to_utf8(b"a\xF0\x80\xA0\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1060        decode_utf8_to_utf8(b"a\xF0\x80\xA0\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1061        // Highest below surrogates
1062        decode_utf8_to_utf8(b"a\xED\x9F\xBF", "a\u{D7FF}");
1063        decode_utf8_to_utf8(b"a\xED\x9F\xBFZ", "a\u{D7FF}Z");
1064        // Highest below surrogates as four-byte overlong sequence
1065        decode_utf8_to_utf8(b"a\xF0\x8D\x9F\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1066        decode_utf8_to_utf8(b"a\xF0\x8D\x9F\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1067        // First surrogate
1068        decode_utf8_to_utf8(b"a\xED\xA0\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1069        decode_utf8_to_utf8(b"a\xED\xA0\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1070        // First surrogate as four-byte overlong sequence
1071        decode_utf8_to_utf8(b"a\xF0\x8D\xA0\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1072        decode_utf8_to_utf8(b"a\xF0\x8D\xA0\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1073        // Last surrogate
1074        decode_utf8_to_utf8(b"a\xED\xBF\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1075        decode_utf8_to_utf8(b"a\xED\xBF\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1076        // Last surrogate as four-byte overlong sequence
1077        decode_utf8_to_utf8(b"a\xF0\x8D\xBF\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1078        decode_utf8_to_utf8(b"a\xF0\x8D\xBF\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1079        // Lowest above surrogates
1080        decode_utf8_to_utf8(b"a\xEE\x80\x80", "a\u{E000}");
1081        decode_utf8_to_utf8(b"a\xEE\x80\x80Z", "a\u{E000}Z");
1082        // Lowest above surrogates as four-byte overlong sequence
1083        decode_utf8_to_utf8(b"a\xF0\x8E\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1084        decode_utf8_to_utf8(b"a\xF0\x8E\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1085        // Highest three-byte
1086        decode_utf8_to_utf8(b"a\xEF\xBF\xBF", "a\u{FFFF}");
1087        decode_utf8_to_utf8(b"a\xEF\xBF\xBFZ", "a\u{FFFF}Z");
1088        // Highest three-byte as four-byte overlong sequence
1089        decode_utf8_to_utf8(b"a\xF0\x8F\xBF\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1090        decode_utf8_to_utf8(b"a\xF0\x8F\xBF\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1091        // Lowest four-byte
1092        decode_utf8_to_utf8(b"a\xF0\x90\x80\x80", "a\u{10000}");
1093        decode_utf8_to_utf8(b"a\xF0\x90\x80\x80Z", "a\u{10000}Z");
1094        // Highest four-byte
1095        decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xBF", "a\u{10FFFF}");
1096        decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xBFZ", "a\u{10FFFF}Z");
1097        // One past highest four-byte
1098        decode_utf8_to_utf8(b"a\xF4\x90\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1099        decode_utf8_to_utf8(b"a\xF4\x90\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1100
1101        // Highest four-byte with last byte replaced with 0xFF
1102        decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xFF", "a\u{FFFD}\u{FFFD}");
1103        decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xFFZ", "a\u{FFFD}\u{FFFD}Z");
1104    }
1105
1106    #[test]
1107    fn test_utf8_encode() {
1108        // Empty
1109        encode_utf8_from_utf16(&[], b"");
1110        encode_utf8_from_utf8("", b"");
1111
1112        encode_utf8_from_utf16(&[0x0000], "\u{0000}".as_bytes());
1113        encode_utf8_from_utf16(&[0x007F], "\u{007F}".as_bytes());
1114        encode_utf8_from_utf16(&[0x0080], "\u{0080}".as_bytes());
1115        encode_utf8_from_utf16(&[0x07FF], "\u{07FF}".as_bytes());
1116        encode_utf8_from_utf16(&[0x0800], "\u{0800}".as_bytes());
1117        encode_utf8_from_utf16(&[0xD7FF], "\u{D7FF}".as_bytes());
1118        encode_utf8_from_utf16(&[0xD800], "\u{FFFD}".as_bytes());
1119        encode_utf8_from_utf16(&[0xD800, 0x0062], "\u{FFFD}\u{0062}".as_bytes());
1120        encode_utf8_from_utf16(&[0xDFFF], "\u{FFFD}".as_bytes());
1121        encode_utf8_from_utf16(&[0xDFFF, 0x0062], "\u{FFFD}\u{0062}".as_bytes());
1122        encode_utf8_from_utf16(&[0xE000], "\u{E000}".as_bytes());
1123        encode_utf8_from_utf16(&[0xFFFF], "\u{FFFF}".as_bytes());
1124        encode_utf8_from_utf16(&[0xD800, 0xDC00], "\u{10000}".as_bytes());
1125        encode_utf8_from_utf16(&[0xDBFF, 0xDFFF], "\u{10FFFF}".as_bytes());
1126        encode_utf8_from_utf16(&[0xDC00, 0xDEDE], "\u{FFFD}\u{FFFD}".as_bytes());
1127    }
1128
1129    #[test]
1130    fn test_encode_utf8_from_utf16_with_output_limit() {
1131        encode_utf8_from_utf16_with_output_limit(&[0x0062], "\u{62}", 1, EncoderResult::InputEmpty);
1132        encode_utf8_from_utf16_with_output_limit(&[0x00A7], "\u{A7}", 2, EncoderResult::InputEmpty);
1133        encode_utf8_from_utf16_with_output_limit(
1134            &[0x2603],
1135            "\u{2603}",
1136            3,
1137            EncoderResult::InputEmpty,
1138        );
1139        encode_utf8_from_utf16_with_output_limit(
1140            &[0xD83D, 0xDCA9],
1141            "\u{1F4A9}",
1142            4,
1143            EncoderResult::InputEmpty,
1144        );
1145
1146        encode_utf8_from_utf16_with_output_limit(&[0x00A7], "", 1, EncoderResult::OutputFull);
1147        encode_utf8_from_utf16_with_output_limit(&[0x2603], "", 2, EncoderResult::OutputFull);
1148        encode_utf8_from_utf16_with_output_limit(
1149            &[0xD83D, 0xDCA9],
1150            "",
1151            3,
1152            EncoderResult::OutputFull,
1153        );
1154
1155        encode_utf8_from_utf16_with_output_limit(
1156            &[0x0063, 0x0062],
1157            "\u{63}\u{62}",
1158            2,
1159            EncoderResult::InputEmpty,
1160        );
1161        encode_utf8_from_utf16_with_output_limit(
1162            &[0x0063, 0x00A7],
1163            "\u{63}\u{A7}",
1164            3,
1165            EncoderResult::InputEmpty,
1166        );
1167        encode_utf8_from_utf16_with_output_limit(
1168            &[0x0063, 0x2603],
1169            "\u{63}\u{2603}",
1170            4,
1171            EncoderResult::InputEmpty,
1172        );
1173        encode_utf8_from_utf16_with_output_limit(
1174            &[0x0063, 0xD83D, 0xDCA9],
1175            "\u{63}\u{1F4A9}",
1176            5,
1177            EncoderResult::InputEmpty,
1178        );
1179
1180        encode_utf8_from_utf16_with_output_limit(
1181            &[0x0063, 0x00A7],
1182            "\u{63}",
1183            2,
1184            EncoderResult::OutputFull,
1185        );
1186        encode_utf8_from_utf16_with_output_limit(
1187            &[0x0063, 0x2603],
1188            "\u{63}",
1189            3,
1190            EncoderResult::OutputFull,
1191        );
1192        encode_utf8_from_utf16_with_output_limit(
1193            &[0x0063, 0xD83D, 0xDCA9],
1194            "\u{63}",
1195            4,
1196            EncoderResult::OutputFull,
1197        );
1198
1199        encode_utf8_from_utf16_with_output_limit(
1200            &[0x00B6, 0x0062],
1201            "\u{B6}\u{62}",
1202            3,
1203            EncoderResult::InputEmpty,
1204        );
1205        encode_utf8_from_utf16_with_output_limit(
1206            &[0x00B6, 0x00A7],
1207            "\u{B6}\u{A7}",
1208            4,
1209            EncoderResult::InputEmpty,
1210        );
1211        encode_utf8_from_utf16_with_output_limit(
1212            &[0x00B6, 0x2603],
1213            "\u{B6}\u{2603}",
1214            5,
1215            EncoderResult::InputEmpty,
1216        );
1217        encode_utf8_from_utf16_with_output_limit(
1218            &[0x00B6, 0xD83D, 0xDCA9],
1219            "\u{B6}\u{1F4A9}",
1220            6,
1221            EncoderResult::InputEmpty,
1222        );
1223
1224        encode_utf8_from_utf16_with_output_limit(
1225            &[0x00B6, 0x00A7],
1226            "\u{B6}",
1227            3,
1228            EncoderResult::OutputFull,
1229        );
1230        encode_utf8_from_utf16_with_output_limit(
1231            &[0x00B6, 0x2603],
1232            "\u{B6}",
1233            4,
1234            EncoderResult::OutputFull,
1235        );
1236        encode_utf8_from_utf16_with_output_limit(
1237            &[0x00B6, 0xD83D, 0xDCA9],
1238            "\u{B6}",
1239            5,
1240            EncoderResult::OutputFull,
1241        );
1242
1243        encode_utf8_from_utf16_with_output_limit(
1244            &[0x263A, 0x0062],
1245            "\u{263A}\u{62}",
1246            4,
1247            EncoderResult::InputEmpty,
1248        );
1249        encode_utf8_from_utf16_with_output_limit(
1250            &[0x263A, 0x00A7],
1251            "\u{263A}\u{A7}",
1252            5,
1253            EncoderResult::InputEmpty,
1254        );
1255        encode_utf8_from_utf16_with_output_limit(
1256            &[0x263A, 0x2603],
1257            "\u{263A}\u{2603}",
1258            6,
1259            EncoderResult::InputEmpty,
1260        );
1261        encode_utf8_from_utf16_with_output_limit(
1262            &[0x263A, 0xD83D, 0xDCA9],
1263            "\u{263A}\u{1F4A9}",
1264            7,
1265            EncoderResult::InputEmpty,
1266        );
1267
1268        encode_utf8_from_utf16_with_output_limit(
1269            &[0x263A, 0x00A7],
1270            "\u{263A}",
1271            4,
1272            EncoderResult::OutputFull,
1273        );
1274        encode_utf8_from_utf16_with_output_limit(
1275            &[0x263A, 0x2603],
1276            "\u{263A}",
1277            5,
1278            EncoderResult::OutputFull,
1279        );
1280        encode_utf8_from_utf16_with_output_limit(
1281            &[0x263A, 0xD83D, 0xDCA9],
1282            "\u{263A}",
1283            6,
1284            EncoderResult::OutputFull,
1285        );
1286
1287        encode_utf8_from_utf16_with_output_limit(
1288            &[0xD83D, 0xDE0E, 0x0062],
1289            "\u{1F60E}\u{62}",
1290            5,
1291            EncoderResult::InputEmpty,
1292        );
1293        encode_utf8_from_utf16_with_output_limit(
1294            &[0xD83D, 0xDE0E, 0x00A7],
1295            "\u{1F60E}\u{A7}",
1296            6,
1297            EncoderResult::InputEmpty,
1298        );
1299        encode_utf8_from_utf16_with_output_limit(
1300            &[0xD83D, 0xDE0E, 0x2603],
1301            "\u{1F60E}\u{2603}",
1302            7,
1303            EncoderResult::InputEmpty,
1304        );
1305        encode_utf8_from_utf16_with_output_limit(
1306            &[0xD83D, 0xDE0E, 0xD83D, 0xDCA9],
1307            "\u{1F60E}\u{1F4A9}",
1308            8,
1309            EncoderResult::InputEmpty,
1310        );
1311
1312        encode_utf8_from_utf16_with_output_limit(
1313            &[0xD83D, 0xDE0E, 0x00A7],
1314            "\u{1F60E}",
1315            5,
1316            EncoderResult::OutputFull,
1317        );
1318        encode_utf8_from_utf16_with_output_limit(
1319            &[0xD83D, 0xDE0E, 0x2603],
1320            "\u{1F60E}",
1321            6,
1322            EncoderResult::OutputFull,
1323        );
1324        encode_utf8_from_utf16_with_output_limit(
1325            &[0xD83D, 0xDE0E, 0xD83D, 0xDCA9],
1326            "\u{1F60E}",
1327            7,
1328            EncoderResult::OutputFull,
1329        );
1330
1331        encode_utf8_from_utf16_with_output_limit(
1332            &[0x0063, 0x00B6, 0x0062, 0x0062],
1333            "\u{63}\u{B6}\u{62}\u{62}",
1334            5,
1335            EncoderResult::InputEmpty,
1336        );
1337        encode_utf8_from_utf16_with_output_limit(
1338            &[0x0063, 0x00B6, 0x0062, 0x0062],
1339            "\u{63}\u{B6}\u{62}",
1340            4,
1341            EncoderResult::OutputFull,
1342        );
1343
1344        encode_utf8_from_utf16_with_output_limit(
1345            &[0x0063, 0x00B6, 0x0062, 0x0062, 0x0062],
1346            "\u{63}\u{B6}\u{62}\u{62}\u{62}",
1347            6,
1348            EncoderResult::InputEmpty,
1349        );
1350        encode_utf8_from_utf16_with_output_limit(
1351            &[0x0063, 0x00B6, 0x0062, 0x0062, 0x0062],
1352            "\u{63}\u{B6}\u{62}\u{62}",
1353            5,
1354            EncoderResult::OutputFull,
1355        );
1356
1357        encode_utf8_from_utf16_with_output_limit(
1358            &[0x263A, 0x0062, 0x0062],
1359            "\u{263A}\u{62}\u{62}",
1360            5,
1361            EncoderResult::InputEmpty,
1362        );
1363        encode_utf8_from_utf16_with_output_limit(
1364            &[0x263A, 0x0062, 0x0062],
1365            "\u{263A}\u{62}",
1366            4,
1367            EncoderResult::OutputFull,
1368        );
1369
1370        encode_utf8_from_utf16_with_output_limit(
1371            &[0x263A, 0x0062, 0x0062, 0x0062],
1372            "\u{263A}\u{62}\u{62}\u{62}",
1373            6,
1374            EncoderResult::InputEmpty,
1375        );
1376        encode_utf8_from_utf16_with_output_limit(
1377            &[0x263A, 0x0062, 0x0062, 0x0062],
1378            "\u{263A}\u{62}\u{62}",
1379            5,
1380            EncoderResult::OutputFull,
1381        );
1382
1383        encode_utf8_from_utf16_with_output_limit(
1384            &[0x0063, 0x00B6, 0x00A7],
1385            "\u{63}\u{B6}\u{A7}",
1386            5,
1387            EncoderResult::InputEmpty,
1388        );
1389        encode_utf8_from_utf16_with_output_limit(
1390            &[0x0063, 0x00B6, 0x00A7],
1391            "\u{63}\u{B6}",
1392            4,
1393            EncoderResult::OutputFull,
1394        );
1395
1396        encode_utf8_from_utf16_with_output_limit(
1397            &[0x0063, 0x00B6, 0x00A7, 0x0062],
1398            "\u{63}\u{B6}\u{A7}\u{62}",
1399            6,
1400            EncoderResult::InputEmpty,
1401        );
1402        encode_utf8_from_utf16_with_output_limit(
1403            &[0x0063, 0x00B6, 0x00A7, 0x0062],
1404            "\u{63}\u{B6}\u{A7}",
1405            5,
1406            EncoderResult::OutputFull,
1407        );
1408
1409        encode_utf8_from_utf16_with_output_limit(
1410            &[0x263A, 0x00A7, 0x0062],
1411            "\u{263A}\u{A7}\u{62}",
1412            6,
1413            EncoderResult::InputEmpty,
1414        );
1415        encode_utf8_from_utf16_with_output_limit(
1416            &[0x263A, 0x00A7, 0x0062],
1417            "\u{263A}\u{A7}",
1418            5,
1419            EncoderResult::OutputFull,
1420        );
1421
1422        encode_utf8_from_utf16_with_output_limit(
1423            &[0x0063, 0x00B6, 0x0062, 0x00A7],
1424            "\u{63}\u{B6}\u{62}\u{A7}",
1425            6,
1426            EncoderResult::InputEmpty,
1427        );
1428        encode_utf8_from_utf16_with_output_limit(
1429            &[0x0063, 0x00B6, 0x0062, 0x00A7],
1430            "\u{63}\u{B6}\u{62}",
1431            5,
1432            EncoderResult::OutputFull,
1433        );
1434
1435        encode_utf8_from_utf16_with_output_limit(
1436            &[0x263A, 0x0062, 0x00A7],
1437            "\u{263A}\u{62}\u{A7}",
1438            6,
1439            EncoderResult::InputEmpty,
1440        );
1441        encode_utf8_from_utf16_with_output_limit(
1442            &[0x263A, 0x0062, 0x00A7],
1443            "\u{263A}\u{62}",
1444            5,
1445            EncoderResult::OutputFull,
1446        );
1447
1448        encode_utf8_from_utf16_with_output_limit(
1449            &[0x0063, 0x00B6, 0x2603],
1450            "\u{63}\u{B6}\u{2603}",
1451            6,
1452            EncoderResult::InputEmpty,
1453        );
1454        encode_utf8_from_utf16_with_output_limit(
1455            &[0x0063, 0x00B6, 0x2603],
1456            "\u{63}\u{B6}",
1457            5,
1458            EncoderResult::OutputFull,
1459        );
1460
1461        encode_utf8_from_utf16_with_output_limit(
1462            &[0x263A, 0x2603],
1463            "\u{263A}\u{2603}",
1464            6,
1465            EncoderResult::InputEmpty,
1466        );
1467        encode_utf8_from_utf16_with_output_limit(
1468            &[0x263A, 0x2603],
1469            "\u{263A}",
1470            5,
1471            EncoderResult::OutputFull,
1472        );
1473
1474        encode_utf8_from_utf16_with_output_limit(
1475            &[0x0063, 0x00B6, 0xD83D],
1476            "\u{63}\u{B6}\u{FFFD}",
1477            6,
1478            EncoderResult::InputEmpty,
1479        );
1480        encode_utf8_from_utf16_with_output_limit(
1481            &[0x0063, 0x00B6, 0xD83D],
1482            "\u{63}\u{B6}",
1483            5,
1484            EncoderResult::OutputFull,
1485        );
1486
1487        encode_utf8_from_utf16_with_output_limit(
1488            &[0x263A, 0xD83D],
1489            "\u{263A}\u{FFFD}",
1490            6,
1491            EncoderResult::InputEmpty,
1492        );
1493        encode_utf8_from_utf16_with_output_limit(
1494            &[0x263A, 0xD83D],
1495            "\u{263A}",
1496            5,
1497            EncoderResult::OutputFull,
1498        );
1499
1500        encode_utf8_from_utf16_with_output_limit(
1501            &[0x0063, 0x00B6, 0xDCA9],
1502            "\u{63}\u{B6}\u{FFFD}",
1503            6,
1504            EncoderResult::InputEmpty,
1505        );
1506        encode_utf8_from_utf16_with_output_limit(
1507            &[0x0063, 0x00B6, 0xDCA9],
1508            "\u{63}\u{B6}",
1509            5,
1510            EncoderResult::OutputFull,
1511        );
1512
1513        encode_utf8_from_utf16_with_output_limit(
1514            &[0x263A, 0xDCA9],
1515            "\u{263A}\u{FFFD}",
1516            6,
1517            EncoderResult::InputEmpty,
1518        );
1519        encode_utf8_from_utf16_with_output_limit(
1520            &[0x263A, 0xDCA9],
1521            "\u{263A}",
1522            5,
1523            EncoderResult::OutputFull,
1524        );
1525    }
1526
/// Encodes four non-surrogate UTF-16 code units into a buffer truncated to the
/// length reported by `max_buffer_length_from_utf16_without_replacement` and
/// expects the encoder to finish with `EncoderResult::InputEmpty`.
#[test]
fn test_utf8_max_length_from_utf16() {
    let units = &[0x2C9Fu16, 0x2CA9u16, 0x2CA3u16, 0x2C9Fu16];
    let mut buffer = [0u8; 13];
    let mut encoder = UTF_8.new_encoder();

    // Ask the encoder how much room it wants for this many code units.
    let capacity = encoder
        .max_buffer_length_from_utf16_without_replacement(units.len())
        .unwrap();

    // Hand it exactly that much space; the whole input must be consumed.
    let outcome =
        encoder.encode_from_utf16_without_replacement(units, &mut buffer[..capacity], true);
    assert_eq!(outcome.0, EncoderResult::InputEmpty);
}
1539
/// Decodes the bytes 0xEF, 0xBF, 0xBE in three separate one-byte calls, with
/// only the last call flagged as final.
///
/// Every call must report `CoderResult::InputEmpty`, consume its single byte
/// and signal no errors. Only the final call writes output: the single code
/// unit 0xFFFE.
#[test]
fn test_decode_bom_prefixed_split_byte_triple() {
    let mut decoder = UTF_8.new_decoder();
    let mut utf16 = [0u16; 20];

    // (input chunk, `last` flag, expected number of code units written)
    let steps = [
        (&b"\xEF"[..], false, 0usize),
        (&b"\xBF"[..], false, 0usize),
        (&b"\xBE"[..], true, 1usize),
    ];

    for &(chunk, last, expected_written) in steps.iter() {
        // The output slice is re-sized before every call, as the decoder's
        // buffered state may change the bound it reports.
        let capacity = decoder.max_utf16_buffer_length(1).unwrap();
        let (status, consumed, produced, errored) =
            decoder.decode_to_utf16(chunk, &mut utf16[..capacity], last);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, expected_written);
        assert!(!errored);
    }

    // EF BF BE is the UTF-8 form of U+FFFE.
    assert_eq!(utf16[0], 0xFFFE);
}
1573
/// Decodes 0xEF followed by 0xBC in two one-byte calls, the second flagged as
/// final.
///
/// The first call must consume its byte silently (nothing written, no error).
/// The second must consume its byte, write a single U+FFFD and report that
/// errors occurred.
#[test]
fn test_decode_bom_prefixed_split_byte_pair() {
    let mut decoder = UTF_8.new_decoder();
    let mut utf16 = [0u16; 20];

    // First byte, with more input still to come.
    let capacity = decoder.max_utf16_buffer_length(1).unwrap();
    let first = decoder.decode_to_utf16(b"\xEF", &mut utf16[..capacity], false);
    assert_eq!(first.0, CoderResult::InputEmpty);
    assert_eq!(first.1, 1);
    assert_eq!(first.2, 0);
    assert!(!first.3);

    // Second byte ends the stream.
    let capacity = decoder.max_utf16_buffer_length(1).unwrap();
    let second = decoder.decode_to_utf16(b"\xBC", &mut utf16[..capacity], true);
    assert_eq!(second.0, CoderResult::InputEmpty);
    assert_eq!(second.1, 1);
    assert_eq!(second.2, 1);
    assert!(second.3);
    assert_eq!(utf16[0], 0xFFFD);
}
1598
/// Decodes a lone 0xEF byte in a single call flagged as final.
///
/// The call must report `CoderResult::InputEmpty`, consume the byte, write a
/// single U+FFFD and report that errors occurred.
#[test]
fn test_decode_bom_prefix() {
    let mut decoder = UTF_8.new_decoder();
    let mut utf16 = [0u16; 20];

    let capacity = decoder.max_utf16_buffer_length(1).unwrap();
    let (status, consumed, produced, errored) =
        decoder.decode_to_utf16(b"\xEF", &mut utf16[..capacity], true);

    assert_eq!(status, CoderResult::InputEmpty);
    assert_eq!(consumed, 1);
    assert_eq!(produced, 1);
    assert!(errored);
    assert_eq!(utf16[0], 0xFFFD);
}
1614
/// Decodes the UTF-8 bytes of "\u{E4}a" into a one-element UTF-16 buffer using
/// a decoder created without BOM handling.
///
/// The call must report `CoderResult::OutputFull` with two bytes read and one
/// code unit (0x00E4) written, and no errors.
#[test]
fn test_tail() {
    let mut decoder = UTF_8.new_decoder_without_bom_handling();
    let mut utf16 = [0u16; 1];
    let bytes = "\u{E4}a".as_bytes();

    let outcome = decoder.decode_to_utf16(bytes, &mut utf16[..], false);

    assert_eq!(outcome.0, CoderResult::OutputFull);
    assert_eq!(outcome.1, 2);
    assert_eq!(outcome.2, 1);
    assert!(!outcome.3);
    assert_eq!(utf16[0], 0x00E4);
}
1629}
encoding_rs/utf_8.rs

encoding_rs/
utf_8.rs