encoding_rs/
lib.rs

Help
1// Copyright Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10// The above license applies to code in this file. The label data in
11// this file is generated from WHATWG's encodings.json, which came under
12// the following license:
13
14// Copyright © WHATWG (Apple, Google, Mozilla, Microsoft).
15//
16// Redistribution and use in source and binary forms, with or without
17// modification, are permitted provided that the following conditions are met:
18//
19// 1. Redistributions of source code must retain the above copyright notice, this
20//    list of conditions and the following disclaimer.
21//
22// 2. Redistributions in binary form must reproduce the above copyright notice,
23//    this list of conditions and the following disclaimer in the documentation
24//    and/or other materials provided with the distribution.
25//
26// 3. Neither the name of the copyright holder nor the names of its
27//    contributors may be used to endorse or promote products derived from
28//    this software without specific prior written permission.
29//
30// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
31// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
33// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
34// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
36// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
37// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
38// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
39// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40
41#![cfg_attr(
42    feature = "cargo-clippy",
43    allow(doc_markdown, inline_always, new_ret_no_self)
44)]
45
46//! encoding_rs is a Gecko-oriented Free Software / Open Source implementation
47//! of the [Encoding Standard](https://encoding.spec.whatwg.org/) in Rust.
48//! Gecko-oriented means that converting to and from UTF-16 is supported in
49//! addition to converting to and from UTF-8, that the performance and
50//! streamability goals are browser-oriented, and that FFI-friendliness is a
51//! goal.
52//!
53//! Additionally, the `mem` module provides functions that are useful for
54//! applications that need to be able to deal with legacy in-memory
55//! representations of Unicode.
56//!
57//! For expectation setting, please be sure to read the sections
58//! [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes),
59//! [_ISO-8859-1_](#iso-8859-1) and [_Web / Browser Focus_](#web--browser-focus) below.
60//!
61//! There is a [long-form write-up](https://hsivonen.fi/encoding_rs/) about the
62//! design and internals of the crate.
63//!
64//! # Availability
65//!
66//! The code is available under the
67//! [Apache license, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)
68//! or the [MIT license](https://opensource.org/licenses/MIT), at your option.
69//! See the
70//! [`COPYRIGHT`](https://github.com/hsivonen/encoding_rs/blob/master/COPYRIGHT)
71//! file for details.
72//! The [repository is on GitHub](https://github.com/hsivonen/encoding_rs). The
73//! [crate is available on crates.io](https://crates.io/crates/encoding_rs).
74//!
75//! # Integration with `std::io`
76//!
77//! This crate doesn't implement traits from `std::io`. However, the case of
78//! wrapping a `std::io::Read` in a decoder that implements `std::io::Read` and
79//! presents the data from the wrapped `std::io::Read` as UTF-8 is addressed by
80//! the [`encoding_rs_io`](https://docs.rs/encoding_rs_io/) crate.
81//!
82//! # Examples
83//!
84//! Example programs:
85//!
86//! * [Rust](https://github.com/hsivonen/recode_rs)
87//! * [C](https://github.com/hsivonen/recode_c)
88//! * [C++](https://github.com/hsivonen/recode_cpp)
89//!
90//! Decode using the non-streaming API:
91//!
92//! ```
93//! #[cfg(feature = "alloc")] {
94//! use encoding_rs::*;
95//!
96//! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
97//! let bytes = b"\x83n\x83\x8D\x81[\x81E\x83\x8F\x81[\x83\x8B\x83h";
98//!
99//! let (cow, encoding_used, had_errors) = SHIFT_JIS.decode(bytes);
100//! assert_eq!(&cow[..], expectation);
101//! assert_eq!(encoding_used, SHIFT_JIS);
102//! assert!(!had_errors);
103//! }
104//! ```
105//!
106//! Decode using the streaming API with minimal `unsafe`:
107//!
108//! ```
109//! use encoding_rs::*;
110//!
111//! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
112//!
113//! // Use an array of byte slices to demonstrate content arriving piece by
114//! // piece from the network.
115//! let bytes: [&'static [u8]; 4] = [b"\x83",
116//!                                  b"n\x83\x8D\x81",
117//!                                  b"[\x81E\x83\x8F\x81[\x83",
118//!                                  b"\x8B\x83h"];
119//!
120//! // Very short output buffer to demonstrate the output buffer getting full.
121//! // Normally, you'd use something like `[0u8; 2048]`.
122//! let mut buffer_bytes = [0u8; 8];
123//! let mut buffer: &mut str = std::str::from_utf8_mut(&mut buffer_bytes[..]).unwrap();
124//!
125//! // How many bytes in the buffer currently hold significant data.
126//! let mut bytes_in_buffer = 0usize;
127//!
128//! // Collect the output to a string for demonstration purposes.
129//! let mut output = String::new();
130//!
131//! // The `Decoder`
132//! let mut decoder = SHIFT_JIS.new_decoder();
133//!
134//! // Track whether we see errors.
135//! let mut total_had_errors = false;
136//!
137//! // Decode using a fixed-size intermediate buffer (for demonstrating the
138//! // use of a fixed-size buffer; normally when the output of an incremental
139//! // decode goes to a `String` one would use `Decoder.decode_to_string()` to
140//! // avoid the intermediate buffer).
141//! for input in &bytes[..] {
142//!     // The number of bytes already read from current `input` in total.
143//!     let mut total_read_from_current_input = 0usize;
144//!
145//!     loop {
146//!         let (result, read, written, had_errors) =
147//!             decoder.decode_to_str(&input[total_read_from_current_input..],
148//!                                   &mut buffer[bytes_in_buffer..],
149//!                                   false);
150//!         total_read_from_current_input += read;
151//!         bytes_in_buffer += written;
152//!         total_had_errors |= had_errors;
153//!         match result {
154//!             CoderResult::InputEmpty => {
155//!                 // We have consumed the current input buffer. Break out of
156//!                 // the inner loop to get the next input buffer from the
157//!                 // outer loop.
158//!                 break;
159//!             },
160//!             CoderResult::OutputFull => {
161//!                 // Write the current buffer out and consider the buffer
162//!                 // empty.
163//!                 output.push_str(&buffer[..bytes_in_buffer]);
164//!                 bytes_in_buffer = 0usize;
165//!                 continue;
166//!             }
167//!         }
168//!     }
169//! }
170//!
171//! // Process EOF
172//! loop {
173//!     let (result, _, written, had_errors) =
174//!         decoder.decode_to_str(b"",
175//!                               &mut buffer[bytes_in_buffer..],
176//!                               true);
177//!     bytes_in_buffer += written;
178//!     total_had_errors |= had_errors;
179//!     // Write the current buffer out and consider the buffer empty.
180//!     // Need to do this here for both `match` arms, because we exit the
181//!     // loop on `CoderResult::InputEmpty`.
182//!     output.push_str(&buffer[..bytes_in_buffer]);
183//!     bytes_in_buffer = 0usize;
184//!     match result {
185//!         CoderResult::InputEmpty => {
186//!             // Done!
187//!             break;
188//!         },
189//!         CoderResult::OutputFull => {
190//!             continue;
191//!         }
192//!     }
193//! }
194//!
195//! assert_eq!(&output[..], expectation);
196//! assert!(!total_had_errors);
197//! ```
198//!
199//! ## UTF-16LE, UTF-16BE and Unicode Encoding Schemes
200//!
201//! The Encoding Standard doesn't specify encoders for UTF-16LE and UTF-16BE,
202//! __so this crate does not provide encoders for those encodings__!
203//! Along with the replacement encoding, their _output encoding_ (i.e. the
204//! encoding used for form submission and error handling in the query string
205//! of URLs) is UTF-8, so you get a UTF-8 encoder if you request an encoder
206//! for them.
207//!
208//! Additionally, the Encoding Standard factors BOM handling into wrapper
209//! algorithms so that BOM handling isn't part of the definition of the
210//! encodings themselves. The Unicode _encoding schemes_ in the Unicode
211//! Standard define BOM handling or lack thereof as part of the encoding
212//! scheme.
213//!
214//! When used with the `_without_bom_handling` entry points, the UTF-16LE
215//! and UTF-16BE _encodings_ match the same-named _encoding schemes_ from
216//! the Unicode Standard.
217//!
218//! When used with the `_with_bom_removal` entry points, the UTF-8
219//! _encoding_ matches the UTF-8 _encoding scheme_ from the Unicode
220//! Standard.
221//!
222//! This crate does not provide a mode that matches the UTF-16 _encoding
223//! scheme_ from the Unicode Standard. The UTF-16BE encoding used with
224//! the entry points without `_bom_` qualifiers is the closest match,
225//! but in that case, the UTF-8 BOM triggers UTF-8 decoding, which is
226//! not part of the behavior of the UTF-16 _encoding scheme_ per the
227//! Unicode Standard.
228//!
229//! The UTF-32 family of Unicode encoding schemes is not supported
230//! by this crate. The Encoding Standard doesn't define any UTF-32
231//! family encodings, since they aren't necessary for consuming Web
232//! content.
233//!
234//! While gb18030 is capable of representing U+FEFF, the Encoding
235//! Standard does not treat the gb18030 byte representation of U+FEFF
236//! as a BOM, so neither does this crate.
237//!
238//! ## ISO-8859-1
239//!
240//! ISO-8859-1 does not exist as a distinct encoding from windows-1252 in
241//! the Encoding Standard. Therefore, an encoding that maps the unsigned
242//! byte value to the same Unicode scalar value is not available via
243//! `Encoding` in this crate.
244//!
245//! However, the functions whose name starts with `convert` and contains
246//! `latin1` in the `mem` module support such conversions, which are known as
247//! [_isomorphic decode_](https://infra.spec.whatwg.org/#isomorphic-decode)
248//! and [_isomorphic encode_](https://infra.spec.whatwg.org/#isomorphic-encode)
249//! in the [Infra Standard](https://infra.spec.whatwg.org/).
250//!
251//! ## Web / Browser Focus
252//!
253//! Both in terms of scope and performance, the focus is on the Web. For scope,
254//! this means that encoding_rs implements the Encoding Standard fully and
255//! doesn't implement encodings that are not specified in the Encoding
256//! Standard. For performance, this means that decoding performance is
257//! important as well as performance for encoding into UTF-8 or encoding the
258//! Basic Latin range (ASCII) into legacy encodings. Non-Basic Latin needs to
259//! be encoded into legacy encodings in only two places in the Web platform: in
260//! the query part of URLs, in which case it's a matter of relatively rare
261//! error handling, and in form submission, in which case the user action and
262//! networking tend to hide the performance of the encoder.
263//!
264//! Deemphasizing performance of encoding non-Basic Latin text into legacy
265//! encodings enables smaller code size thanks to the encoder side using the
266//! decode-optimized data tables without having encode-optimized data tables at
267//! all. Even in decoders, smaller lookup table size is preferred over avoiding
268//! multiplication operations.
269//!
270//! Additionally, performance is a non-goal for the ASCII-incompatible
271//! ISO-2022-JP encoding, which is rarely used on the Web. Instead of
272//! performance, the decoder for ISO-2022-JP optimizes for ease/clarity
273//! of implementation.
274//!
275//! Despite the browser focus, the hope is that non-browser applications
276//! that wish to consume Web content or submit Web forms in a Web-compatible
277//! way will find encoding_rs useful. While encoding_rs does not try to match
278//! Windows behavior, many of the encodings are close enough to legacy
279//! encodings implemented by Windows that applications that need to consume
280//! data in legacy Windows encodings may find encoding_rs useful. The
281//! [codepage](https://crates.io/crates/codepage) crate maps from Windows
282//! code page identifiers onto encoding_rs `Encoding`s and vice versa.
283//!
284//! For decoding email, UTF-7 support is needed (unfortunately) in addition
285//! to the encodings defined in the Encoding Standard. The
286//! [charset](https://crates.io/crates/charset) crate wraps encoding_rs and adds
287//! UTF-7 decoding for email purposes.
288//!
289//! For single-byte DOS encodings beyond the ones supported by the Encoding
290//! Standard, there is the [`oem_cp`](https://crates.io/crates/oem_cp) crate.
291//!
292//! # Preparing Text for the Encoders
293//!
294//! Normalizing text into Unicode Normalization Form C prior to encoding text
295//! into a legacy encoding minimizes unmappable characters. Text can be
296//! normalized to Unicode Normalization Form C using the
297//! [`icu_normalizer`](https://crates.io/crates/icu_normalizer) crate, which
298//! is part of [ICU4X](https://icu4x.unicode.org/).
299//!
300//! The exception is windows-1258, which after normalizing to Unicode
301//! Normalization Form C requires tone marks to be decomposed in order to
302//! minimize unmappable characters. Vietnamese tone marks can be decomposed
303//! using the [`detone`](https://crates.io/crates/detone) crate.
304//!
305//! # Streaming & Non-Streaming; Rust & C/C++
306//!
307//! The API in Rust has two modes of operation: streaming and non-streaming.
308//! The streaming API is the foundation of the implementation and should be
309//! used when processing data that arrives piecemeal from an i/o stream. The
310//! streaming API has an FFI wrapper (as a [separate crate][1]) that exposes it
311//! to C callers. The non-streaming part of the API is for Rust callers only and
312//! is smart about borrowing instead of copying when possible. When
313//! streamability is not needed, the non-streaming API should be preferred in
314//! order to avoid copying data when a borrow suffices.
315//!
316//! There is no analogous C API exposed via FFI, mainly because C doesn't have
317//! standard types for growable byte buffers and Unicode strings that know
318//! their length.
319//!
320//! The C API (header file generated at `target/include/encoding_rs.h` when
321//! building encoding_rs) can, in turn, be wrapped for use from C++. Such a
322//! C++ wrapper can re-create the non-streaming API in C++ for C++ callers.
323//! The C binding comes with a [C++17 wrapper][2] that uses standard library +
324//! [GSL][3] types and that recreates the non-streaming API in C++ on top of
325//! the streaming API. A C++ wrapper with XPCOM/MFBT types is available as
326//! [`mozilla::Encoding`][4].
327//!
328//! The `Encoding` type is common to both the streaming and non-streaming
329//! modes. In the streaming mode, decoding operations are performed with a
330//! `Decoder` and encoding operations with an `Encoder` object obtained via
331//! `Encoding`. In the non-streaming mode, decoding and encoding operations are
332//! performed using methods on `Encoding` objects themselves, so the `Decoder`
333//! and `Encoder` objects are not used at all.
334//!
335//! [1]: https://github.com/hsivonen/encoding_c
336//! [2]: https://github.com/hsivonen/encoding_c/blob/master/include/encoding_rs_cpp.h
337//! [3]: https://github.com/Microsoft/GSL/
338//! [4]: https://searchfox.org/mozilla-central/source/intl/Encoding.h
339//!
340//! # Memory management
341//!
342//! The streaming mode never performs heap allocations (even the methods
343//! that write into a `Vec<u8>` or a `String` by taking them as arguments do
344//! not reallocate the backing buffer of the `Vec<u8>` or the `String`). That
345//! is, the streaming mode uses caller-allocated buffers exclusively.
346//!
347//! The methods of the non-streaming mode that return a `Vec<u8>` or a `String`
348//! perform heap allocations but only to allocate the backing buffer of the
349//! `Vec<u8>` or the `String`.
350//!
351//! `Encoding` is always statically allocated. `Decoder` and `Encoder` need no
352//! `Drop` cleanup.
353//!
354//! # Buffer reading and writing behavior
355//!
356//! Based on experience gained with the `java.nio.charset` encoding converter
357//! API and with the Gecko uconv encoding converter API, the buffer reading
358//! and writing behaviors of encoding_rs are asymmetric: input buffers are
359//! fully drained but output buffers are not always fully filled.
360//!
361//! When reading from an input buffer, encoding_rs always consumes all input
362//! up to the next error or to the end of the buffer. In particular, when
363//! decoding, even if the input buffer ends in the middle of a byte sequence
364//! for a character, the decoder consumes all input. This has the benefit that
365//! the caller of the API can always fill the next buffer from the start from
366//! whatever source the bytes come from and never has to first copy the last
367//! bytes of the previous buffer to the start of the next buffer. However, when
368//! encoding, the UTF-8 input buffers have to end at a character boundary, which
369//! is a requirement for the Rust `str` type anyway, and UTF-16 input buffer
370//! boundaries falling in the middle of a surrogate pair result in both
371//! surrogates being treated individually as unpaired surrogates.
372//!
373//! Additionally, decoders guarantee that they can be fed even one byte at a
374//! time and encoders guarantee that they can be fed even one code point at a
375//! time. This has the benefit of not placing restrictions on the size of
376//! chunks in which the content arrives, e.g. from the network.
377//!
378//! When writing into an output buffer, encoding_rs makes sure that the code
379//! unit sequence for a character is never split across output buffer
380//! boundaries. This may result in wasted space at the end of an output buffer,
381//! but the advantages are that the output side of both decoders and encoders
382//! is greatly simplified compared to designs that attempt to fill output
383//! buffers exactly even when that entails splitting a code unit sequence and
384//! when encoding_rs methods return to the caller, the output produced thus
385//! far is always valid taken as a whole. (In the case of encoding to ISO-2022-JP,
386//! the output needs to be considered as a whole, because the latest output
387//! buffer taken alone might not be valid taken alone if the transition away
388//! from the ASCII state occurred in an earlier output buffer. However, since
389//! the ISO-2022-JP decoder doesn't treat streams that don't end in the ASCII
390//! state as being in error despite the encoder generating a transition to the
391//! ASCII state at the end, the claim about the partial output taken as a whole
392//! being valid is true even for ISO-2022-JP.)
393//!
394//! # Error Reporting
395//!
396//! Based on experience gained with the `java.nio.charset` encoding converter
397//! API and with the Gecko uconv encoding converter API, the error reporting
398//! behaviors of encoding_rs are asymmetric: decoder errors include offsets
399//! that leave it up to the caller to extract the erroneous bytes from the
400//! input stream if the caller wishes to do so but encoder errors provide the
401//! code point associated with the error without requiring the caller to
402//! extract it from the input on its own.
403//!
404//! On the encoder side, an error is always triggered by the most recently
405//! pushed Unicode scalar, which makes it simple to pass the `char` to the
406//! caller. Also, it's very typical for the caller to wish to do something with
407//! this data: generate a numeric escape for the character. Additionally, the
408//! ISO-2022-JP encoder reports U+FFFD instead of the actual input character in
409//! certain cases, so requiring the caller to extract the character from the
410//! input buffer would require the caller to handle ISO-2022-JP details.
411//! Furthermore, requiring the caller to extract the character from the input
412//! buffer would require the caller to implement UTF-8 or UTF-16 math, which is
413//! the job of an encoding conversion library.
414//!
415//! On the decoder side, errors are triggered in more complex ways. For
416//! example, when decoding the sequence ESC, '$', _buffer boundary_, 'A' as
417//! ISO-2022-JP, the ESC byte is in error, but this is discovered only after
418//! the buffer boundary when processing 'A'. Thus, the bytes in error might not
419//! be the ones most recently pushed to the decoder and the error might not even
420//! be in the current buffer.
421//!
422//! Some encoding conversion APIs address the problem by not acknowledging
423//! trailing bytes of an input buffer as consumed if it's still possible for
424//! future bytes to cause the trailing bytes to be in error. This way, error
425//! reporting can always refer to the most recently pushed buffer. This has the
426//! problem that the caller of the API has to copy the unconsumed trailing
427//! bytes to the start of the next buffer before being able to fill the rest
428//! of the next buffer. This is annoying, error-prone and inefficient.
429//!
430//! A possible solution would be making the decoder remember recently consumed
431//! bytes in order to be able to include a copy of the erroneous bytes when
432//! reporting an error. This has two problems: First, callers are rarely
433//! interested in the erroneous bytes, so attempts to identify them are most
434//! often just overhead anyway. Second, the rare applications that are
435//! interested typically care about the location of the error in the input
436//! stream.
437//!
438//! To keep the API convenient for common uses and the overhead low while making
439//! it possible to develop applications, such as HTML validators, that care
440//! about which bytes were in error, encoding_rs reports the length of the
441//! erroneous sequence and the number of bytes consumed after the erroneous
442//! sequence. As long as the caller doesn't discard the 6 most recent bytes,
443//! this makes it possible for callers that care about the erroneous bytes to
444//! locate them.
445//!
446//! # No Convenience API for Custom Replacements
447//!
448//! The Web Platform and, therefore, the Encoding Standard supports only one
449//! error recovery mode for decoders and only one error recovery mode for
450//! encoders. The supported error recovery mode for decoders is emitting the
451//! REPLACEMENT CHARACTER on error. The supported error recovery mode for
452//! encoders is emitting an HTML decimal numeric character reference for
453//! unmappable characters.
454//!
455//! Since encoding_rs is Web-focused, these are the only error recovery modes
456//! for which convenient support is provided. Moreover, on the decoder side,
457//! there aren't really good alternatives for emitting the REPLACEMENT CHARACTER
458//! on error (other than treating errors as fatal). In particular, simply
459//! ignoring errors is a
460//! [security problem](http://www.unicode.org/reports/tr36/#Substituting_for_Ill_Formed_Subsequences),
461//! so it would be a bad idea for encoding_rs to provide a mode that encouraged
462//! callers to ignore errors.
463//!
464//! On the encoder side, there are plausible alternatives for HTML decimal
465//! numeric character references. For example, when outputting CSS, CSS-style
466//! escapes would seem to make sense. However, instead of facilitating the
467//! output of CSS, JS, etc. in non-UTF-8 encodings, encoding_rs takes the design
468//! position that you shouldn't generate output in encodings other than UTF-8,
469//! except where backward compatibility with interacting with the legacy Web
470//! requires it. The legacy Web requires it only when parsing the query strings
471//! of URLs and when submitting forms, and those two both use HTML decimal
472//! numeric character references.
473//!
474//! While encoding_rs doesn't make encoder replacements other than HTML decimal
475//! numeric character references easy, it does make them _possible_.
476//! `encode_from_utf8()`, which emits HTML decimal numeric character references
477//! for unmappable characters, is implemented on top of
478//! `encode_from_utf8_without_replacement()`. Applications that really, really
479//! want other replacement schemes for unmappable characters can likewise
480//! implement them on top of `encode_from_utf8_without_replacement()`.
481//!
482//! # No Extensibility by Design
483//!
484//! The set of encodings supported by encoding_rs is not extensible by design.
485//! That is, `Encoding`, `Decoder` and `Encoder` are intentionally `struct`s
486//! rather than `trait`s. encoding_rs takes the design position that all future
487//! text interchange should be done using UTF-8, which can represent all of
488//! Unicode. (It is, in fact, the only encoding supported by the Encoding
489//! Standard and encoding_rs that can represent all of Unicode and that has
490//! encoder support. UTF-16LE and UTF-16BE don't have encoder support, and
491//! gb18030 cannot encode U+E5E5.) The other encodings are supported merely for
492//! legacy compatibility and not due to non-UTF-8 encodings having benefits
493//! other than being able to consume legacy content.
494//!
495//! Considering that UTF-8 can represent all of Unicode and is already supported
496//! by all Web browsers, introducing a new encoding wouldn't add to the
497//! expressiveness but would add to compatibility problems. In that sense,
498//! adding new encodings to the Web Platform doesn't make sense, and, in fact,
499//! post-UTF-8 attempts at encodings, such as BOCU-1, have been rejected from
500//! the Web Platform. On the other hand, the set of legacy encodings that must
501//! be supported for a Web browser to be able to be successful is not going to
502//! expand. Empirically, the set of encodings specified in the Encoding Standard
503//! is already sufficient and the set of legacy encodings won't grow
504//! retroactively.
505//!
506//! Since extensibility doesn't make sense considering the Web focus of
507//! encoding_rs and adding encodings to Web clients would be actively harmful,
508//! it makes sense to make the set of encodings that encoding_rs supports
509//! non-extensible and to take the (admittedly small) benefits arising from
510//! that, such as the size of `Decoder` and `Encoder` objects being known ahead
511//! of time, which enables stack allocation thereof.
512//!
513//! This does have downsides for applications that might want to put encoding_rs
514//! to non-Web uses if those non-Web uses involve legacy encodings that aren't
515//! needed for Web uses. The needs of such applications should not complicate
516//! encoding_rs itself, though. It is up to those applications to provide a
517//! framework that delegates the operations with encodings that encoding_rs
518//! supports to encoding_rs and operations with other encodings to something
519//! else (as opposed to encoding_rs itself providing an extensibility
520//! framework).
521//!
522//! # Panics
523//!
524//! Methods in encoding_rs can panic if the API is used against the requirements
525//! stated in the documentation, if a state that's supposed to be impossible
526//! is reached due to an internal bug or on integer overflow. When used
527//! according to documentation with buffer sizes that stay below integer
528//! overflow, in the absence of internal bugs, encoding_rs does not panic.
529//!
530//! Panics arising from API misuse aren't documented beyond this on individual
531//! methods.
532//!
533//! # At-Risk Parts of the API
534//!
535//! The foreseeable source of partially backward-incompatible API change is the
536//! way the instances of `Encoding` are made available.
537//!
538//! If Rust changes to allow the entries of `[&'static Encoding; N]` to be
539//! initialized with `static`s of type `&'static Encoding`, the non-reference
540//! `FOO_INIT` public `Encoding` instances will be removed from the public API.
541//!
542//! If Rust changes to make the referent of `pub const FOO: &'static Encoding`
543//! unique when the constant is used in different crates, the reference-typed
544//! `static`s for the encoding instances will be changed from `static` to
545//! `const` and the non-reference-typed `_INIT` instances will be removed.
546//!
547//! # Mapping Spec Concepts onto the API
548//!
549//! <table>
550//! <thead>
551//! <tr><th>Spec Concept</th><th>Streaming</th><th>Non-Streaming</th></tr>
552//! </thead>
553//! <tbody>
554//! <tr><td><a href="https://encoding.spec.whatwg.org/#encoding">encoding</a></td><td><code>&amp;'static Encoding</code></td><td><code>&amp;'static Encoding</code></td></tr>
555//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8">UTF-8 encoding</a></td><td><code>UTF_8</code></td><td><code>UTF_8</code></td></tr>
556//! <tr><td><a href="https://encoding.spec.whatwg.org/#concept-encoding-get">get an encoding</a></td><td><code>Encoding::for_label(<var>label</var>)</code></td><td><code>Encoding::for_label(<var>label</var>)</code></td></tr>
557//! <tr><td><a href="https://encoding.spec.whatwg.org/#name">name</a></td><td><code><var>encoding</var>.name()</code></td><td><code><var>encoding</var>.name()</code></td></tr>
558//! <tr><td><a href="https://encoding.spec.whatwg.org/#get-an-output-encoding">get an output encoding</a></td><td><code><var>encoding</var>.output_encoding()</code></td><td><code><var>encoding</var>.output_encoding()</code></td></tr>
559//! <tr><td><a href="https://encoding.spec.whatwg.org/#decode">decode</a></td><td><code>let d = <var>encoding</var>.new_decoder();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.decode(<var>src</var>)</code></td></tr>
560//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode">UTF-8 decode</a></td><td><code>let d = UTF_8.new_decoder_with_bom_removal();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_with_bom_removal(<var>src</var>)</code></td></tr>
561//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom">UTF-8 decode without BOM</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_without_bom_handling(<var>src</var>)</code></td></tr>
562//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail">UTF-8 decode without BOM or fail</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, false);<br>// &hellip; (fail if malformed)<br>let last_res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, true);<br>// (fail if malformed)</code></td><td><code>UTF_8.decode_without_bom_handling_and_without_replacement(<var>src</var>)</code></td></tr>
563//! <tr><td><a href="https://encoding.spec.whatwg.org/#encode">encode</a></td><td><code>let e = <var>encoding</var>.new_encoder();<br>let res = e.encode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;</br>let last_res = e.encode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.encode(<var>src</var>)</code></td></tr>
564//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-encode">UTF-8 encode</a></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// &hellip;</code></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>src</var>.as_bytes()</code></td></tr>
565//! </tbody>
566//! </table>
567//!
568//! # Compatibility with the rust-encoding API
569//!
570//! The crate
571//! [encoding_rs_compat](https://github.com/hsivonen/encoding_rs_compat/)
572//! is a drop-in replacement for rust-encoding 0.2.32 that implements (most of)
573//! the API of rust-encoding 0.2.32 on top of encoding_rs.
574//!
575//! # Mapping rust-encoding concepts to encoding_rs concepts
576//!
577//! The following table provides a mapping from rust-encoding constructs to
578//! encoding_rs ones.
579//!
580//! <table>
581//! <thead>
582//! <tr><th>rust-encoding</th><th>encoding_rs</th></tr>
583//! </thead>
584//! <tbody>
585//! <tr><td><code>encoding::EncodingRef</code></td><td><code>&amp;'static encoding_rs::Encoding</code></td></tr>
586//! <tr><td><code>encoding::all::<var>WINDOWS_31J</var></code> (not based on the WHATWG name for some encodings)</td><td><code>encoding_rs::<var>SHIFT_JIS</var></code> (always the WHATWG name uppercased and hyphens replaced with underscores)</td></tr>
587//! <tr><td><code>encoding::all::ERROR</code></td><td>Not available because not in the Encoding Standard</td></tr>
588//! <tr><td><code>encoding::all::ASCII</code></td><td>Not available because not in the Encoding Standard</td></tr>
589//! <tr><td><code>encoding::all::ISO_8859_1</code></td><td>Not available because not in the Encoding Standard</td></tr>
590//! <tr><td><code>encoding::all::HZ</code></td><td>Not available because not in the Encoding Standard</td></tr>
591//! <tr><td><code>encoding::label::encoding_from_whatwg_label(<var>string</var>)</code></td><td><code>encoding_rs::Encoding::for_label(<var>string</var>)</code></td></tr>
592//! <tr><td><code><var>enc</var>.whatwg_name()</code> (always lower case)</td><td><code><var>enc</var>.name()</code> (potentially mixed case)</td></tr>
593//! <tr><td><code><var>enc</var>.name()</code></td><td>Not available because not in the Encoding Standard</td></tr>
594//! <tr><td><code>encoding::decode(<var>bytes</var>, encoding::DecoderTrap::Replace, <var>enc</var>)</code></td><td><code><var>enc</var>.decode(<var>bytes</var>)</code></td></tr>
595//! <tr><td><code><var>enc</var>.decode(<var>bytes</var>, encoding::DecoderTrap::Replace)</code></td><td><code><var>enc</var>.decode_without_bom_handling(<var>bytes</var>)</code></td></tr>
596//! <tr><td><code><var>enc</var>.encode(<var>string</var>, encoding::EncoderTrap::NcrEscape)</code></td><td><code><var>enc</var>.encode(<var>string</var>)</code></td></tr>
597//! <tr><td><code><var>enc</var>.raw_decoder()</code></td><td><code><var>enc</var>.new_decoder_without_bom_handling()</code></td></tr>
598//! <tr><td><code><var>enc</var>.raw_encoder()</code></td><td><code><var>enc</var>.new_encoder()</code></td></tr>
599//! <tr><td><code>encoding::RawDecoder</code></td><td><code>encoding_rs::Decoder</code></td></tr>
600//! <tr><td><code>encoding::RawEncoder</code></td><td><code>encoding_rs::Encoder</code></td></tr>
601//! <tr><td><code><var>raw_decoder</var>.raw_feed(<var>src</var>, <var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(<var>src</var>.len()));<br><var>decoder</var>.decode_to_string_without_replacement(<var>src</var>, <var>dst_string</var>, false)</code></td></tr>
602//! <tr><td><code><var>raw_encoder</var>.raw_feed(<var>src</var>, <var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(<var>src</var>.len()));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement(<var>src</var>, <var>dst_vec</var>, false)</code></td></tr>
603//! <tr><td><code><var>raw_decoder</var>.raw_finish(<var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(0));<br><var>decoder</var>.decode_to_string_without_replacement(b"", <var>dst_string</var>, true)</code></td></tr>
604//! <tr><td><code><var>raw_encoder</var>.raw_finish(<var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(0));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement("", <var>dst_vec</var>, true)</code></td></tr>
605//! <tr><td><code>encoding::DecoderTrap::Strict</code></td><td><code>decode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Malformed</code> result as fatal).</td></tr>
606//! <tr><td><code>encoding::DecoderTrap::Replace</code></td><td><code>decode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
607//! <tr><td><code>encoding::DecoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
608//! <tr><td><code>encoding::DecoderTrap::Call(DecoderTrapFunc)</code></td><td>Can be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
609//! <tr><td><code>encoding::EncoderTrap::Strict</code></td><td><code>encode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Unmappable</code> result as fatal).</td></tr>
610//! <tr><td><code>encoding::EncoderTrap::Replace</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
611//! <tr><td><code>encoding::EncoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
612//! <tr><td><code>encoding::EncoderTrap::NcrEscape</code></td><td><code>encode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
613//! <tr><td><code>encoding::EncoderTrap::Call(EncoderTrapFunc)</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
614//! </tbody>
615//! </table>
616//!
617//! # Relationship with Windows Code Pages
618//!
619//! Despite the Web and browser focus, the encodings defined by the Encoding
620//! Standard and implemented by this crate may be useful for decoding legacy
621//! data that uses Windows code pages. The following table names the
622//! encodings
623//! that have a closely related Windows code page, the number of the closest
624//! code page, a column indicating whether Windows maps unassigned code points
625//! to the Unicode Private Use Area instead of U+FFFD and a remark number
626//! indicating remarks in the list after the table.
627//!
628//! <table>
629//! <thead>
630//! <tr><th>Encoding</th><th>Code Page</th><th>PUA</th><th>Remarks</th></tr>
631//! </thead>
632//! <tbody>
633//! <tr><td>Shift_JIS</td><td>932</td><td></td><td></td></tr>
634//! <tr><td>GBK</td><td>936</td><td></td><td></td></tr>
635//! <tr><td>EUC-KR</td><td>949</td><td></td><td></td></tr>
636//! <tr><td>Big5</td><td>950</td><td></td><td></td></tr>
637//! <tr><td>IBM866</td><td>866</td><td></td><td></td></tr>
638//! <tr><td>windows-874</td><td>874</td><td>&bullet;</td><td></td></tr>
639//! <tr><td>UTF-16LE</td><td>1200</td><td></td><td></td></tr>
640//! <tr><td>UTF-16BE</td><td>1201</td><td></td><td></td></tr>
641//! <tr><td>windows-1250</td><td>1250</td><td></td><td></td></tr>
642//! <tr><td>windows-1251</td><td>1251</td><td></td><td></td></tr>
643//! <tr><td>windows-1252</td><td>1252</td><td></td><td></td></tr>
644//! <tr><td>windows-1253</td><td>1253</td><td>&bullet;</td><td></td></tr>
645//! <tr><td>windows-1254</td><td>1254</td><td></td><td></td></tr>
646//! <tr><td>windows-1255</td><td>1255</td><td>&bullet;</td><td></td></tr>
647//! <tr><td>windows-1256</td><td>1256</td><td></td><td></td></tr>
648//! <tr><td>windows-1257</td><td>1257</td><td>&bullet;</td><td></td></tr>
649//! <tr><td>windows-1258</td><td>1258</td><td></td><td></td></tr>
650//! <tr><td>macintosh</td><td>10000</td><td></td><td>1</td></tr>
651//! <tr><td>x-mac-cyrillic</td><td>10017</td><td></td><td>2</td></tr>
652//! <tr><td>KOI8-R</td><td>20866</td><td></td><td></td></tr>
653//! <tr><td>EUC-JP</td><td>20932</td><td></td><td></td></tr>
654//! <tr><td>KOI8-U</td><td>21866</td><td></td><td></td></tr>
655//! <tr><td>ISO-8859-2</td><td>28592</td><td></td><td></td></tr>
656//! <tr><td>ISO-8859-3</td><td>28593</td><td></td><td></td></tr>
657//! <tr><td>ISO-8859-4</td><td>28594</td><td></td><td></td></tr>
658//! <tr><td>ISO-8859-5</td><td>28595</td><td></td><td></td></tr>
659//! <tr><td>ISO-8859-6</td><td>28596</td><td>&bullet;</td><td></td></tr>
660//! <tr><td>ISO-8859-7</td><td>28597</td><td>&bullet;</td><td>3</td></tr>
661//! <tr><td>ISO-8859-8</td><td>28598</td><td>&bullet;</td><td>4</td></tr>
662//! <tr><td>ISO-8859-13</td><td>28603</td><td>&bullet;</td><td></td></tr>
663//! <tr><td>ISO-8859-15</td><td>28605</td><td></td><td></td></tr>
664//! <tr><td>ISO-8859-8-I</td><td>38598</td><td></td><td>5</td></tr>
665//! <tr><td>ISO-2022-JP</td><td>50220</td><td></td><td></td></tr>
666//! <tr><td>gb18030</td><td>54936</td><td></td><td></td></tr>
667//! <tr><td>UTF-8</td><td>65001</td><td></td><td></td></tr>
668//! </tbody>
669//! </table>
670//!
671//! 1. Windows decodes 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
672//! 2. Windows decodes 0xFF to U+00A4 CURRENCY SIGN instead of U+20AC EURO SIGN.
673//! 3. Windows decodes the currency signs at 0xA4 and 0xA5 as well as 0xAA,
674//!    which should be U+037A GREEK YPOGEGRAMMENI, to PUA code points. Windows
675//!    decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA instead of U+2018
676//!    LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER LETTER APOSTROPHE
677//!    instead of U+2019 RIGHT SINGLE QUOTATION MARK.
678//! 4. Windows decodes 0xAF to OVERLINE instead of MACRON and 0xFD and 0xFE to PUA instead
679//!    of LRM and RLM.
680//! 5. Remarks from the previous item apply.
681//!
682//! The differences between this crate and Windows in the case of multibyte encodings
683//! are not yet fully documented here. The lack of remarks above should not be taken
684//! as an indication of a lack of differences.
685//!
686//! # Notable Differences from IANA Naming
687//!
688//! In some cases, the Encoding Standard specifies the popular unextended encoding
689//! name where in IANA terms one of the other labels would be more precise considering
690//! the extensions that the Encoding Standard has unified into the encoding.
691//!
692//! <table>
693//! <thead>
694//! <tr><th>Encoding</th><th>IANA</th></tr>
695//! </thead>
696//! <tbody>
697//! <tr><td>Big5</td><td>Big5-HKSCS</td></tr>
698//! <tr><td>EUC-KR</td><td>windows-949</td></tr>
699//! <tr><td>Shift_JIS</td><td>windows-31j</td></tr>
700//! <tr><td>x-mac-cyrillic</td><td>x-mac-ukrainian</td></tr>
701//! </tbody>
702//! </table>
703//!
704//! In other cases where the Encoding Standard unifies unextended and extended
705//! variants of an encoding, the encoding gets the name of the extended
706//! variant.
707//!
708//! <table>
709//! <thead>
710//! <tr><th>IANA</th><th>Unified into Encoding</th></tr>
711//! </thead>
712//! <tbody>
713//! <tr><td>ISO-8859-1</td><td>windows-1252</td></tr>
714//! <tr><td>ISO-8859-9</td><td>windows-1254</td></tr>
715//! <tr><td>TIS-620</td><td>windows-874</td></tr>
716//! </tbody>
717//! </table>
718//!
719//! See the section [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes)
720//! for discussion about the UTF-16 family.
721
722#![no_std]
723#![cfg_attr(feature = "simd-accel", feature(core_intrinsics, portable_simd))]
724
725#[cfg(feature = "alloc")]
726#[cfg_attr(test, macro_use)]
727extern crate alloc;
728
729extern crate core;
730#[macro_use]
731extern crate cfg_if;
732
733#[cfg(feature = "serde")]
734extern crate serde;
735
736#[cfg(all(test, feature = "serde"))]
737extern crate bincode;
738#[cfg(all(test, feature = "serde"))]
739#[macro_use]
740extern crate serde_derive;
741#[cfg(all(test, feature = "serde"))]
742extern crate serde_json;
743
744#[macro_use]
745mod macros;
746
747#[cfg(all(
748    feature = "simd-accel",
749    any(
750        target_feature = "sse2",
751        all(target_endian = "little", target_arch = "aarch64"),
752        all(target_endian = "little", target_feature = "neon")
753    )
754))]
755mod simd_funcs;
756
757#[cfg(all(test, feature = "alloc"))]
758mod testing;
759
760mod big5;
761mod euc_jp;
762mod euc_kr;
763mod gb18030;
764mod gb18030_2022;
765mod iso_2022_jp;
766mod replacement;
767mod shift_jis;
768mod single_byte;
769mod utf_16;
770mod utf_8;
771mod x_user_defined;
772
773mod ascii;
774mod data;
775mod handles;
776mod variant;
777
778pub mod mem;
779
780use crate::ascii::ascii_valid_up_to;
781use crate::ascii::iso_2022_jp_ascii_valid_up_to;
782use crate::utf_8::utf8_valid_up_to;
783use crate::variant::*;
784
785#[cfg(feature = "alloc")]
786use alloc::borrow::Cow;
787#[cfg(feature = "alloc")]
788use alloc::string::String;
789#[cfg(feature = "alloc")]
790use alloc::vec::Vec;
791use core::cmp::Ordering;
792use core::hash::Hash;
793use core::hash::Hasher;
794
795#[cfg(feature = "serde")]
796use serde::de::Visitor;
797#[cfg(feature = "serde")]
798use serde::{Deserialize, Deserializer, Serialize, Serializer};
799
800/// Length of the longest numeric character reference (NCR).
801/// This has to be the max length of an NCR instead of max minus
802/// one, because we can't rely on getting the minus one from the
803/// space reserved for the current unmappable: the ISO-2022-JP
804/// encoder can fill up that space with a state transition escape.
805const NCR_EXTRA: usize = 10; // length of the longest NCR: &#1114111;
806
807// BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
808// Instead, please regenerate using generate-encoding-data.py
809
810const LONGEST_LABEL_LENGTH: usize = 19; // length of "cseucpkdfmtjapanese"
811
812/// The initializer for the [Big5](static.BIG5.html) encoding.
813///
814/// Use this only for taking the address of this form when
815/// Rust prohibits the use of the non-`_INIT` form directly,
816/// such as in initializers of other `static`s. If in doubt,
817/// use the reference-typed `static` [`BIG5`](static.BIG5.html) instead.
818///
819/// This part of the public API will go away if Rust changes
820/// to make the referent of `pub const FOO: &'static Encoding`
821/// unique cross-crate or if Rust starts allowing static arrays
822/// to be initialized with `pub static FOO: &'static Encoding`
823/// items.
824pub static BIG5_INIT: Encoding = Encoding {
825    name: "Big5",
826    variant: VariantEncoding::Big5,
827};
828
829/// The Big5 encoding.
830///
831/// This is Big5 with HKSCS with mappings to more recent Unicode assignments
832/// instead of the Private Use Area code points that have been used historically.
833/// It is believed to be able to decode existing Web content in a way that makes
834/// sense.
835///
836/// To avoid form submissions generating data that Web servers don't understand,
837/// the encoder doesn't use the HKSCS byte sequences that precede the unextended
838/// Big5 in the lexical order.
839///
840/// [Index visualization](https://encoding.spec.whatwg.org/big5.html),
841/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/big5-bmp.html)
842///
843/// This encoding is designed to be suited for decoding the Windows code page 950
844/// and its HKSCS patched "951" variant such that the text makes sense, given
845/// assignments that Unicode has made after those encodings used Private Use
846/// Area characters.
847///
848/// This will change from `static` to `const` if Rust changes
849/// to make the referent of `pub const FOO: &'static Encoding`
850/// unique cross-crate, so don't take the address of this
851/// `static`. The referent is [`BIG5_INIT`](static.BIG5_INIT.html).
852pub static BIG5: &'static Encoding = &BIG5_INIT;
853
854/// The initializer for the [EUC-JP](static.EUC_JP.html) encoding.
855///
856/// Use this only for taking the address of this form when
857/// Rust prohibits the use of the non-`_INIT` form directly,
858/// such as in initializers of other `static`s. If in doubt,
859/// use the reference-typed `static` [`EUC_JP`](static.EUC_JP.html) instead.
860///
861/// This part of the public API will go away if Rust changes
862/// to make the referent of `pub const FOO: &'static Encoding`
863/// unique cross-crate or if Rust starts allowing static arrays
864/// to be initialized with `pub static FOO: &'static Encoding`
865/// items.
866pub static EUC_JP_INIT: Encoding = Encoding {
867    name: "EUC-JP",
868    variant: VariantEncoding::EucJp,
869};
870
871/// The EUC-JP encoding.
872///
873/// This is the legacy Unix encoding for Japanese.
874///
875/// For compatibility with Web servers that don't expect three-byte sequences
876/// in form submissions, the encoder doesn't generate three-byte sequences.
877/// That is, the JIS X 0212 support is decode-only.
878///
879/// [Index visualization](https://encoding.spec.whatwg.org/euc-jp.html),
880/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-jp-bmp.html)
881///
882/// This encoding roughly matches the Windows code page 20932. There are error
883/// handling differences and a handful of 2-byte sequences that decode differently.
884/// Additionally, Windows doesn't support 3-byte sequences.
885///
886/// This will change from `static` to `const` if Rust changes
887/// to make the referent of `pub const FOO: &'static Encoding`
888/// unique cross-crate, so don't take the address of this
889/// `static`. The referent is [`EUC_JP_INIT`](static.EUC_JP_INIT.html).
890pub static EUC_JP: &'static Encoding = &EUC_JP_INIT;
891
892/// The initializer for the [EUC-KR](static.EUC_KR.html) encoding.
893///
894/// Use this only for taking the address of this form when
895/// Rust prohibits the use of the non-`_INIT` form directly,
896/// such as in initializers of other `static`s. If in doubt,
897/// use the reference-typed `static` [`EUC_KR`](static.EUC_KR.html) instead.
898///
899/// This part of the public API will go away if Rust changes
900/// to make the referent of `pub const FOO: &'static Encoding`
901/// unique cross-crate or if Rust starts allowing static arrays
902/// to be initialized with `pub static FOO: &'static Encoding`
903/// items.
904pub static EUC_KR_INIT: Encoding = Encoding {
905    name: "EUC-KR",
906    variant: VariantEncoding::EucKr,
907};
908
909/// The EUC-KR encoding.
910///
911/// This is the Korean encoding for Windows. It extends the Unix legacy encoding
912/// for Korean, based on KS X 1001 (which also formed the base of MacKorean on Mac OS
913/// Classic), with all the characters from the Hangul Syllables block of Unicode.
914///
915/// [Index visualization](https://encoding.spec.whatwg.org/euc-kr.html),
916/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-kr-bmp.html)
917///
918/// This encoding matches the Windows code page 949, except Windows decodes byte 0x80
919/// to U+0080 and some byte sequences that are errors per the Encoding Standard to
920/// the question mark or the Private Use Area.
921///
922/// This will change from `static` to `const` if Rust changes
923/// to make the referent of `pub const FOO: &'static Encoding`
924/// unique cross-crate, so don't take the address of this
925/// `static`. The referent is [`EUC_KR_INIT`](static.EUC_KR_INIT.html).
926pub static EUC_KR: &'static Encoding = &EUC_KR_INIT;
927
928/// The initializer for the [GBK](static.GBK.html) encoding.
929///
930/// Use this only for taking the address of this form when
931/// Rust prohibits the use of the non-`_INIT` form directly,
932/// such as in initializers of other `static`s. If in doubt,
933/// use the reference-typed `static` [`GBK`](static.GBK.html) instead.
934///
935/// This part of the public API will go away if Rust changes
936/// to make the referent of `pub const FOO: &'static Encoding`
937/// unique cross-crate or if Rust starts allowing static arrays
938/// to be initialized with `pub static FOO: &'static Encoding`
939/// items.
940pub static GBK_INIT: Encoding = Encoding {
941    name: "GBK",
942    variant: VariantEncoding::Gbk,
943};
944
945/// The GBK encoding.
946///
947/// The decoder for this encoding is the same as the decoder for gb18030.
948/// The encoder side of this encoding is GBK with Windows code page 936 euro
949/// sign behavior and with the changes to two-byte sequences made in GB18030-2022.
950/// GBK extends GB2312-80 to cover the CJK Unified Ideographs Unicode block as
951/// well as a handful of ideographs from the CJK Unified Ideographs Extension A
952/// and CJK Compatibility Ideographs blocks.
953///
954/// Unlike e.g. in the case of ISO-8859-1 and windows-1252, the GBK encoder wasn't
955/// unified with the gb18030 encoder in the Encoding Standard out of concern
956/// that servers that expect GBK form submissions might not be able to handle
957/// the four-byte sequences.
958///
959/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
960/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
961///
962/// The encoder of this encoding roughly matches the Windows code page 936.
963/// The decoder side is a superset.
964///
965/// This will change from `static` to `const` if Rust changes
966/// to make the referent of `pub const FOO: &'static Encoding`
967/// unique cross-crate, so don't take the address of this
968/// `static`. The referent is [`GBK_INIT`](static.GBK_INIT.html).
969pub static GBK: &'static Encoding = &GBK_INIT;
970
971/// The initializer for the [IBM866](static.IBM866.html) encoding.
972///
973/// Use this only for taking the address of this form when
974/// Rust prohibits the use of the non-`_INIT` form directly,
975/// such as in initializers of other `static`s. If in doubt,
976/// use the reference-typed `static` [`IBM866`](static.IBM866.html) instead.
977///
978/// This part of the public API will go away if Rust changes
979/// to make the referent of `pub const FOO: &'static Encoding`
980/// unique cross-crate or if Rust starts allowing static arrays
981/// to be initialized with `pub static FOO: &'static Encoding`
982/// items.
983pub static IBM866_INIT: Encoding = Encoding {
984    name: "IBM866",
985    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.ibm866, 0x0440, 96, 16),
986};
987
988/// The IBM866 encoding.
989///
990/// This is the most notable one of the DOS Cyrillic code pages. It has the same
991/// box drawing characters as code page 437, so it can be used for decoding
992/// DOS-era ASCII + box drawing data.
993///
994/// [Index visualization](https://encoding.spec.whatwg.org/ibm866.html),
995/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/ibm866-bmp.html)
996///
997/// This encoding matches the Windows code page 866.
998///
999/// This will change from `static` to `const` if Rust changes
1000/// to make the referent of `pub const FOO: &'static Encoding`
1001/// unique cross-crate, so don't take the address of this
1002/// `static`. The referent is [`IBM866_INIT`](static.IBM866_INIT.html).
1003pub static IBM866: &'static Encoding = &IBM866_INIT;
1004
1005/// The initializer for the [ISO-2022-JP](static.ISO_2022_JP.html) encoding.
1006///
1007/// Use this only for taking the address of this form when
1008/// Rust prohibits the use of the non-`_INIT` form directly,
1009/// such as in initializers of other `static`s. If in doubt,
1010/// use the reference-typed `static` [`ISO_2022_JP`](static.ISO_2022_JP.html) instead.
1011///
1012/// This part of the public API will go away if Rust changes
1013/// to make the referent of `pub const FOO: &'static Encoding`
1014/// unique cross-crate or if Rust starts allowing static arrays
1015/// to be initialized with `pub static FOO: &'static Encoding`
1016/// items.
1017pub static ISO_2022_JP_INIT: Encoding = Encoding {
1018    name: "ISO-2022-JP",
1019    variant: VariantEncoding::Iso2022Jp,
1020};
1021
1022/// The ISO-2022-JP encoding.
1023///
1024/// This is the primary pre-UTF-8 encoding for Japanese email. It uses the ASCII
1025/// byte range to encode non-Basic Latin characters. It's the only encoding
1026/// supported by this crate whose encoder is stateful.
1027///
1028/// [Index visualization](https://encoding.spec.whatwg.org/jis0208.html),
1029/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/jis0208-bmp.html)
1030///
1031/// This encoding roughly matches the Windows code page 50220. Notably, Windows
1032/// uses U+30FB in place of the REPLACEMENT CHARACTER and otherwise differs in
1033/// error handling.
1034///
1035/// This will change from `static` to `const` if Rust changes
1036/// to make the referent of `pub const FOO: &'static Encoding`
1037/// unique cross-crate, so don't take the address of this
1038/// `static`. The referent is [`ISO_2022_JP_INIT`](static.ISO_2022_JP_INIT.html).
1039pub static ISO_2022_JP: &'static Encoding = &ISO_2022_JP_INIT;
1040
1041/// The initializer for the [ISO-8859-10](static.ISO_8859_10.html) encoding.
1042///
1043/// Use this only for taking the address of this form when
1044/// Rust prohibits the use of the non-`_INIT` form directly,
1045/// such as in initializers of other `static`s. If in doubt,
1046/// use the reference-typed `static` [`ISO_8859_10`](static.ISO_8859_10.html) instead.
1047///
1048/// This part of the public API will go away if Rust changes
1049/// to make the referent of `pub const FOO: &'static Encoding`
1050/// unique cross-crate or if Rust starts allowing static arrays
1051/// to be initialized with `pub static FOO: &'static Encoding`
1052/// items.
1053pub static ISO_8859_10_INIT: Encoding = Encoding {
1054    name: "ISO-8859-10",
1055    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_10, 0x00DA, 90, 6),
1056};
1057
1058/// The ISO-8859-10 encoding.
1059///
1060/// This is the Nordic part of the ISO/IEC 8859 encoding family. This encoding
1061/// is also known as Latin 6.
1062///
1063/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-10.html),
1064/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-10-bmp.html)
1065///
1066/// The Windows code page number for this encoding is 28600, but kernel32.dll
1067/// does not support this encoding.
1068///
1069/// This will change from `static` to `const` if Rust changes
1070/// to make the referent of `pub const FOO: &'static Encoding`
1071/// unique cross-crate, so don't take the address of this
1072/// `static`. The referent is [`ISO_8859_10_INIT`](static.ISO_8859_10_INIT.html).
1073pub static ISO_8859_10: &'static Encoding = &ISO_8859_10_INIT;
1074
1075/// The initializer for the [ISO-8859-13](static.ISO_8859_13.html) encoding.
1076///
1077/// Use this only for taking the address of this form when
1078/// Rust prohibits the use of the non-`_INIT` form directly,
1079/// such as in initializers of other `static`s. If in doubt,
1080/// use the reference-typed `static` [`ISO_8859_13`](static.ISO_8859_13.html) instead.
1081///
1082/// This part of the public API will go away if Rust changes
1083/// to make the referent of `pub const FOO: &'static Encoding`
1084/// unique cross-crate or if Rust starts allowing static arrays
1085/// to be initialized with `pub static FOO: &'static Encoding`
1086/// items.
1087pub static ISO_8859_13_INIT: Encoding = Encoding {
1088    name: "ISO-8859-13",
1089    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_13, 0x00DF, 95, 1),
1090};
1091
1092/// The ISO-8859-13 encoding.
1093///
1094/// This is the Baltic part of the ISO/IEC 8859 encoding family. This encoding
1095/// is also known as Latin 7.
1096///
1097/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-13.html),
1098/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-13-bmp.html)
1099///
1100/// This encoding matches the Windows code page 28603, except Windows decodes
1101/// unassigned code points to the Private Use Area of Unicode.
1102///
1103/// This will change from `static` to `const` if Rust changes
1104/// to make the referent of `pub const FOO: &'static Encoding`
1105/// unique cross-crate, so don't take the address of this
1106/// `static`. The referent is [`ISO_8859_13_INIT`](static.ISO_8859_13_INIT.html).
1107pub static ISO_8859_13: &'static Encoding = &ISO_8859_13_INIT;
1108
1109/// The initializer for the [ISO-8859-14](static.ISO_8859_14.html) encoding.
1110///
1111/// Use this only for taking the address of this form when
1112/// Rust prohibits the use of the non-`_INIT` form directly,
1113/// such as in initializers of other `static`s. If in doubt,
1114/// use the reference-typed `static` [`ISO_8859_14`](static.ISO_8859_14.html) instead.
1115///
1116/// This part of the public API will go away if Rust changes
1117/// to make the referent of `pub const FOO: &'static Encoding`
1118/// unique cross-crate or if Rust starts allowing static arrays
1119/// to be initialized with `pub static FOO: &'static Encoding`
1120/// items.
1121pub static ISO_8859_14_INIT: Encoding = Encoding {
1122    name: "ISO-8859-14",
1123    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_14, 0x00DF, 95, 17),
1124};
1125
1126/// The ISO-8859-14 encoding.
1127///
1128/// This is the Celtic part of the ISO/IEC 8859 encoding family. This encoding
1129/// is also known as Latin 8.
1130///
1131/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-14.html),
1132/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-14-bmp.html)
1133///
1134/// The Windows code page number for this encoding is 28604, but kernel32.dll
1135/// does not support this encoding.
1136///
1137/// This will change from `static` to `const` if Rust changes
1138/// to make the referent of `pub const FOO: &'static Encoding`
1139/// unique cross-crate, so don't take the address of this
1140/// `static`. The referent is [`ISO_8859_14_INIT`](static.ISO_8859_14_INIT.html).
1141pub static ISO_8859_14: &'static Encoding = &ISO_8859_14_INIT;
1142
1143/// The initializer for the [ISO-8859-15](static.ISO_8859_15.html) encoding.
1144///
1145/// Use this only for taking the address of this form when
1146/// Rust prohibits the use of the non-`_INIT` form directly,
1147/// such as in initializers of other `static`s. If in doubt,
1148/// use the reference-typed `static` [`ISO_8859_15`](static.ISO_8859_15.html) instead.
1149///
1150/// This part of the public API will go away if Rust changes
1151/// to make the referent of `pub const FOO: &'static Encoding`
1152/// unique cross-crate or if Rust starts allowing static arrays
1153/// to be initialized with `pub static FOO: &'static Encoding`
1154/// items.
1155pub static ISO_8859_15_INIT: Encoding = Encoding {
1156    name: "ISO-8859-15",
1157    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_15, 0x00BF, 63, 65),
1158};
1159
1160/// The ISO-8859-15 encoding.
1161///
1162/// This is the revised Western European part of the ISO/IEC 8859 encoding
1163/// family. This encoding is also known as Latin 9.
1164///
1165/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-15.html),
1166/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-15-bmp.html)
1167///
1168/// This encoding matches the Windows code page 28605.
1169///
1170/// This will change from `static` to `const` if Rust changes
1171/// to make the referent of `pub const FOO: &'static Encoding`
1172/// unique cross-crate, so don't take the address of this
1173/// `static`. The referent is [`ISO_8859_15_INIT`](static.ISO_8859_15_INIT.html).
1174pub static ISO_8859_15: &'static Encoding = &ISO_8859_15_INIT;
1175
1176/// The initializer for the [ISO-8859-16](static.ISO_8859_16.html) encoding.
1177///
1178/// For use only for taking the address of this form when
1179/// Rust prohibits the use of the non-`_INIT` form directly,
1180/// such as in initializers of other `static`s. If in doubt,
1181/// use the corresponding non-`_INIT` reference-typed `static`.
1182///
1183/// This part of the public API will go away if Rust changes
1184/// to make the referent of `pub const FOO: &'static Encoding`
1185/// unique cross-crate or if Rust starts allowing static arrays
1186/// to be initialized with `pub static FOO: &'static Encoding`
1187/// items.
1188pub static ISO_8859_16_INIT: Encoding = Encoding {
1189    name: "ISO-8859-16",
1190    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_16, 0x00DF, 95, 4),
1191};
1192
1193/// The ISO-8859-16 encoding.
1194///
1195/// This is the South-Eastern European part of the ISO/IEC 8859 encoding
1196/// family. This encoding is also known as Latin 10.
1197///
1198/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-16.html),
1199/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-16-bmp.html)
1200///
1201/// The Windows code page number for this encoding is 28606, but kernel32.dll
1202/// does not support this encoding.
1203///
1204/// This will change from `static` to `const` if Rust changes
1205/// to make the referent of `pub const FOO: &'static Encoding`
1206/// unique cross-crate, so don't take the address of this
1207/// `static`.
1208pub static ISO_8859_16: &'static Encoding = &ISO_8859_16_INIT;
1209
1210/// The initializer for the [ISO-8859-2](static.ISO_8859_2.html) encoding.
1211///
1212/// For use only for taking the address of this form when
1213/// Rust prohibits the use of the non-`_INIT` form directly,
1214/// such as in initializers of other `static`s. If in doubt,
1215/// use the corresponding non-`_INIT` reference-typed `static`.
1216///
1217/// This part of the public API will go away if Rust changes
1218/// to make the referent of `pub const FOO: &'static Encoding`
1219/// unique cross-crate or if Rust starts allowing static arrays
1220/// to be initialized with `pub static FOO: &'static Encoding`
1221/// items.
1222pub static ISO_8859_2_INIT: Encoding = Encoding {
1223    name: "ISO-8859-2",
1224    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_2, 0x00DF, 95, 1),
1225};
1226
1227/// The ISO-8859-2 encoding.
1228///
1229/// This is the Central European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 2.
1230///
1231/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-2.html),
1232/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-2-bmp.html)
1233///
1234/// This encoding matches the Windows code page 28592.
1235///
1236/// This will change from `static` to `const` if Rust changes
1237/// to make the referent of `pub const FOO: &'static Encoding`
1238/// unique cross-crate, so don't take the address of this
1239/// `static`.
1240pub static ISO_8859_2: &'static Encoding = &ISO_8859_2_INIT;
1241
1242/// The initializer for the [ISO-8859-3](static.ISO_8859_3.html) encoding.
1243///
1244/// For use only for taking the address of this form when
1245/// Rust prohibits the use of the non-`_INIT` form directly,
1246/// such as in initializers of other `static`s. If in doubt,
1247/// use the corresponding non-`_INIT` reference-typed `static`.
1248///
1249/// This part of the public API will go away if Rust changes
1250/// to make the referent of `pub const FOO: &'static Encoding`
1251/// unique cross-crate or if Rust starts allowing static arrays
1252/// to be initialized with `pub static FOO: &'static Encoding`
1253/// items.
1254pub static ISO_8859_3_INIT: Encoding = Encoding {
1255    name: "ISO-8859-3",
1256    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_3, 0x00DF, 95, 4),
1257};
1258
1259/// The ISO-8859-3 encoding.
1260///
1261/// This is the South European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 3.
1262///
1263/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-3.html),
1264/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-3-bmp.html)
1265///
1266/// This encoding matches the Windows code page 28593.
1267///
1268/// This will change from `static` to `const` if Rust changes
1269/// to make the referent of `pub const FOO: &'static Encoding`
1270/// unique cross-crate, so don't take the address of this
1271/// `static`.
1272pub static ISO_8859_3: &'static Encoding = &ISO_8859_3_INIT;
1273
1274/// The initializer for the [ISO-8859-4](static.ISO_8859_4.html) encoding.
1275///
1276/// For use only for taking the address of this form when
1277/// Rust prohibits the use of the non-`_INIT` form directly,
1278/// such as in initializers of other `static`s. If in doubt,
1279/// use the corresponding non-`_INIT` reference-typed `static`.
1280///
1281/// This part of the public API will go away if Rust changes
1282/// to make the referent of `pub const FOO: &'static Encoding`
1283/// unique cross-crate or if Rust starts allowing static arrays
1284/// to be initialized with `pub static FOO: &'static Encoding`
1285/// items.
1286pub static ISO_8859_4_INIT: Encoding = Encoding {
1287    name: "ISO-8859-4",
1288    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_4, 0x00DF, 95, 1),
1289};
1290
1291/// The ISO-8859-4 encoding.
1292///
1293/// This is the North European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 4.
1294///
1295/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-4.html),
1296/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-4-bmp.html)
1297///
1298/// This encoding matches the Windows code page 28594.
1299///
1300/// This will change from `static` to `const` if Rust changes
1301/// to make the referent of `pub const FOO: &'static Encoding`
1302/// unique cross-crate, so don't take the address of this
1303/// `static`.
1304pub static ISO_8859_4: &'static Encoding = &ISO_8859_4_INIT;
1305
1306/// The initializer for the [ISO-8859-5](static.ISO_8859_5.html) encoding.
1307///
1308/// For use only for taking the address of this form when
1309/// Rust prohibits the use of the non-`_INIT` form directly,
1310/// such as in initializers of other `static`s. If in doubt,
1311/// use the corresponding non-`_INIT` reference-typed `static`.
1312///
1313/// This part of the public API will go away if Rust changes
1314/// to make the referent of `pub const FOO: &'static Encoding`
1315/// unique cross-crate or if Rust starts allowing static arrays
1316/// to be initialized with `pub static FOO: &'static Encoding`
1317/// items.
1318pub static ISO_8859_5_INIT: Encoding = Encoding {
1319    name: "ISO-8859-5",
1320    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_5, 0x040E, 46, 66),
1321};
1322
1323/// The ISO-8859-5 encoding.
1324///
1325/// This is the Cyrillic part of the ISO/IEC 8859 encoding family.
1326///
1327/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-5.html),
1328/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-5-bmp.html)
1329///
1330/// This encoding matches the Windows code page 28595.
1331///
1332/// This will change from `static` to `const` if Rust changes
1333/// to make the referent of `pub const FOO: &'static Encoding`
1334/// unique cross-crate, so don't take the address of this
1335/// `static`.
1336pub static ISO_8859_5: &'static Encoding = &ISO_8859_5_INIT;
1337
1338/// The initializer for the [ISO-8859-6](static.ISO_8859_6.html) encoding.
1339///
1340/// For use only for taking the address of this form when
1341/// Rust prohibits the use of the non-`_INIT` form directly,
1342/// such as in initializers of other `static`s. If in doubt,
1343/// use the corresponding non-`_INIT` reference-typed `static`.
1344///
1345/// This part of the public API will go away if Rust changes
1346/// to make the referent of `pub const FOO: &'static Encoding`
1347/// unique cross-crate or if Rust starts allowing static arrays
1348/// to be initialized with `pub static FOO: &'static Encoding`
1349/// items.
1350pub static ISO_8859_6_INIT: Encoding = Encoding {
1351    name: "ISO-8859-6",
1352    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_6, 0x0621, 65, 26),
1353};
1354
1355/// The ISO-8859-6 encoding.
1356///
1357/// This is the Arabic part of the ISO/IEC 8859 encoding family.
1358///
1359/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-6.html),
1360/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-6-bmp.html)
1361///
1362/// This encoding matches the Windows code page 28596, except Windows decodes
1363/// unassigned code points to the Private Use Area of Unicode.
1364///
1365/// This will change from `static` to `const` if Rust changes
1366/// to make the referent of `pub const FOO: &'static Encoding`
1367/// unique cross-crate, so don't take the address of this
1368/// `static`.
1369pub static ISO_8859_6: &'static Encoding = &ISO_8859_6_INIT;
1370
1371/// The initializer for the [ISO-8859-7](static.ISO_8859_7.html) encoding.
1372///
1373/// For use only for taking the address of this form when
1374/// Rust prohibits the use of the non-`_INIT` form directly,
1375/// such as in initializers of other `static`s. If in doubt,
1376/// use the corresponding non-`_INIT` reference-typed `static`.
1377///
1378/// This part of the public API will go away if Rust changes
1379/// to make the referent of `pub const FOO: &'static Encoding`
1380/// unique cross-crate or if Rust starts allowing static arrays
1381/// to be initialized with `pub static FOO: &'static Encoding`
1382/// items.
1383pub static ISO_8859_7_INIT: Encoding = Encoding {
1384    name: "ISO-8859-7",
1385    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_7, 0x03A3, 83, 44),
1386};
1387
1388/// The ISO-8859-7 encoding.
1389///
1390/// This is the Greek part of the ISO/IEC 8859 encoding family.
1391///
1392/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-7.html),
1393/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-7-bmp.html)
1394///
1395/// This encoding roughly matches the Windows code page 28597. Windows decodes
1396/// unassigned code points, the currency signs at 0xA4 and 0xA5 as well as
1397/// 0xAA, which should be U+037A GREEK YPOGEGRAMMENI, to the Private Use Area
1398/// of Unicode. Windows decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA
1399/// instead of U+2018 LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER
1400/// LETTER APOSTROPHE instead of U+2019 RIGHT SINGLE QUOTATION MARK.
1401///
1402/// This will change from `static` to `const` if Rust changes
1403/// to make the referent of `pub const FOO: &'static Encoding`
1404/// unique cross-crate, so don't take the address of this
1405/// `static`.
1406pub static ISO_8859_7: &'static Encoding = &ISO_8859_7_INIT;
1407
1408/// The initializer for the [ISO-8859-8](static.ISO_8859_8.html) encoding.
1409///
1410/// For use only for taking the address of this form when
1411/// Rust prohibits the use of the non-`_INIT` form directly,
1412/// such as in initializers of other `static`s. If in doubt,
1413/// use the corresponding non-`_INIT` reference-typed `static`.
1414///
1415/// This part of the public API will go away if Rust changes
1416/// to make the referent of `pub const FOO: &'static Encoding`
1417/// unique cross-crate or if Rust starts allowing static arrays
1418/// to be initialized with `pub static FOO: &'static Encoding`
1419/// items.
1420pub static ISO_8859_8_INIT: Encoding = Encoding {
1421    name: "ISO-8859-8",
1422    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
1423};
1424
1425/// The ISO-8859-8 encoding.
1426///
1427/// This is the Hebrew part of the ISO/IEC 8859 encoding family in visual order.
1428///
1429/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
1430/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
1431///
1432/// This encoding roughly matches the Windows code page 28598. Windows decodes
1433/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
1434/// Area instead of LRM and RLM. Windows decodes unassigned code points to
1435/// the private use area.
1436///
1437/// This will change from `static` to `const` if Rust changes
1438/// to make the referent of `pub const FOO: &'static Encoding`
1439/// unique cross-crate, so don't take the address of this
1440/// `static`.
1441pub static ISO_8859_8: &'static Encoding = &ISO_8859_8_INIT;
1442
1443/// The initializer for the [ISO-8859-8-I](static.ISO_8859_8_I.html) encoding.
1444///
1445/// For use only for taking the address of this form when
1446/// Rust prohibits the use of the non-`_INIT` form directly,
1447/// such as in initializers of other `static`s. If in doubt,
1448/// use the corresponding non-`_INIT` reference-typed `static`.
1449///
1450/// This part of the public API will go away if Rust changes
1451/// to make the referent of `pub const FOO: &'static Encoding`
1452/// unique cross-crate or if Rust starts allowing static arrays
1453/// to be initialized with `pub static FOO: &'static Encoding`
1454/// items.
1455pub static ISO_8859_8_I_INIT: Encoding = Encoding {
1456    name: "ISO-8859-8-I",
1457    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
1458};
1459
1460/// The ISO-8859-8-I encoding.
1461///
1462/// This is the Hebrew part of the ISO/IEC 8859 encoding family in logical order.
1463///
1464/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
1465/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
1466///
1467/// This encoding roughly matches the Windows code page 38598. Windows decodes
1468/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
1469/// Area instead of LRM and RLM. Windows decodes unassigned code points to
1470/// the private use area.
1471///
1472/// This will change from `static` to `const` if Rust changes
1473/// to make the referent of `pub const FOO: &'static Encoding`
1474/// unique cross-crate, so don't take the address of this
1475/// `static`.
1476pub static ISO_8859_8_I: &'static Encoding = &ISO_8859_8_I_INIT;
1477
1478/// The initializer for the [KOI8-R](static.KOI8_R.html) encoding.
1479///
1480/// For use only for taking the address of this form when
1481/// Rust prohibits the use of the non-`_INIT` form directly,
1482/// such as in initializers of other `static`s. If in doubt,
1483/// use the corresponding non-`_INIT` reference-typed `static`.
1484///
1485/// This part of the public API will go away if Rust changes
1486/// to make the referent of `pub const FOO: &'static Encoding`
1487/// unique cross-crate or if Rust starts allowing static arrays
1488/// to be initialized with `pub static FOO: &'static Encoding`
1489/// items.
1490pub static KOI8_R_INIT: Encoding = Encoding {
1491    name: "KOI8-R",
1492    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_r, 0x044E, 64, 1),
1493};
1494
1495/// The KOI8-R encoding.
1496///
1497/// This is an encoding for Russian from [RFC 1489](https://tools.ietf.org/html/rfc1489).
1498///
1499/// [Index visualization](https://encoding.spec.whatwg.org/koi8-r.html),
1500/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-r-bmp.html)
1501///
1502/// This encoding matches the Windows code page 20866.
1503///
1504/// This will change from `static` to `const` if Rust changes
1505/// to make the referent of `pub const FOO: &'static Encoding`
1506/// unique cross-crate, so don't take the address of this
1507/// `static`.
1508pub static KOI8_R: &'static Encoding = &KOI8_R_INIT;
1509
1510/// The initializer for the [KOI8-U](static.KOI8_U.html) encoding.
1511///
1512/// For use only for taking the address of this form when
1513/// Rust prohibits the use of the non-`_INIT` form directly,
1514/// such as in initializers of other `static`s. If in doubt,
1515/// use the corresponding non-`_INIT` reference-typed `static`.
1516///
1517/// This part of the public API will go away if Rust changes
1518/// to make the referent of `pub const FOO: &'static Encoding`
1519/// unique cross-crate or if Rust starts allowing static arrays
1520/// to be initialized with `pub static FOO: &'static Encoding`
1521/// items.
1522pub static KOI8_U_INIT: Encoding = Encoding {
1523    name: "KOI8-U",
1524    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_u, 0x044E, 64, 1),
1525};
1526
1527/// The KOI8-U encoding.
1528///
1529/// This is an encoding for Ukrainian adapted from KOI8-R.
1530///
1531/// [Index visualization](https://encoding.spec.whatwg.org/koi8-u.html),
1532/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-u-bmp.html)
1533///
1534/// This encoding matches the Windows code page 21866.
1535///
1536/// This will change from `static` to `const` if Rust changes
1537/// to make the referent of `pub const FOO: &'static Encoding`
1538/// unique cross-crate, so don't take the address of this
1539/// `static`.
1540pub static KOI8_U: &'static Encoding = &KOI8_U_INIT;
1541
1542/// The initializer for the [Shift_JIS](static.SHIFT_JIS.html) encoding.
1543///
1544/// For use only for taking the address of this form when
1545/// Rust prohibits the use of the non-`_INIT` form directly,
1546/// such as in initializers of other `static`s. If in doubt,
1547/// use the corresponding non-`_INIT` reference-typed `static`.
1548///
1549/// This part of the public API will go away if Rust changes
1550/// to make the referent of `pub const FOO: &'static Encoding`
1551/// unique cross-crate or if Rust starts allowing static arrays
1552/// to be initialized with `pub static FOO: &'static Encoding`
1553/// items.
1554pub static SHIFT_JIS_INIT: Encoding = Encoding {
1555    name: "Shift_JIS",
1556    variant: VariantEncoding::ShiftJis,
1557};
1558
1559/// The Shift_JIS encoding.
1560///
1561/// This is the Japanese encoding for Windows.
1562///
1563/// [Index visualization](https://encoding.spec.whatwg.org/shift_jis.html),
1564/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/shift_jis-bmp.html)
1565///
1566/// This encoding matches the Windows code page 932, except Windows decodes some byte
1567/// sequences that are error per the Encoding Standard to the question mark or the
1568/// Private Use Area and generally uses U+30FB in place of the REPLACEMENT CHARACTER.
1569///
1570/// This will change from `static` to `const` if Rust changes
1571/// to make the referent of `pub const FOO: &'static Encoding`
1572/// unique cross-crate, so don't take the address of this
1573/// `static`.
1574pub static SHIFT_JIS: &'static Encoding = &SHIFT_JIS_INIT;
1575
1576/// The initializer for the [UTF-16BE](static.UTF_16BE.html) encoding.
1577///
1578/// For use only for taking the address of this form when
1579/// Rust prohibits the use of the non-`_INIT` form directly,
1580/// such as in initializers of other `static`s. If in doubt,
1581/// use the corresponding non-`_INIT` reference-typed `static`.
1582///
1583/// This part of the public API will go away if Rust changes
1584/// to make the referent of `pub const FOO: &'static Encoding`
1585/// unique cross-crate or if Rust starts allowing static arrays
1586/// to be initialized with `pub static FOO: &'static Encoding`
1587/// items.
1588pub static UTF_16BE_INIT: Encoding = Encoding {
1589    name: "UTF-16BE",
1590    variant: VariantEncoding::Utf16Be,
1591};
1592
1593/// The UTF-16BE encoding.
1594///
1595/// This decode-only encoding uses 16-bit code units due to Unicode originally
1596/// having been designed as a 16-bit reportoire. In the absence of a byte order
1597/// mark the big endian byte order is assumed.
1598///
1599/// There is no corresponding encoder in this crate or in the Encoding
1600/// Standard. The output encoding of this encoding is UTF-8.
1601///
1602/// This encoding matches the Windows code page 1201.
1603///
1604/// This will change from `static` to `const` if Rust changes
1605/// to make the referent of `pub const FOO: &'static Encoding`
1606/// unique cross-crate, so don't take the address of this
1607/// `static`.
1608pub static UTF_16BE: &'static Encoding = &UTF_16BE_INIT;
1609
1610/// The initializer for the [UTF-16LE](static.UTF_16LE.html) encoding.
1611///
1612/// For use only for taking the address of this form when
1613/// Rust prohibits the use of the non-`_INIT` form directly,
1614/// such as in initializers of other `static`s. If in doubt,
1615/// use the corresponding non-`_INIT` reference-typed `static`.
1616///
1617/// This part of the public API will go away if Rust changes
1618/// to make the referent of `pub const FOO: &'static Encoding`
1619/// unique cross-crate or if Rust starts allowing static arrays
1620/// to be initialized with `pub static FOO: &'static Encoding`
1621/// items.
1622pub static UTF_16LE_INIT: Encoding = Encoding {
1623    name: "UTF-16LE",
1624    variant: VariantEncoding::Utf16Le,
1625};
1626
1627/// The UTF-16LE encoding.
1628///
1629/// This decode-only encoding uses 16-bit code units due to Unicode originally
1630/// having been designed as a 16-bit reportoire. In the absence of a byte order
1631/// mark the little endian byte order is assumed.
1632///
1633/// There is no corresponding encoder in this crate or in the Encoding
1634/// Standard. The output encoding of this encoding is UTF-8.
1635///
1636/// This encoding matches the Windows code page 1200.
1637///
1638/// This will change from `static` to `const` if Rust changes
1639/// to make the referent of `pub const FOO: &'static Encoding`
1640/// unique cross-crate, so don't take the address of this
1641/// `static`.
1642pub static UTF_16LE: &'static Encoding = &UTF_16LE_INIT;
1643
1644/// The initializer for the [UTF-8](static.UTF_8.html) encoding.
1645///
1646/// For use only for taking the address of this form when
1647/// Rust prohibits the use of the non-`_INIT` form directly,
1648/// such as in initializers of other `static`s. If in doubt,
1649/// use the corresponding non-`_INIT` reference-typed `static`.
1650///
1651/// This part of the public API will go away if Rust changes
1652/// to make the referent of `pub const FOO: &'static Encoding`
1653/// unique cross-crate or if Rust starts allowing static arrays
1654/// to be initialized with `pub static FOO: &'static Encoding`
1655/// items.
1656pub static UTF_8_INIT: Encoding = Encoding {
1657    name: "UTF-8",
1658    variant: VariantEncoding::Utf8,
1659};
1660
1661/// The UTF-8 encoding.
1662///
1663/// This is the encoding that should be used for all new development it can
1664/// represent all of Unicode.
1665///
1666/// This encoding matches the Windows code page 65001, except Windows differs
1667/// in the number of errors generated for some erroneous byte sequences.
1668///
1669/// This will change from `static` to `const` if Rust changes
1670/// to make the referent of `pub const FOO: &'static Encoding`
1671/// unique cross-crate, so don't take the address of this
1672/// `static`.
1673pub static UTF_8: &'static Encoding = &UTF_8_INIT;
1674
1675/// The initializer for the [gb18030](static.GB18030.html) encoding.
1676///
1677/// For use only for taking the address of this form when
1678/// Rust prohibits the use of the non-`_INIT` form directly,
1679/// such as in initializers of other `static`s. If in doubt,
1680/// use the corresponding non-`_INIT` reference-typed `static`.
1681///
1682/// This part of the public API will go away if Rust changes
1683/// to make the referent of `pub const FOO: &'static Encoding`
1684/// unique cross-crate or if Rust starts allowing static arrays
1685/// to be initialized with `pub static FOO: &'static Encoding`
1686/// items.
1687pub static GB18030_INIT: Encoding = Encoding {
1688    name: "gb18030",
1689    variant: VariantEncoding::Gb18030,
1690};
1691
1692/// The gb18030 encoding.
1693///
1694/// This encoding matches GB18030-2022 except the two-byte sequence 0xA3 0xA0
1695/// maps to U+3000 for compatibility with existing Web content and the four-byte
1696/// sequences for the non-PUA characters that got two-byte sequences still decode
1697/// to the same non-PUA characters as in GB18030-2005. As a result, this encoding
1698/// can represent all of Unicode except for 19 private-use characters.
1699///
1700/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
1701/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
1702///
1703/// This encoding matches the Windows code page 54936.
1704///
1705/// This will change from `static` to `const` if Rust changes
1706/// to make the referent of `pub const FOO: &'static Encoding`
1707/// unique cross-crate, so don't take the address of this
1708/// `static`.
1709pub static GB18030: &'static Encoding = &GB18030_INIT;
1710
1711/// The initializer for the [macintosh](static.MACINTOSH.html) encoding.
1712///
1713/// For use only for taking the address of this form when
1714/// Rust prohibits the use of the non-`_INIT` form directly,
1715/// such as in initializers of other `static`s. If in doubt,
1716/// use the corresponding non-`_INIT` reference-typed `static`.
1717///
1718/// This part of the public API will go away if Rust changes
1719/// to make the referent of `pub const FOO: &'static Encoding`
1720/// unique cross-crate or if Rust starts allowing static arrays
1721/// to be initialized with `pub static FOO: &'static Encoding`
1722/// items.
1723pub static MACINTOSH_INIT: Encoding = Encoding {
1724    name: "macintosh",
1725    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.macintosh, 0x00CD, 106, 3),
1726};
1727
1728/// The macintosh encoding.
1729///
1730/// This is the MacRoman encoding from Mac OS Classic.
1731///
1732/// [Index visualization](https://encoding.spec.whatwg.org/macintosh.html),
1733/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/macintosh-bmp.html)
1734///
1735/// This encoding matches the Windows code page 10000, except Windows decodes
1736/// 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
1737///
1738/// This will change from `static` to `const` if Rust changes
1739/// to make the referent of `pub const FOO: &'static Encoding`
1740/// unique cross-crate, so don't take the address of this
1741/// `static`.
1742pub static MACINTOSH: &'static Encoding = &MACINTOSH_INIT;
1743
1744/// The initializer for the [replacement](static.REPLACEMENT.html) encoding.
1745///
1746/// For use only for taking the address of this form when
1747/// Rust prohibits the use of the non-`_INIT` form directly,
1748/// such as in initializers of other `static`s. If in doubt,
1749/// use the corresponding non-`_INIT` reference-typed `static`.
1750///
1751/// This part of the public API will go away if Rust changes
1752/// to make the referent of `pub const FOO: &'static Encoding`
1753/// unique cross-crate or if Rust starts allowing static arrays
1754/// to be initialized with `pub static FOO: &'static Encoding`
1755/// items.
1756pub static REPLACEMENT_INIT: Encoding = Encoding {
1757    name: "replacement",
1758    variant: VariantEncoding::Replacement,
1759};
1760
1761/// The replacement encoding.
1762///
1763/// This decode-only encoding decodes all non-zero-length streams to a single
1764/// REPLACEMENT CHARACTER. Its purpose is to avoid the use of an
1765/// ASCII-compatible fallback encoding (typically windows-1252) for some
1766/// encodings that are no longer supported by the Web Platform and that
1767/// would be dangerous to treat as ASCII-compatible.
1768///
1769/// There is no corresponding encoder. The output encoding of this encoding
1770/// is UTF-8.
1771///
1772/// This encoding does not have a Windows code page number.
1773///
1774/// This will change from `static` to `const` if Rust changes
1775/// to make the referent of `pub const FOO: &'static Encoding`
1776/// unique cross-crate, so don't take the address of this
1777/// `static`.
1778pub static REPLACEMENT: &'static Encoding = &REPLACEMENT_INIT;
1779
1780/// The initializer for the [windows-1250](static.WINDOWS_1250.html) encoding.
1781///
1782/// For use only for taking the address of this form when
1783/// Rust prohibits the use of the non-`_INIT` form directly,
1784/// such as in initializers of other `static`s. If in doubt,
1785/// use the corresponding non-`_INIT` reference-typed `static`.
1786///
1787/// This part of the public API will go away if Rust changes
1788/// to make the referent of `pub const FOO: &'static Encoding`
1789/// unique cross-crate or if Rust starts allowing static arrays
1790/// to be initialized with `pub static FOO: &'static Encoding`
1791/// items.
1792pub static WINDOWS_1250_INIT: Encoding = Encoding {
1793    name: "windows-1250",
1794    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1250, 0x00DC, 92, 2),
1795};
1796
1797/// The windows-1250 encoding.
1798///
1799/// This is the Central European encoding for Windows.
1800///
1801/// [Index visualization](https://encoding.spec.whatwg.org/windows-1250.html),
1802/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1250-bmp.html)
1803///
1804/// This encoding matches the Windows code page 1250.
1805///
1806/// This will change from `static` to `const` if Rust changes
1807/// to make the referent of `pub const FOO: &'static Encoding`
1808/// unique cross-crate, so don't take the address of this
1809/// `static`.
1810pub static WINDOWS_1250: &'static Encoding = &WINDOWS_1250_INIT;
1811
1812/// The initializer for the [windows-1251](static.WINDOWS_1251.html) encoding.
1813///
1814/// For use only for taking the address of this form when
1815/// Rust prohibits the use of the non-`_INIT` form directly,
1816/// such as in initializers of other `static`s. If in doubt,
1817/// use the corresponding non-`_INIT` reference-typed `static`.
1818///
1819/// This part of the public API will go away if Rust changes
1820/// to make the referent of `pub const FOO: &'static Encoding`
1821/// unique cross-crate or if Rust starts allowing static arrays
1822/// to be initialized with `pub static FOO: &'static Encoding`
1823/// items.
1824pub static WINDOWS_1251_INIT: Encoding = Encoding {
1825    name: "windows-1251",
1826    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1251, 0x0410, 64, 64),
1827};
1828
1829/// The windows-1251 encoding.
1830///
1831/// This is the Cyrillic encoding for Windows.
1832///
1833/// [Index visualization](https://encoding.spec.whatwg.org/windows-1251.html),
1834/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1251-bmp.html)
1835///
1836/// This encoding matches the Windows code page 1251.
1837///
1838/// This will change from `static` to `const` if Rust changes
1839/// to make the referent of `pub const FOO: &'static Encoding`
1840/// unique cross-crate, so don't take the address of this
1841/// `static`.
1842pub static WINDOWS_1251: &'static Encoding = &WINDOWS_1251_INIT;
1843
1844/// The initializer for the [windows-1252](static.WINDOWS_1252.html) encoding.
1845///
1846/// For use only for taking the address of this form when
1847/// Rust prohibits the use of the non-`_INIT` form directly,
1848/// such as in initializers of other `static`s. If in doubt,
1849/// use the corresponding non-`_INIT` reference-typed `static`.
1850///
/// Within this file, `ENCODINGS_IN_LABEL_SORT` is one such `static`: its
/// initializer stores `&WINDOWS_1252_INIT` for every label that resolves to
/// this encoding.
///
1851/// This part of the public API will go away if Rust changes
1852/// to make the referent of `pub const FOO: &'static Encoding`
1853/// unique cross-crate or if Rust starts allowing static arrays
1854/// to be initialized with `pub static FOO: &'static Encoding`
1855/// items.
1856pub static WINDOWS_1252_INIT: Encoding = Encoding {
    // The encoding's name; "windows-1252" also appears as a label in `LABELS_SORTED`.
1857    name: "windows-1252",
    // NOTE(review): the values after the table reference (0x00A0, 32, 96) are
    // interpreted by `VariantEncoding::SingleByte`, which is declared outside
    // this excerpt; check their meaning there before editing them.
1858    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1252, 0x00A0, 32, 96),
1859};
1860
1861/// The windows-1252 encoding.
1862///
1863/// This is the Western encoding for Windows. It is an extension of ISO-8859-1,
1864/// which is known as Latin 1.
1865///
1866/// [Index visualization](https://encoding.spec.whatwg.org/windows-1252.html),
1867/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1252-bmp.html)
1868///
1869/// This encoding matches the Windows code page 1252.
1870///
1871/// This will change from `static` to `const` if Rust changes
1872/// to make the referent of `pub const FOO: &'static Encoding`
1873/// unique cross-crate, so don't take the address of this
1874/// `static`.
///
/// The reference stored here points at
/// [`WINDOWS_1252_INIT`](static.WINDOWS_1252_INIT.html).
1875pub static WINDOWS_1252: &'static Encoding = &WINDOWS_1252_INIT;
1876
1877/// The initializer for the [windows-1253](static.WINDOWS_1253.html) encoding.
1878///
1879/// For use only for taking the address of this form when
1880/// Rust prohibits the use of the non-`_INIT` form directly,
1881/// such as in initializers of other `static`s. If in doubt,
1882/// use the corresponding non-`_INIT` reference-typed `static`.
1883///
/// Within this file, `ENCODINGS_IN_LABEL_SORT` is one such `static`: its
/// initializer stores `&WINDOWS_1253_INIT`.
///
1884/// This part of the public API will go away if Rust changes
1885/// to make the referent of `pub const FOO: &'static Encoding`
1886/// unique cross-crate or if Rust starts allowing static arrays
1887/// to be initialized with `pub static FOO: &'static Encoding`
1888/// items.
1889pub static WINDOWS_1253_INIT: Encoding = Encoding {
    // The encoding's name; "windows-1253" also appears as a label in `LABELS_SORTED`.
1890    name: "windows-1253",
    // NOTE(review): the values after the table reference (0x03A3, 83, 44) are
    // interpreted by `VariantEncoding::SingleByte`, which is declared outside
    // this excerpt; check their meaning there before editing them.
1891    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1253, 0x03A3, 83, 44),
1892};
1893
1894/// The windows-1253 encoding.
1895///
1896/// This is the Greek encoding for Windows. It is mostly an extension of
1897/// ISO-8859-7, but U+0386 is mapped to a different byte.
1898///
1899/// [Index visualization](https://encoding.spec.whatwg.org/windows-1253.html),
1900/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1253-bmp.html)
1901///
1902/// This encoding matches the Windows code page 1253, except Windows decodes
1903/// unassigned code points to the Private Use Area of Unicode.
1904///
1905/// This will change from `static` to `const` if Rust changes
1906/// to make the referent of `pub const FOO: &'static Encoding`
1907/// unique cross-crate, so don't take the address of this
1908/// `static`.
///
/// The reference stored here points at
/// [`WINDOWS_1253_INIT`](static.WINDOWS_1253_INIT.html).
1909pub static WINDOWS_1253: &'static Encoding = &WINDOWS_1253_INIT;
1910
1911/// The initializer for the [windows-1254](static.WINDOWS_1254.html) encoding.
1912///
1913/// For use only for taking the address of this form when
1914/// Rust prohibits the use of the non-`_INIT` form directly,
1915/// such as in initializers of other `static`s. If in doubt,
1916/// use the corresponding non-`_INIT` reference-typed `static`.
1917///
/// Within this file, `ENCODINGS_IN_LABEL_SORT` is one such `static`: its
/// initializer stores `&WINDOWS_1254_INIT` for every label that resolves to
/// this encoding.
///
1918/// This part of the public API will go away if Rust changes
1919/// to make the referent of `pub const FOO: &'static Encoding`
1920/// unique cross-crate or if Rust starts allowing static arrays
1921/// to be initialized with `pub static FOO: &'static Encoding`
1922/// items.
1923pub static WINDOWS_1254_INIT: Encoding = Encoding {
    // The encoding's name; "windows-1254" also appears as a label in `LABELS_SORTED`.
1924    name: "windows-1254",
    // NOTE(review): the values after the table reference (0x00DF, 95, 17) are
    // interpreted by `VariantEncoding::SingleByte`, which is declared outside
    // this excerpt; check their meaning there before editing them.
1925    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1254, 0x00DF, 95, 17),
1926};
1927
1928/// The windows-1254 encoding.
1929///
1930/// This is the Turkish encoding for Windows. It is an extension of ISO-8859-9,
1931/// which is known as Latin 5.
1932///
1933/// [Index visualization](https://encoding.spec.whatwg.org/windows-1254.html),
1934/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1254-bmp.html)
1935///
1936/// This encoding matches the Windows code page 1254.
1937///
1938/// This will change from `static` to `const` if Rust changes
1939/// to make the referent of `pub const FOO: &'static Encoding`
1940/// unique cross-crate, so don't take the address of this
1941/// `static`.
///
/// The reference stored here points at
/// [`WINDOWS_1254_INIT`](static.WINDOWS_1254_INIT.html).
1942pub static WINDOWS_1254: &'static Encoding = &WINDOWS_1254_INIT;
1943
1944/// The initializer for the [windows-1255](static.WINDOWS_1255.html) encoding.
1945///
1946/// For use only for taking the address of this form when
1947/// Rust prohibits the use of the non-`_INIT` form directly,
1948/// such as in initializers of other `static`s. If in doubt,
1949/// use the corresponding non-`_INIT` reference-typed `static`.
1950///
/// Within this file, `ENCODINGS_IN_LABEL_SORT` is one such `static`: its
/// initializer stores `&WINDOWS_1255_INIT`.
///
1951/// This part of the public API will go away if Rust changes
1952/// to make the referent of `pub const FOO: &'static Encoding`
1953/// unique cross-crate or if Rust starts allowing static arrays
1954/// to be initialized with `pub static FOO: &'static Encoding`
1955/// items.
1956pub static WINDOWS_1255_INIT: Encoding = Encoding {
    // The encoding's name; "windows-1255" also appears as a label in `LABELS_SORTED`.
1957    name: "windows-1255",
    // NOTE(review): the values after the table reference (0x05D0, 96, 27) are
    // interpreted by `VariantEncoding::SingleByte`, which is declared outside
    // this excerpt; check their meaning there before editing them.
1958    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1255, 0x05D0, 96, 27),
1959};
1960
1961/// The windows-1255 encoding.
1962///
1963/// This is the Hebrew encoding for Windows. It is an extension of ISO-8859-8-I,
1964/// except for a currency sign swap.
1965///
1966/// [Index visualization](https://encoding.spec.whatwg.org/windows-1255.html),
1967/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1255-bmp.html)
1968///
1969/// This encoding matches the Windows code page 1255, except Windows decodes
1970/// unassigned code points to the Private Use Area of Unicode.
1971///
1972/// This will change from `static` to `const` if Rust changes
1973/// to make the referent of `pub const FOO: &'static Encoding`
1974/// unique cross-crate, so don't take the address of this
1975/// `static`.
///
/// The reference stored here points at
/// [`WINDOWS_1255_INIT`](static.WINDOWS_1255_INIT.html).
1976pub static WINDOWS_1255: &'static Encoding = &WINDOWS_1255_INIT;
1977
1978/// The initializer for the [windows-1256](static.WINDOWS_1256.html) encoding.
1979///
1980/// For use only for taking the address of this form when
1981/// Rust prohibits the use of the non-`_INIT` form directly,
1982/// such as in initializers of other `static`s. If in doubt,
1983/// use the corresponding non-`_INIT` reference-typed `static`.
1984///
/// Within this file, `ENCODINGS_IN_LABEL_SORT` is one such `static`: its
/// initializer stores `&WINDOWS_1256_INIT`.
///
1985/// This part of the public API will go away if Rust changes
1986/// to make the referent of `pub const FOO: &'static Encoding`
1987/// unique cross-crate or if Rust starts allowing static arrays
1988/// to be initialized with `pub static FOO: &'static Encoding`
1989/// items.
1990pub static WINDOWS_1256_INIT: Encoding = Encoding {
    // The encoding's name; "windows-1256" also appears as a label in `LABELS_SORTED`.
1991    name: "windows-1256",
    // NOTE(review): the values after the table reference (0x0621, 65, 22) are
    // interpreted by `VariantEncoding::SingleByte`, which is declared outside
    // this excerpt; check their meaning there before editing them.
1992    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1256, 0x0621, 65, 22),
1993};
1994
1995/// The windows-1256 encoding.
1996///
1997/// This is the Arabic encoding for Windows.
1998///
1999/// [Index visualization](https://encoding.spec.whatwg.org/windows-1256.html),
2000/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1256-bmp.html)
2001///
2002/// This encoding matches the Windows code page 1256.
2003///
2004/// This will change from `static` to `const` if Rust changes
2005/// to make the referent of `pub const FOO: &'static Encoding`
2006/// unique cross-crate, so don't take the address of this
2007/// `static`.
///
/// The reference stored here points at
/// [`WINDOWS_1256_INIT`](static.WINDOWS_1256_INIT.html).
2008pub static WINDOWS_1256: &'static Encoding = &WINDOWS_1256_INIT;
2009
2010/// The initializer for the [windows-1257](static.WINDOWS_1257.html) encoding.
2011///
2012/// For use only for taking the address of this form when
2013/// Rust prohibits the use of the non-`_INIT` form directly,
2014/// such as in initializers of other `static`s. If in doubt,
2015/// use the corresponding non-`_INIT` reference-typed `static`.
2016///
/// Within this file, `ENCODINGS_IN_LABEL_SORT` is one such `static`: its
/// initializer stores `&WINDOWS_1257_INIT`.
///
2017/// This part of the public API will go away if Rust changes
2018/// to make the referent of `pub const FOO: &'static Encoding`
2019/// unique cross-crate or if Rust starts allowing static arrays
2020/// to be initialized with `pub static FOO: &'static Encoding`
2021/// items.
2022pub static WINDOWS_1257_INIT: Encoding = Encoding {
    // The encoding's name; "windows-1257" also appears as a label in `LABELS_SORTED`.
2023    name: "windows-1257",
    // NOTE(review): the values after the table reference (0x00DF, 95, 1) are
    // interpreted by `VariantEncoding::SingleByte`, which is declared outside
    // this excerpt; check their meaning there before editing them.
2024    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1257, 0x00DF, 95, 1),
2025};
2026
2027/// The windows-1257 encoding.
2028///
2029/// This is the Baltic encoding for Windows.
2030///
2031/// [Index visualization](https://encoding.spec.whatwg.org/windows-1257.html),
2032/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1257-bmp.html)
2033///
2034/// This encoding matches the Windows code page 1257, except Windows decodes
2035/// unassigned code points to the Private Use Area of Unicode.
2036///
2037/// This will change from `static` to `const` if Rust changes
2038/// to make the referent of `pub const FOO: &'static Encoding`
2039/// unique cross-crate, so don't take the address of this
2040/// `static`.
///
/// The reference stored here points at
/// [`WINDOWS_1257_INIT`](static.WINDOWS_1257_INIT.html).
2041pub static WINDOWS_1257: &'static Encoding = &WINDOWS_1257_INIT;
2042
2043/// The initializer for the [windows-1258](static.WINDOWS_1258.html) encoding.
2044///
2045/// For use only for taking the address of this form when
2046/// Rust prohibits the use of the non-`_INIT` form directly,
2047/// such as in initializers of other `static`s. If in doubt,
2048/// use the corresponding non-`_INIT` reference-typed `static`.
2049///
/// Within this file, `ENCODINGS_IN_LABEL_SORT` is one such `static`: its
/// initializer stores `&WINDOWS_1258_INIT`.
///
2050/// This part of the public API will go away if Rust changes
2051/// to make the referent of `pub const FOO: &'static Encoding`
2052/// unique cross-crate or if Rust starts allowing static arrays
2053/// to be initialized with `pub static FOO: &'static Encoding`
2054/// items.
2055pub static WINDOWS_1258_INIT: Encoding = Encoding {
    // The encoding's name; "windows-1258" also appears as a label in `LABELS_SORTED`.
2056    name: "windows-1258",
    // NOTE(review): the values after the table reference (0x00DF, 95, 4) are
    // interpreted by `VariantEncoding::SingleByte`, which is declared outside
    // this excerpt; check their meaning there before editing them.
2057    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1258, 0x00DF, 95, 4),
2058};
2059
2060/// The windows-1258 encoding.
2061///
2062/// This is the Vietnamese encoding for Windows.
2063///
2064/// [Index visualization](https://encoding.spec.whatwg.org/windows-1258.html),
2065/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1258-bmp.html)
2066///
2067/// This encoding matches the Windows code page 1258 when used in the
2068/// non-normalizing mode. Unlike with the other single-byte encodings, the
2069/// result of decoding is not necessarily in Normalization Form C. On the
2070/// other hand, input in the Normalization Form C is not encoded without
2071/// replacement. In general, it's a bad idea to encode to encodings other
2072/// than UTF-8, but this encoding is especially hazardous to encode to.
2073///
2074/// This will change from `static` to `const` if Rust changes
2075/// to make the referent of `pub const FOO: &'static Encoding`
2076/// unique cross-crate, so don't take the address of this
2077/// `static`.
///
/// The reference stored here points at
/// [`WINDOWS_1258_INIT`](static.WINDOWS_1258_INIT.html).
2078pub static WINDOWS_1258: &'static Encoding = &WINDOWS_1258_INIT;
2079
2080/// The initializer for the [windows-874](static.WINDOWS_874.html) encoding.
2081///
2082/// For use only for taking the address of this form when
2083/// Rust prohibits the use of the non-`_INIT` form directly,
2084/// such as in initializers of other `static`s. If in doubt,
2085/// use the corresponding non-`_INIT` reference-typed `static`.
2086///
/// Within this file, `ENCODINGS_IN_LABEL_SORT` is one such `static`: its
/// initializer stores `&WINDOWS_874_INIT` for every label that resolves to
/// this encoding.
///
2087/// This part of the public API will go away if Rust changes
2088/// to make the referent of `pub const FOO: &'static Encoding`
2089/// unique cross-crate or if Rust starts allowing static arrays
2090/// to be initialized with `pub static FOO: &'static Encoding`
2091/// items.
2092pub static WINDOWS_874_INIT: Encoding = Encoding {
    // The encoding's name; "windows-874" also appears as a label in `LABELS_SORTED`.
2093    name: "windows-874",
    // NOTE(review): the values after the table reference (0x0E01, 33, 58) are
    // interpreted by `VariantEncoding::SingleByte`, which is declared outside
    // this excerpt; check their meaning there before editing them.
2094    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_874, 0x0E01, 33, 58),
2095};
2096
2097/// The windows-874 encoding.
2098///
2099/// This is the Thai encoding for Windows. It is an extension of TIS-620 / ISO-8859-11.
2100///
2101/// [Index visualization](https://encoding.spec.whatwg.org/windows-874.html),
2102/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-874-bmp.html)
2103///
2104/// This encoding matches the Windows code page 874, except Windows decodes
2105/// unassigned code points to the Private Use Area of Unicode.
2106///
2107/// This will change from `static` to `const` if Rust changes
2108/// to make the referent of `pub const FOO: &'static Encoding`
2109/// unique cross-crate, so don't take the address of this
2110/// `static`.
///
/// The reference stored here points at
/// [`WINDOWS_874_INIT`](static.WINDOWS_874_INIT.html).
2111pub static WINDOWS_874: &'static Encoding = &WINDOWS_874_INIT;
2112
2113/// The initializer for the [x-mac-cyrillic](static.X_MAC_CYRILLIC.html) encoding.
2114///
2115/// For use only for taking the address of this form when
2116/// Rust prohibits the use of the non-`_INIT` form directly,
2117/// such as in initializers of other `static`s. If in doubt,
2118/// use the corresponding non-`_INIT` reference-typed `static`.
2119///
/// Within this file, `ENCODINGS_IN_LABEL_SORT` is one such `static`: its
/// initializer stores `&X_MAC_CYRILLIC_INIT`.
///
2120/// This part of the public API will go away if Rust changes
2121/// to make the referent of `pub const FOO: &'static Encoding`
2122/// unique cross-crate or if Rust starts allowing static arrays
2123/// to be initialized with `pub static FOO: &'static Encoding`
2124/// items.
2125pub static X_MAC_CYRILLIC_INIT: Encoding = Encoding {
    // The encoding's name; "x-mac-cyrillic" also appears as a label in `LABELS_SORTED`.
2126    name: "x-mac-cyrillic",
    // NOTE(review): the values after the table reference (0x0430, 96, 31) are
    // interpreted by `VariantEncoding::SingleByte`, which is declared outside
    // this excerpt; check their meaning there before editing them.
2127    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.x_mac_cyrillic, 0x0430, 96, 31),
2128};
2129
2130/// The x-mac-cyrillic encoding.
2131///
2132/// This is the MacUkrainian encoding from Mac OS Classic.
2133///
2134/// [Index visualization](https://encoding.spec.whatwg.org/x-mac-cyrillic.html),
2135/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/x-mac-cyrillic-bmp.html)
2136///
2137/// This encoding matches the Windows code page 10017.
2138///
2139/// This will change from `static` to `const` if Rust changes
2140/// to make the referent of `pub const FOO: &'static Encoding`
2141/// unique cross-crate, so don't take the address of this
2142/// `static`.
///
/// The reference stored here points at
/// [`X_MAC_CYRILLIC_INIT`](static.X_MAC_CYRILLIC_INIT.html).
2143pub static X_MAC_CYRILLIC: &'static Encoding = &X_MAC_CYRILLIC_INIT;
2144
2145/// The initializer for the [x-user-defined](static.X_USER_DEFINED.html) encoding.
2146///
2147/// For use only for taking the address of this form when
2148/// Rust prohibits the use of the non-`_INIT` form directly,
2149/// such as in initializers of other `static`s. If in doubt,
2150/// use the corresponding non-`_INIT` reference-typed `static`.
2151///
/// Within this file, `ENCODINGS_IN_LABEL_SORT` is one such `static`: its
/// initializer stores `&X_USER_DEFINED_INIT`.
///
2152/// This part of the public API will go away if Rust changes
2153/// to make the referent of `pub const FOO: &'static Encoding`
2154/// unique cross-crate or if Rust starts allowing static arrays
2155/// to be initialized with `pub static FOO: &'static Encoding`
2156/// items.
2157pub static X_USER_DEFINED_INIT: Encoding = Encoding {
    // The encoding's name; "x-user-defined" also appears as a label in `LABELS_SORTED`.
2158    name: "x-user-defined",
    // Unlike the `SingleByte` initializers earlier in this file, this variant
    // is written without a table reference or any other argument.
2159    variant: VariantEncoding::UserDefined,
2160};
2161
2162/// The x-user-defined encoding.
2163///
2164/// This encoding offsets the non-ASCII bytes by `0xF700` thereby decoding
2165/// them to the Private Use Area of Unicode. It was used for loading binary
2166/// data into a JavaScript string using `XMLHttpRequest` before XHR supported
2167/// the `"arraybuffer"` response type.
2168///
2169/// This encoding does not have a Windows code page number.
2170///
2171/// This will change from `static` to `const` if Rust changes
2172/// to make the referent of `pub const FOO: &'static Encoding`
2173/// unique cross-crate, so don't take the address of this
2174/// `static`.
///
/// The reference stored here points at
/// [`X_USER_DEFINED_INIT`](static.X_USER_DEFINED_INIT.html).
2175pub static X_USER_DEFINED: &'static Encoding = &X_USER_DEFINED_INIT;
2176
// Encoding labels, all written in lowercase. The array has 228 entries, the
// same length as `ENCODINGS_IN_LABEL_SORT`; the two arrays are paired by
// index, so the label at position `i` here goes with the encoding at position
// `i` there (for example, position 0 pairs "l1" with `&WINDOWS_1252_INIT`).
//
// Entries are grouped by increasing length.
// NOTE(review): within one length the order looks like a comparison of the
// bytes from the last one backwards ("866", "mac", "koi", "gbk") -- confirm
// against the code that searches this table, which is outside this excerpt,
// before reordering or adding entries.
//
// This is generated data (see the "END GENERATED CODE" marker after the
// tables); the file header states that the label data is generated from
// WHATWG's encodings.json.
2177static LABELS_SORTED: [&'static str; 228] = [
2178    "l1",
2179    "l2",
2180    "l3",
2181    "l4",
2182    "l5",
2183    "l6",
2184    "l9",
2185    "866",
2186    "mac",
2187    "koi",
2188    "gbk",
2189    "big5",
2190    "utf8",
2191    "koi8",
2192    "sjis",
2193    "ucs-2",
2194    "ms932",
2195    "cp866",
2196    "utf-8",
2197    "cp819",
2198    "ascii",
2199    "x-gbk",
2200    "greek",
2201    "cp1250",
2202    "cp1251",
2203    "latin1",
2204    "gb2312",
2205    "cp1252",
2206    "latin2",
2207    "cp1253",
2208    "latin3",
2209    "cp1254",
2210    "latin4",
2211    "cp1255",
2212    "csbig5",
2213    "latin5",
2214    "utf-16",
2215    "cp1256",
2216    "ibm866",
2217    "latin6",
2218    "cp1257",
2219    "cp1258",
2220    "greek8",
2221    "ibm819",
2222    "arabic",
2223    "visual",
2224    "korean",
2225    "euc-jp",
2226    "koi8-r",
2227    "koi8_r",
2228    "euc-kr",
2229    "x-sjis",
2230    "koi8-u",
2231    "hebrew",
2232    "tis-620",
2233    "gb18030",
2234    "ksc5601",
2235    "gb_2312",
2236    "dos-874",
2237    "cn-big5",
2238    "unicode",
2239    "chinese",
2240    "logical",
2241    "cskoi8r",
2242    "cseuckr",
2243    "koi8-ru",
2244    "x-cp1250",
2245    "ksc_5601",
2246    "x-cp1251",
2247    "iso88591",
2248    "csgb2312",
2249    "x-cp1252",
2250    "iso88592",
2251    "x-cp1253",
2252    "iso88593",
2253    "ecma-114",
2254    "x-cp1254",
2255    "iso88594",
2256    "x-cp1255",
2257    "iso88595",
2258    "x-x-big5",
2259    "x-cp1256",
2260    "csibm866",
2261    "iso88596",
2262    "x-cp1257",
2263    "iso88597",
2264    "asmo-708",
2265    "ecma-118",
2266    "elot_928",
2267    "x-cp1258",
2268    "iso88598",
2269    "iso88599",
2270    "cyrillic",
2271    "utf-16be",
2272    "utf-16le",
2273    "us-ascii",
2274    "ms_kanji",
2275    "x-euc-jp",
2276    "iso885910",
2277    "iso8859-1",
2278    "iso885911",
2279    "iso8859-2",
2280    "iso8859-3",
2281    "iso885913",
2282    "iso8859-4",
2283    "iso885914",
2284    "iso8859-5",
2285    "iso885915",
2286    "iso8859-6",
2287    "iso8859-7",
2288    "iso8859-8",
2289    "iso-ir-58",
2290    "iso8859-9",
2291    "csunicode",
2292    "macintosh",
2293    "shift-jis",
2294    "shift_jis",
2295    "iso-ir-100",
2296    "iso8859-10",
2297    "iso-ir-110",
2298    "gb_2312-80",
2299    "iso-8859-1",
2300    "iso_8859-1",
2301    "iso-ir-101",
2302    "iso8859-11",
2303    "iso-8859-2",
2304    "iso_8859-2",
2305    "hz-gb-2312",
2306    "iso-8859-3",
2307    "iso_8859-3",
2308    "iso8859-13",
2309    "iso-8859-4",
2310    "iso_8859-4",
2311    "iso8859-14",
2312    "iso-ir-144",
2313    "iso-8859-5",
2314    "iso_8859-5",
2315    "iso8859-15",
2316    "iso-8859-6",
2317    "iso_8859-6",
2318    "iso-ir-126",
2319    "iso-8859-7",
2320    "iso_8859-7",
2321    "iso-ir-127",
2322    "iso-ir-157",
2323    "iso-8859-8",
2324    "iso_8859-8",
2325    "iso-ir-138",
2326    "iso-ir-148",
2327    "iso-8859-9",
2328    "iso_8859-9",
2329    "iso-ir-109",
2330    "iso-ir-149",
2331    "big5-hkscs",
2332    "csshiftjis",
2333    "iso-8859-10",
2334    "iso-8859-11",
2335    "csisolatin1",
2336    "csisolatin2",
2337    "iso-8859-13",
2338    "csisolatin3",
2339    "iso-8859-14",
2340    "windows-874",
2341    "csisolatin4",
2342    "iso-8859-15",
2343    "iso_8859-15",
2344    "csisolatin5",
2345    "iso-8859-16",
2346    "csisolatin6",
2347    "windows-949",
2348    "csisolatin9",
2349    "csiso88596e",
2350    "csiso88598e",
2351    "unicodefffe",
2352    "unicodefeff",
2353    "csmacintosh",
2354    "csiso88596i",
2355    "csiso88598i",
2356    "windows-31j",
2357    "x-mac-roman",
2358    "iso-2022-cn",
2359    "iso-2022-jp",
2360    "csiso2022jp",
2361    "iso-2022-kr",
2362    "csiso2022kr",
2363    "replacement",
2364    "windows-1250",
2365    "windows-1251",
2366    "windows-1252",
2367    "windows-1253",
2368    "windows-1254",
2369    "windows-1255",
2370    "windows-1256",
2371    "windows-1257",
2372    "windows-1258",
2373    "iso-8859-6-e",
2374    "iso-8859-8-e",
2375    "iso-8859-6-i",
2376    "iso-8859-8-i",
2377    "sun_eu_greek",
2378    "csksc56011987",
2379    "unicode20utf8",
2380    "unicode11utf8",
2381    "ks_c_5601-1987",
2382    "ansi_x3.4-1968",
2383    "ks_c_5601-1989",
2384    "x-mac-cyrillic",
2385    "x-user-defined",
2386    "csiso58gb231280",
2387    "iso-10646-ucs-2",
2388    "iso_8859-1:1987",
2389    "iso_8859-2:1987",
2390    "iso_8859-6:1987",
2391    "iso_8859-7:1987",
2392    "iso_8859-3:1988",
2393    "iso_8859-4:1988",
2394    "iso_8859-5:1988",
2395    "iso_8859-8:1988",
2396    "x-unicode20utf8",
2397    "iso_8859-9:1989",
2398    "csisolatingreek",
2399    "x-mac-ukrainian",
2400    "iso-2022-cn-ext",
2401    "csisolatinarabic",
2402    "csisolatinhebrew",
2403    "unicode-1-1-utf-8",
2404    "csisolatincyrillic",
2405    "cseucpkdfmtjapanese",
2406];
2407
// The encoding for each label in `LABELS_SORTED`, in the same order: both
// arrays have 228 entries and are paired by index, so an entry here must stay
// at the same position as its label there. Several labels can share one
// encoding, which is why the same `_INIT` item appears many times.
//
// The elements are addresses of the `_INIT` items rather than the
// reference-typed `static`s; the `_INIT` documentation in this file explains
// that Rust does not allow the reference-typed form in such an initializer.
//
// NOTE(review): the lookup that indexes this table is outside this excerpt;
// presumably `Encoding::for_label` -- confirm before changing the layout.
// This is generated data (see the "END GENERATED CODE" marker below).
2408static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; 228] = [
2409    &WINDOWS_1252_INIT,
2410    &ISO_8859_2_INIT,
2411    &ISO_8859_3_INIT,
2412    &ISO_8859_4_INIT,
2413    &WINDOWS_1254_INIT,
2414    &ISO_8859_10_INIT,
2415    &ISO_8859_15_INIT,
2416    &IBM866_INIT,
2417    &MACINTOSH_INIT,
2418    &KOI8_R_INIT,
2419    &GBK_INIT,
2420    &BIG5_INIT,
2421    &UTF_8_INIT,
2422    &KOI8_R_INIT,
2423    &SHIFT_JIS_INIT,
2424    &UTF_16LE_INIT,
2425    &SHIFT_JIS_INIT,
2426    &IBM866_INIT,
2427    &UTF_8_INIT,
2428    &WINDOWS_1252_INIT,
2429    &WINDOWS_1252_INIT,
2430    &GBK_INIT,
2431    &ISO_8859_7_INIT,
2432    &WINDOWS_1250_INIT,
2433    &WINDOWS_1251_INIT,
2434    &WINDOWS_1252_INIT,
2435    &GBK_INIT,
2436    &WINDOWS_1252_INIT,
2437    &ISO_8859_2_INIT,
2438    &WINDOWS_1253_INIT,
2439    &ISO_8859_3_INIT,
2440    &WINDOWS_1254_INIT,
2441    &ISO_8859_4_INIT,
2442    &WINDOWS_1255_INIT,
2443    &BIG5_INIT,
2444    &WINDOWS_1254_INIT,
2445    &UTF_16LE_INIT,
2446    &WINDOWS_1256_INIT,
2447    &IBM866_INIT,
2448    &ISO_8859_10_INIT,
2449    &WINDOWS_1257_INIT,
2450    &WINDOWS_1258_INIT,
2451    &ISO_8859_7_INIT,
2452    &WINDOWS_1252_INIT,
2453    &ISO_8859_6_INIT,
2454    &ISO_8859_8_INIT,
2455    &EUC_KR_INIT,
2456    &EUC_JP_INIT,
2457    &KOI8_R_INIT,
2458    &KOI8_R_INIT,
2459    &EUC_KR_INIT,
2460    &SHIFT_JIS_INIT,
2461    &KOI8_U_INIT,
2462    &ISO_8859_8_INIT,
2463    &WINDOWS_874_INIT,
2464    &GB18030_INIT,
2465    &EUC_KR_INIT,
2466    &GBK_INIT,
2467    &WINDOWS_874_INIT,
2468    &BIG5_INIT,
2469    &UTF_16LE_INIT,
2470    &GBK_INIT,
2471    &ISO_8859_8_I_INIT,
2472    &KOI8_R_INIT,
2473    &EUC_KR_INIT,
2474    &KOI8_U_INIT,
2475    &WINDOWS_1250_INIT,
2476    &EUC_KR_INIT,
2477    &WINDOWS_1251_INIT,
2478    &WINDOWS_1252_INIT,
2479    &GBK_INIT,
2480    &WINDOWS_1252_INIT,
2481    &ISO_8859_2_INIT,
2482    &WINDOWS_1253_INIT,
2483    &ISO_8859_3_INIT,
2484    &ISO_8859_6_INIT,
2485    &WINDOWS_1254_INIT,
2486    &ISO_8859_4_INIT,
2487    &WINDOWS_1255_INIT,
2488    &ISO_8859_5_INIT,
2489    &BIG5_INIT,
2490    &WINDOWS_1256_INIT,
2491    &IBM866_INIT,
2492    &ISO_8859_6_INIT,
2493    &WINDOWS_1257_INIT,
2494    &ISO_8859_7_INIT,
2495    &ISO_8859_6_INIT,
2496    &ISO_8859_7_INIT,
2497    &ISO_8859_7_INIT,
2498    &WINDOWS_1258_INIT,
2499    &ISO_8859_8_INIT,
2500    &WINDOWS_1254_INIT,
2501    &ISO_8859_5_INIT,
2502    &UTF_16BE_INIT,
2503    &UTF_16LE_INIT,
2504    &WINDOWS_1252_INIT,
2505    &SHIFT_JIS_INIT,
2506    &EUC_JP_INIT,
2507    &ISO_8859_10_INIT,
2508    &WINDOWS_1252_INIT,
2509    &WINDOWS_874_INIT,
2510    &ISO_8859_2_INIT,
2511    &ISO_8859_3_INIT,
2512    &ISO_8859_13_INIT,
2513    &ISO_8859_4_INIT,
2514    &ISO_8859_14_INIT,
2515    &ISO_8859_5_INIT,
2516    &ISO_8859_15_INIT,
2517    &ISO_8859_6_INIT,
2518    &ISO_8859_7_INIT,
2519    &ISO_8859_8_INIT,
2520    &GBK_INIT,
2521    &WINDOWS_1254_INIT,
2522    &UTF_16LE_INIT,
2523    &MACINTOSH_INIT,
2524    &SHIFT_JIS_INIT,
2525    &SHIFT_JIS_INIT,
2526    &WINDOWS_1252_INIT,
2527    &ISO_8859_10_INIT,
2528    &ISO_8859_4_INIT,
2529    &GBK_INIT,
2530    &WINDOWS_1252_INIT,
2531    &WINDOWS_1252_INIT,
2532    &ISO_8859_2_INIT,
2533    &WINDOWS_874_INIT,
2534    &ISO_8859_2_INIT,
2535    &ISO_8859_2_INIT,
2536    &REPLACEMENT_INIT,
2537    &ISO_8859_3_INIT,
2538    &ISO_8859_3_INIT,
2539    &ISO_8859_13_INIT,
2540    &ISO_8859_4_INIT,
2541    &ISO_8859_4_INIT,
2542    &ISO_8859_14_INIT,
2543    &ISO_8859_5_INIT,
2544    &ISO_8859_5_INIT,
2545    &ISO_8859_5_INIT,
2546    &ISO_8859_15_INIT,
2547    &ISO_8859_6_INIT,
2548    &ISO_8859_6_INIT,
2549    &ISO_8859_7_INIT,
2550    &ISO_8859_7_INIT,
2551    &ISO_8859_7_INIT,
2552    &ISO_8859_6_INIT,
2553    &ISO_8859_10_INIT,
2554    &ISO_8859_8_INIT,
2555    &ISO_8859_8_INIT,
2556    &ISO_8859_8_INIT,
2557    &WINDOWS_1254_INIT,
2558    &WINDOWS_1254_INIT,
2559    &WINDOWS_1254_INIT,
2560    &ISO_8859_3_INIT,
2561    &EUC_KR_INIT,
2562    &BIG5_INIT,
2563    &SHIFT_JIS_INIT,
2564    &ISO_8859_10_INIT,
2565    &WINDOWS_874_INIT,
2566    &WINDOWS_1252_INIT,
2567    &ISO_8859_2_INIT,
2568    &ISO_8859_13_INIT,
2569    &ISO_8859_3_INIT,
2570    &ISO_8859_14_INIT,
2571    &WINDOWS_874_INIT,
2572    &ISO_8859_4_INIT,
2573    &ISO_8859_15_INIT,
2574    &ISO_8859_15_INIT,
2575    &WINDOWS_1254_INIT,
2576    &ISO_8859_16_INIT,
2577    &ISO_8859_10_INIT,
2578    &EUC_KR_INIT,
2579    &ISO_8859_15_INIT,
2580    &ISO_8859_6_INIT,
2581    &ISO_8859_8_INIT,
2582    &UTF_16BE_INIT,
2583    &UTF_16LE_INIT,
2584    &MACINTOSH_INIT,
2585    &ISO_8859_6_INIT,
2586    &ISO_8859_8_I_INIT,
2587    &SHIFT_JIS_INIT,
2588    &MACINTOSH_INIT,
2589    &REPLACEMENT_INIT,
2590    &ISO_2022_JP_INIT,
2591    &ISO_2022_JP_INIT,
2592    &REPLACEMENT_INIT,
2593    &REPLACEMENT_INIT,
2594    &REPLACEMENT_INIT,
2595    &WINDOWS_1250_INIT,
2596    &WINDOWS_1251_INIT,
2597    &WINDOWS_1252_INIT,
2598    &WINDOWS_1253_INIT,
2599    &WINDOWS_1254_INIT,
2600    &WINDOWS_1255_INIT,
2601    &WINDOWS_1256_INIT,
2602    &WINDOWS_1257_INIT,
2603    &WINDOWS_1258_INIT,
2604    &ISO_8859_6_INIT,
2605    &ISO_8859_8_INIT,
2606    &ISO_8859_6_INIT,
2607    &ISO_8859_8_I_INIT,
2608    &ISO_8859_7_INIT,
2609    &EUC_KR_INIT,
2610    &UTF_8_INIT,
2611    &UTF_8_INIT,
2612    &EUC_KR_INIT,
2613    &WINDOWS_1252_INIT,
2614    &EUC_KR_INIT,
2615    &X_MAC_CYRILLIC_INIT,
2616    &X_USER_DEFINED_INIT,
2617    &GBK_INIT,
2618    &UTF_16LE_INIT,
2619    &WINDOWS_1252_INIT,
2620    &ISO_8859_2_INIT,
2621    &ISO_8859_6_INIT,
2622    &ISO_8859_7_INIT,
2623    &ISO_8859_3_INIT,
2624    &ISO_8859_4_INIT,
2625    &ISO_8859_5_INIT,
2626    &ISO_8859_8_INIT,
2627    &UTF_8_INIT,
2628    &WINDOWS_1254_INIT,
2629    &ISO_8859_7_INIT,
2630    &X_MAC_CYRILLIC_INIT,
2631    &REPLACEMENT_INIT,
2632    &ISO_8859_6_INIT,
2633    &ISO_8859_8_INIT,
2634    &UTF_8_INIT,
2635    &ISO_8859_5_INIT,
2636    &EUC_JP_INIT,
2637];
2638
2639// END GENERATED CODE
2640
2641/// An encoding as defined in the [Encoding Standard][1].
2642///
2643/// An _encoding_ defines a mapping from a `u8` sequence to a `char` sequence
2644/// and, in most cases, vice versa. Each encoding has a name, an output
2645/// encoding, and one or more labels.
2646///
2647/// _Labels_ are ASCII-case-insensitive strings that are used to identify an
2648/// encoding in formats and protocols. The _name_ of the encoding is the
2649/// preferred label in the case appropriate for returning from the
2650/// [`characterSet`][2] property of the `Document` DOM interface.
2651///
2652/// The _output encoding_ is the encoding used for form submission and URL
2653/// parsing on Web pages in the encoding. This is UTF-8 for the replacement,
2654/// UTF-16LE and UTF-16BE encodings and the encoding itself for other
2655/// encodings.
2656///
2657/// [1]: https://encoding.spec.whatwg.org/
2658/// [2]: https://dom.spec.whatwg.org/#dom-document-characterset
2659///
2660/// # Streaming vs. Non-Streaming
2661///
2662/// When you have the entire input in a single buffer, you can use the
2663/// methods [`decode()`][3], [`decode_with_bom_removal()`][4],
2664/// [`decode_without_bom_handling()`][5],
2665/// [`decode_without_bom_handling_and_without_replacement()`][6] and
2666/// [`encode()`][7]. (These methods are available to Rust callers only and are
2667/// not available in the C API.) Unlike the rest of the API available to Rust,
2668/// these methods perform heap allocations. You should use the `Decoder` and
2669/// `Encoder` objects when your input is split into multiple buffers or when
2670/// you want to control the allocation of the output buffers.
2671///
2672/// [3]: #method.decode
2673/// [4]: #method.decode_with_bom_removal
2674/// [5]: #method.decode_without_bom_handling
2675/// [6]: #method.decode_without_bom_handling_and_without_replacement
2676/// [7]: #method.encode
2677///
2678/// # Instances
2679///
2680/// All instances of `Encoding` are statically allocated and have the `'static`
2681/// lifetime. There is precisely one unique `Encoding` instance for each
2682/// encoding defined in the Encoding Standard.
2683///
2684/// To obtain a reference to a particular encoding whose identity you know at
2685/// compile time, use a `static` that refers to the encoding. There is a `static`
2686/// for each encoding. The `static`s are named in all caps with hyphens
2687/// replaced with underscores (and in C/C++ have `_ENCODING` appended to the
2688/// name). For example, if you know at compile time that you will want to
2689/// decode using the UTF-8 encoding, use the `UTF_8` `static` (`UTF_8_ENCODING`
2690/// in C/C++).
2691///
2692/// Additionally, there are non-reference-typed forms ending with `_INIT` to
2693/// work around the problem that `static`s of the type `&'static Encoding`
2694/// cannot be used to initialize items of an array whose type is
2695/// `[&'static Encoding; N]`.
2696///
2697/// If you don't know what encoding you need at compile time and need to
2698/// dynamically get an encoding by label, use
2699/// <code>Encoding::<a href="#method.for_label">for_label</a>(<var>label</var>)</code>.
2700///
2701/// Instances of `Encoding` can be compared with `==` (in both Rust and in
2702/// C/C++).
2703pub struct Encoding {
    // The name of the encoding, e.g. "windows-1251" in `WINDOWS_1251_INIT`.
    // Private, so values are only built by the initializers in this file.
2704    name: &'static str,
    // Selects the kind of encoding and carries its data; the `_INIT` items in
    // this file use `VariantEncoding::SingleByte(..)` and
    // `VariantEncoding::UserDefined`. `VariantEncoding` itself is declared
    // outside this excerpt.
2705    variant: VariantEncoding,
2706}
2707
2708impl Encoding {
2709    /// Implements the
2710    /// [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get)
2711    /// algorithm.
2712    ///
2713    /// If, after ASCII-lowercasing and removing leading and trailing
2714    /// whitespace, the argument matches a label defined in the Encoding
2715    /// Standard, `Some(&'static Encoding)` representing the corresponding
2716    /// encoding is returned. If there is no match, `None` is returned.
2717    ///
2718    /// This is the right method to use if the action upon the method returning
2719    /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`) instead.
2720    /// When the action upon the method returning `None` is not to proceed with
2721    /// a fallback but to refuse processing, `for_label_no_replacement()` is more
2722    /// appropriate.
2723    ///
2724    /// The argument is of type `&[u8]` instead of `&str` to save callers
2725    /// that are extracting the label from a non-UTF-8 protocol the trouble
2726    /// of conversion to UTF-8. (If you have a `&str`, just call `.as_bytes()`
2727    /// on it.)
2728    ///
2729    /// Available via the C wrapper.
2730    ///
2731    /// # Example
2732    /// ```
2733    /// use encoding_rs::Encoding;
2734    ///
2735    /// assert_eq!(Some(encoding_rs::UTF_8), Encoding::for_label(b"utf-8"));
2736    /// assert_eq!(Some(encoding_rs::UTF_8), Encoding::for_label(b"unicode11utf8"));
2737    ///
2738    /// assert_eq!(Some(encoding_rs::ISO_8859_2), Encoding::for_label(b"latin2"));
2739    ///
2740    /// assert_eq!(Some(encoding_rs::UTF_16BE), Encoding::for_label(b"utf-16be"));
2741    ///
2742    /// assert_eq!(None, Encoding::for_label(b"unrecognized label"));
2743    /// ```
2744    pub fn for_label(label: &[u8]) -> Option<&'static Encoding> {
2745        let mut trimmed = [0u8; LONGEST_LABEL_LENGTH];
2746        let mut trimmed_pos = 0usize;
2747        let mut iter = label.into_iter();
2748        // before
2749        loop {
2750            match iter.next() {
2751                None => {
2752                    return None;
2753                }
2754                Some(byte) => {
2755                    // The characters used in labels are:
2756                    // a-z (except q, but excluding it below seems excessive)
2757                    // 0-9
2758                    // . _ - :
2759                    match *byte {
2760                        0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2761                            continue;
2762                        }
2763                        b'A'..=b'Z' => {
2764                            trimmed[trimmed_pos] = *byte + 0x20u8;
2765                            trimmed_pos = 1usize;
2766                            break;
2767                        }
2768                        b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => {
2769                            trimmed[trimmed_pos] = *byte;
2770                            trimmed_pos = 1usize;
2771                            break;
2772                        }
2773                        _ => {
2774                            return None;
2775                        }
2776                    }
2777                }
2778            }
2779        }
2780        // inside
2781        loop {
2782            match iter.next() {
2783                None => {
2784                    break;
2785                }
2786                Some(byte) => {
2787                    match *byte {
2788                        0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2789                            break;
2790                        }
2791                        b'A'..=b'Z' => {
2792                            if trimmed_pos == LONGEST_LABEL_LENGTH {
2793                                // There's no encoding with a label this long
2794                                return None;
2795                            }
2796                            trimmed[trimmed_pos] = *byte + 0x20u8;
2797                            trimmed_pos += 1usize;
2798                            continue;
2799                        }
2800                        b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => {
2801                            if trimmed_pos == LONGEST_LABEL_LENGTH {
2802                                // There's no encoding with a label this long
2803                                return None;
2804                            }
2805                            trimmed[trimmed_pos] = *byte;
2806                            trimmed_pos += 1usize;
2807                            continue;
2808                        }
2809                        _ => {
2810                            return None;
2811                        }
2812                    }
2813                }
2814            }
2815        }
2816        // after
2817        loop {
2818            match iter.next() {
2819                None => {
2820                    break;
2821                }
2822                Some(byte) => {
2823                    match *byte {
2824                        0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2825                            continue;
2826                        }
2827                        _ => {
2828                            // There's no label with space in the middle
2829                            return None;
2830                        }
2831                    }
2832                }
2833            }
2834        }
2835        let candidate = &trimmed[..trimmed_pos];
2836        match LABELS_SORTED.binary_search_by(|probe| {
2837            let bytes = probe.as_bytes();
2838            let c = bytes.len().cmp(&candidate.len());
2839            if c != Ordering::Equal {
2840                return c;
2841            }
2842            let probe_iter = bytes.iter().rev();
2843            let candidate_iter = candidate.iter().rev();
2844            probe_iter.cmp(candidate_iter)
2845        }) {
2846            Ok(i) => Some(ENCODINGS_IN_LABEL_SORT[i]),
2847            Err(_) => None,
2848        }
2849    }
2850
2851    /// This method behaves the same as `for_label()`, except when `for_label()`
2852    /// would return `Some(REPLACEMENT)`, this method returns `None` instead.
2853    ///
2854    /// This method is useful in scenarios where a fatal error is required
2855    /// upon invalid label, because in those cases the caller typically wishes
2856    /// to treat the labels that map to the replacement encoding as fatal
2857    /// errors, too.
2858    ///
2859    /// It is not OK to use this method when the action upon the method returning
2860    /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`). In such a
2861    /// case, the `for_label()` method should be used instead in order to avoid
2862    /// unsafe fallback for labels that `for_label()` maps to `Some(REPLACEMENT)`.
2863    ///
2864    /// Available via the C wrapper.
2865    #[inline]
2866    pub fn for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding> {
2867        match Encoding::for_label(label) {
2868            None => None,
2869            Some(encoding) => {
2870                if encoding == REPLACEMENT {
2871                    None
2872                } else {
2873                    Some(encoding)
2874                }
2875            }
2876        }
2877    }
2878
2879    /// Performs non-incremental BOM sniffing.
2880    ///
2881    /// The argument must either be a buffer representing the entire input
2882    /// stream (non-streaming case) or a buffer representing at least the first
2883    /// three bytes of the input stream (streaming case).
2884    ///
2885    /// Returns `Some((UTF_8, 3))`, `Some((UTF_16LE, 2))` or
2886    /// `Some((UTF_16BE, 2))` if the argument starts with the UTF-8, UTF-16LE
2887    /// or UTF-16BE BOM or `None` otherwise.
2888    ///
2889    /// Available via the C wrapper.
2890    #[inline]
2891    pub fn for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)> {
2892        if buffer.starts_with(b"\xEF\xBB\xBF") {
2893            Some((UTF_8, 3))
2894        } else if buffer.starts_with(b"\xFF\xFE") {
2895            Some((UTF_16LE, 2))
2896        } else if buffer.starts_with(b"\xFE\xFF") {
2897            Some((UTF_16BE, 2))
2898        } else {
2899            None
2900        }
2901    }
2902
2903    /// Returns the name of this encoding.
2904    ///
2905    /// This name is appropriate to return as-is from the DOM
2906    /// `document.characterSet` property.
2907    ///
2908    /// Available via the C wrapper.
2909    #[inline]
2910    pub fn name(&'static self) -> &'static str {
2911        self.name
2912    }
2913
2914    /// Checks whether the _output encoding_ of this encoding can encode every
2915    /// `char`. (Only true if the output encoding is UTF-8.)
2916    ///
2917    /// Available via the C wrapper.
2918    #[inline]
2919    pub fn can_encode_everything(&'static self) -> bool {
2920        self.output_encoding() == UTF_8
2921    }
2922
2923    /// Checks whether the bytes 0x00...0x7F map exclusively to the characters
2924    /// U+0000...U+007F and vice versa.
2925    ///
2926    /// Available via the C wrapper.
2927    #[inline]
2928    pub fn is_ascii_compatible(&'static self) -> bool {
2929        !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE || self == ISO_2022_JP)
2930    }
2931
2932    /// Checks whether this encoding maps one byte to one Basic Multilingual
2933    /// Plane code point (i.e. byte length equals decoded UTF-16 length) and
2934    /// vice versa (for mappable characters).
2935    ///
2936    /// `true` iff this encoding is on the list of [Legacy single-byte
2937    /// encodings](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
2938    /// in the spec or x-user-defined.
2939    ///
2940    /// Available via the C wrapper.
2941    #[inline]
2942    pub fn is_single_byte(&'static self) -> bool {
2943        self.variant.is_single_byte()
2944    }
2945
2946    /// Checks whether the bytes 0x00...0x7F map mostly to the characters
2947    /// U+0000...U+007F and vice versa.
2948    #[cfg(feature = "alloc")]
2949    #[inline]
2950    fn is_potentially_borrowable(&'static self) -> bool {
2951        !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE)
2952    }
2953
2954    /// Returns the _output encoding_ of this encoding. This is UTF-8 for
2955    /// UTF-16BE, UTF-16LE, and replacement and the encoding itself otherwise.
2956    ///
2957    /// _Note:_ The _output encoding_ concept is needed for form submission and
2958    /// error handling in the query strings of URLs in the Web Platform.
2959    ///
2960    /// Available via the C wrapper.
2961    #[inline]
2962    pub fn output_encoding(&'static self) -> &'static Encoding {
2963        if self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE {
2964            UTF_8
2965        } else {
2966            self
2967        }
2968    }
2969
2970    /// Decode complete input to `Cow<'a, str>` _with BOM sniffing_ and with
2971    /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2972    /// entire input is available as a single buffer (i.e. the end of the
2973    /// buffer marks the end of the stream).
2974    ///
2975    /// The BOM, if any, does not appear in the output.
2976    ///
2977    /// This method implements the (non-streaming version of) the
2978    /// [_decode_](https://encoding.spec.whatwg.org/#decode) spec concept.
2979    ///
2980    /// The second item in the returned tuple is the encoding that was actually
2981    /// used (which may differ from this encoding thanks to BOM sniffing).
2982    ///
2983    /// The third item in the returned tuple indicates whether there were
2984    /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2985    ///
2986    /// _Note:_ It is wrong to use this when the input buffer represents only
2987    /// a segment of the input instead of the whole input. Use `new_decoder()`
2988    /// when decoding segmented input.
2989    ///
2990    /// This method performs a one or two heap allocations for the backing
2991    /// buffer of the `String` when unable to borrow. (One allocation if not
2992    /// errors and potentially another one in the presence of errors.) The
2993    /// first allocation assumes jemalloc and may not be optimal with
2994    /// allocators that do not use power-of-two buckets. A borrow is performed
2995    /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2996    /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2997    /// ISO-2022-JP and the input is entirely in the ASCII state without state
2998    /// transitions.
2999    ///
3000    /// # Panics
3001    ///
3002    /// If the size calculation for a heap-allocated backing buffer overflows
3003    /// `usize`.
3004    ///
3005    /// Available to Rust only and only with the `alloc` feature enabled (enabled
3006    /// by default).
3007    #[cfg(feature = "alloc")]
3008    #[inline]
3009    pub fn decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool) {
3010        let (encoding, without_bom) = match Encoding::for_bom(bytes) {
3011            Some((encoding, bom_length)) => (encoding, &bytes[bom_length..]),
3012            None => (self, bytes),
3013        };
3014        let (cow, had_errors) = encoding.decode_without_bom_handling(without_bom);
3015        (cow, encoding, had_errors)
3016    }
3017
3018    /// Decode complete input to `Cow<'a, str>` _with BOM removal_ and with
3019    /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
3020    /// entire input is available as a single buffer (i.e. the end of the
3021    /// buffer marks the end of the stream).
3022    ///
3023    /// Only an initial byte sequence that is a BOM for this encoding is removed.
3024    ///
3025    /// When invoked on `UTF_8`, this method implements the (non-streaming
3026    /// version of) the
3027    /// [_UTF-8 decode_](https://encoding.spec.whatwg.org/#utf-8-decode) spec
3028    /// concept.
3029    ///
3030    /// The second item in the returned pair indicates whether there were
3031    /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
3032    ///
3033    /// _Note:_ It is wrong to use this when the input buffer represents only
3034    /// a segment of the input instead of the whole input. Use
3035    /// `new_decoder_with_bom_removal()` when decoding segmented input.
3036    ///
3037    /// This method performs a one or two heap allocations for the backing
3038    /// buffer of the `String` when unable to borrow. (One allocation if not
3039    /// errors and potentially another one in the presence of errors.) The
3040    /// first allocation assumes jemalloc and may not be optimal with
3041    /// allocators that do not use power-of-two buckets. A borrow is performed
3042    /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
3043    /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3044    /// ISO-2022-JP and the input is entirely in the ASCII state without state
3045    /// transitions.
3046    ///
3047    /// # Panics
3048    ///
3049    /// If the size calculation for a heap-allocated backing buffer overflows
3050    /// `usize`.
3051    ///
3052    /// Available to Rust only and only with the `alloc` feature enabled (enabled
3053    /// by default).
3054    #[cfg(feature = "alloc")]
3055    #[inline]
3056    pub fn decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
3057        let without_bom = if self == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
3058            &bytes[3..]
3059        } else if (self == UTF_16LE && bytes.starts_with(b"\xFF\xFE"))
3060            || (self == UTF_16BE && bytes.starts_with(b"\xFE\xFF"))
3061        {
3062            &bytes[2..]
3063        } else {
3064            bytes
3065        };
3066        self.decode_without_bom_handling(without_bom)
3067    }
3068
3069    /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
3070    /// with malformed sequences replaced with the REPLACEMENT CHARACTER when
3071    /// the entire input is available as a single buffer (i.e. the end of the
3072    /// buffer marks the end of the stream).
3073    ///
3074    /// When invoked on `UTF_8`, this method implements the (non-streaming
3075    /// version of) the
3076    /// [_UTF-8 decode without BOM_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom)
3077    /// spec concept.
3078    ///
3079    /// The second item in the returned pair indicates whether there were
3080    /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
3081    ///
3082    /// _Note:_ It is wrong to use this when the input buffer represents only
3083    /// a segment of the input instead of the whole input. Use
3084    /// `new_decoder_without_bom_handling()` when decoding segmented input.
3085    ///
3086    /// This method performs a one or two heap allocations for the backing
3087    /// buffer of the `String` when unable to borrow. (One allocation if not
3088    /// errors and potentially another one in the presence of errors.) The
3089    /// first allocation assumes jemalloc and may not be optimal with
3090    /// allocators that do not use power-of-two buckets. A borrow is performed
3091    /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
3092    /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3093    /// ISO-2022-JP and the input is entirely in the ASCII state without state
3094    /// transitions.
3095    ///
3096    /// # Panics
3097    ///
3098    /// If the size calculation for a heap-allocated backing buffer overflows
3099    /// `usize`.
3100    ///
3101    /// Available to Rust only and only with the `alloc` feature enabled (enabled
3102    /// by default).
3103    #[cfg(feature = "alloc")]
3104    pub fn decode_without_bom_handling<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
3105        let (mut decoder, mut string, mut total_read) = if self.is_potentially_borrowable() {
3106            let valid_up_to = if self == UTF_8 {
3107                utf8_valid_up_to(bytes)
3108            } else if self == ISO_2022_JP {
3109                iso_2022_jp_ascii_valid_up_to(bytes)
3110            } else {
3111                ascii_valid_up_to(bytes)
3112            };
3113            if valid_up_to == bytes.len() {
3114                let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
3115                return (Cow::Borrowed(str), false);
3116            }
3117            let decoder = self.new_decoder_without_bom_handling();
3118
3119            let rounded_without_replacement = checked_next_power_of_two(checked_add(
3120                valid_up_to,
3121                decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3122            ));
3123            let with_replacement = checked_add(
3124                valid_up_to,
3125                decoder.max_utf8_buffer_length(bytes.len() - valid_up_to),
3126            );
3127            let mut string = String::with_capacity(
3128                checked_min(rounded_without_replacement, with_replacement).unwrap(),
3129            );
3130            unsafe {
3131                let vec = string.as_mut_vec();
3132                vec.set_len(valid_up_to);
3133                core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3134            }
3135            (decoder, string, valid_up_to)
3136        } else {
3137            let decoder = self.new_decoder_without_bom_handling();
3138            let rounded_without_replacement = checked_next_power_of_two(
3139                decoder.max_utf8_buffer_length_without_replacement(bytes.len()),
3140            );
3141            let with_replacement = decoder.max_utf8_buffer_length(bytes.len());
3142            let string = String::with_capacity(
3143                checked_min(rounded_without_replacement, with_replacement).unwrap(),
3144            );
3145            (decoder, string, 0)
3146        };
3147
3148        let mut total_had_errors = false;
3149        loop {
3150            let (result, read, had_errors) =
3151                decoder.decode_to_string(&bytes[total_read..], &mut string, true);
3152            total_read += read;
3153            total_had_errors |= had_errors;
3154            match result {
3155                CoderResult::InputEmpty => {
3156                    debug_assert_eq!(total_read, bytes.len());
3157                    return (Cow::Owned(string), total_had_errors);
3158                }
3159                CoderResult::OutputFull => {
3160                    // Allocate for the worst case. That is, we should come
3161                    // here at most once per invocation of this method.
3162                    let needed = decoder.max_utf8_buffer_length(bytes.len() - total_read);
3163                    string.reserve(needed.unwrap());
3164                }
3165            }
3166        }
3167    }
3168
3169    /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
3170    /// _with malformed sequences treated as fatal_ when the entire input is
3171    /// available as a single buffer (i.e. the end of the buffer marks the end
3172    /// of the stream).
3173    ///
3174    /// When invoked on `UTF_8`, this method implements the (non-streaming
3175    /// version of) the
3176    /// [_UTF-8 decode without BOM or fail_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
3177    /// spec concept.
3178    ///
3179    /// Returns `None` if a malformed sequence was encountered and the result
3180    /// of the decode as `Some(String)` otherwise.
3181    ///
3182    /// _Note:_ It is wrong to use this when the input buffer represents only
3183    /// a segment of the input instead of the whole input. Use
3184    /// `new_decoder_without_bom_handling()` when decoding segmented input.
3185    ///
3186    /// This method performs a single heap allocation for the backing
3187    /// buffer of the `String` when unable to borrow. A borrow is performed if
3188    /// decoding UTF-8 and the input is valid UTF-8, if decoding an
3189    /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3190    /// ISO-2022-JP and the input is entirely in the ASCII state without state
3191    /// transitions.
3192    ///
3193    /// # Panics
3194    ///
3195    /// If the size calculation for a heap-allocated backing buffer overflows
3196    /// `usize`.
3197    ///
3198    /// Available to Rust only and only with the `alloc` feature enabled (enabled
3199    /// by default).
3200    #[cfg(feature = "alloc")]
3201    pub fn decode_without_bom_handling_and_without_replacement<'a>(
3202        &'static self,
3203        bytes: &'a [u8],
3204    ) -> Option<Cow<'a, str>> {
3205        if self == UTF_8 {
3206            let valid_up_to = utf8_valid_up_to(bytes);
3207            if valid_up_to == bytes.len() {
3208                let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
3209                return Some(Cow::Borrowed(str));
3210            }
3211            return None;
3212        }
3213        let (mut decoder, mut string, input) = if self.is_potentially_borrowable() {
3214            let valid_up_to = if self == ISO_2022_JP {
3215                iso_2022_jp_ascii_valid_up_to(bytes)
3216            } else {
3217                ascii_valid_up_to(bytes)
3218            };
3219            if valid_up_to == bytes.len() {
3220                let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
3221                return Some(Cow::Borrowed(str));
3222            }
3223            let decoder = self.new_decoder_without_bom_handling();
3224            let mut string = String::with_capacity(
3225                checked_add(
3226                    valid_up_to,
3227                    decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3228                )
3229                .unwrap(),
3230            );
3231            unsafe {
3232                let vec = string.as_mut_vec();
3233                vec.set_len(valid_up_to);
3234                core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3235            }
3236            (decoder, string, &bytes[valid_up_to..])
3237        } else {
3238            let decoder = self.new_decoder_without_bom_handling();
3239            let string = String::with_capacity(
3240                decoder
3241                    .max_utf8_buffer_length_without_replacement(bytes.len())
3242                    .unwrap(),
3243            );
3244            (decoder, string, bytes)
3245        };
3246        let (result, read) = decoder.decode_to_string_without_replacement(input, &mut string, true);
3247        match result {
3248            DecoderResult::InputEmpty => {
3249                debug_assert_eq!(read, input.len());
3250                Some(Cow::Owned(string))
3251            }
3252            DecoderResult::Malformed(_, _) => None,
3253            DecoderResult::OutputFull => unreachable!(),
3254        }
3255    }
3256
3257    /// Encode complete input to `Cow<'a, [u8]>` using the
3258    /// [_output encoding_](Encoding::output_encoding) of this encoding with
3259    /// unmappable characters replaced with decimal numeric character references
3260    /// when the entire input is available as a single buffer (i.e. the end of
3261    /// the buffer marks the end of the stream).
3262    ///
3263    /// This method implements the (non-streaming version of) the
3264    /// [_encode_](https://encoding.spec.whatwg.org/#encode) spec concept. For
3265    /// the [_UTF-8 encode_](https://encoding.spec.whatwg.org/#utf-8-encode)
3266    /// spec concept, it is slightly more efficient to use
3267    /// <code><var>string</var>.as_bytes()</code> instead of invoking this
3268    /// method on `UTF_8`.
3269    ///
3270    /// The second item in the returned tuple is the encoding that was actually
3271    /// used (*which may differ from this encoding thanks to some encodings
3272    /// having UTF-8 as their output encoding*).
3273    ///
3274    /// The third item in the returned tuple indicates whether there were
3275    /// unmappable characters (that were replaced with HTML numeric character
3276    /// references).
3277    ///
3278    /// _Note:_ It is wrong to use this when the input buffer represents only
3279    /// a segment of the input instead of the whole input. Use `new_encoder()`
3280    /// when encoding segmented output.
3281    ///
3282    /// When encoding to UTF-8 or when encoding an ASCII-only input to a
3283    /// ASCII-compatible encoding, this method returns a borrow of the input
3284    /// without a heap allocation. Otherwise, this method performs a single
3285    /// heap allocation for the backing buffer of the `Vec<u8>` if there are no
3286    /// unmappable characters and potentially multiple heap allocations if
3287    /// there are. These allocations are tuned for jemalloc and may not be
3288    /// optimal when using a different allocator that doesn't use power-of-two
3289    /// buckets.
3290    ///
3291    /// # Panics
3292    ///
3293    /// If the size calculation for a heap-allocated backing buffer overflows
3294    /// `usize`.
3295    ///
3296    /// Available to Rust only and only with the `alloc` feature enabled (enabled
3297    /// by default).
3298    #[cfg(feature = "alloc")]
3299    pub fn encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool) {
3300        let output_encoding = self.output_encoding();
3301        if output_encoding == UTF_8 {
3302            return (Cow::Borrowed(string.as_bytes()), output_encoding, false);
3303        }
3304        debug_assert!(output_encoding.is_potentially_borrowable());
3305        let bytes = string.as_bytes();
3306        let valid_up_to = if output_encoding == ISO_2022_JP {
3307            iso_2022_jp_ascii_valid_up_to(bytes)
3308        } else {
3309            ascii_valid_up_to(bytes)
3310        };
3311        if valid_up_to == bytes.len() {
3312            return (Cow::Borrowed(bytes), output_encoding, false);
3313        }
3314        let mut encoder = output_encoding.new_encoder();
3315        let mut vec: Vec<u8> = Vec::with_capacity(
3316            (checked_add(
3317                valid_up_to,
3318                encoder.max_buffer_length_from_utf8_if_no_unmappables(string.len() - valid_up_to),
3319            ))
3320            .unwrap()
3321            .next_power_of_two(),
3322        );
3323        unsafe {
3324            vec.set_len(valid_up_to);
3325            core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3326        }
3327        let mut total_read = valid_up_to;
3328        let mut total_had_errors = false;
3329        loop {
3330            let (result, read, had_errors) =
3331                encoder.encode_from_utf8_to_vec(&string[total_read..], &mut vec, true);
3332            total_read += read;
3333            total_had_errors |= had_errors;
3334            match result {
3335                CoderResult::InputEmpty => {
3336                    debug_assert_eq!(total_read, string.len());
3337                    return (Cow::Owned(vec), output_encoding, total_had_errors);
3338                }
3339                CoderResult::OutputFull => {
3340                    // reserve_exact wants to know how much more on top of current
3341                    // length--not current capacity.
3342                    let needed = encoder
3343                        .max_buffer_length_from_utf8_if_no_unmappables(string.len() - total_read);
3344                    let rounded = (checked_add(vec.capacity(), needed))
3345                        .unwrap()
3346                        .next_power_of_two();
3347                    let additional = rounded - vec.len();
3348                    vec.reserve_exact(additional);
3349                }
3350            }
3351        }
3352    }
3353
3354    fn new_variant_decoder(&'static self) -> VariantDecoder {
3355        self.variant.new_variant_decoder()
3356    }
3357
3358    /// Instantiates a new decoder for this encoding with BOM sniffing enabled.
3359    ///
3360    /// BOM sniffing may cause the returned decoder to morph into a decoder
3361    /// for UTF-8, UTF-16LE or UTF-16BE instead of this encoding. The BOM
3362    /// does not appear in the output.
3363    ///
3364    /// Available via the C wrapper.
3365    #[inline]
3366    pub fn new_decoder(&'static self) -> Decoder {
3367        Decoder::new(self, self.new_variant_decoder(), BomHandling::Sniff)
3368    }
3369
3370    /// Instantiates a new decoder for this encoding with BOM removal.
3371    ///
3372    /// If the input starts with bytes that are the BOM for this encoding,
3373    /// those bytes are removed. However, the decoder never morphs into a
3374    /// decoder for another encoding: A BOM for another encoding is treated as
3375    /// (potentially malformed) input to the decoding algorithm for this
3376    /// encoding.
3377    ///
3378    /// Available via the C wrapper.
3379    #[inline]
3380    pub fn new_decoder_with_bom_removal(&'static self) -> Decoder {
3381        Decoder::new(self, self.new_variant_decoder(), BomHandling::Remove)
3382    }
3383
3384    /// Instantiates a new decoder for this encoding with BOM handling disabled.
3385    ///
3386    /// If the input starts with bytes that look like a BOM, those bytes are
3387    /// not treated as a BOM. (Hence, the decoder never morphs into a decoder
3388    /// for another encoding.)
3389    ///
3390    /// _Note:_ If the caller has performed BOM sniffing on its own but has not
3391    /// removed the BOM, the caller should use `new_decoder_with_bom_removal()`
3392    /// instead of this method to cause the BOM to be removed.
3393    ///
3394    /// Available via the C wrapper.
3395    #[inline]
3396    pub fn new_decoder_without_bom_handling(&'static self) -> Decoder {
3397        Decoder::new(self, self.new_variant_decoder(), BomHandling::Off)
3398    }
3399
3400    /// Instantiates a new encoder for the [_output encoding_](Encoding::output_encoding)
3401    /// of this encoding.
3402    ///
3403    /// _Note:_ The output encoding of UTF-16BE, UTF-16LE, and replacement is UTF-8. There
3404    /// is no encoder for UTF-16BE, UTF-16LE, and replacement themselves.
3405    ///
3406    /// Available via the C wrapper.
3407    #[inline]
3408    pub fn new_encoder(&'static self) -> Encoder {
3409        let enc = self.output_encoding();
3410        enc.variant.new_encoder(enc)
3411    }
3412
3413    /// Validates UTF-8.
3414    ///
3415    /// Returns the index of the first byte that makes the input malformed as
3416    /// UTF-8 or the length of the slice if the slice is entirely valid.
3417    ///
3418    /// This is currently faster than the corresponding standard library
3419    /// functionality. If this implementation gets upstreamed to the standard
3420    /// library, this method may be removed in the future.
3421    ///
3422    /// Available via the C wrapper.
3423    pub fn utf8_valid_up_to(bytes: &[u8]) -> usize {
3424        utf8_valid_up_to(bytes)
3425    }
3426
3427    /// Validates ASCII.
3428    ///
3429    /// Returns the index of the first byte that makes the input malformed as
3430    /// ASCII or the length of the slice if the slice is entirely valid.
3431    ///
3432    /// Available via the C wrapper.
3433    pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
3434        ascii_valid_up_to(bytes)
3435    }
3436
    /// Validates ISO-2022-JP ASCII-state data.
    ///
    /// Returns the index of the first byte that makes the input not
    /// representable in the ASCII state of ISO-2022-JP or the length of the
    /// slice if the slice is entirely representable in the ASCII state of
    /// ISO-2022-JP.
    ///
    /// Available via the C wrapper.
    pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
        // The unqualified call resolves to the free function of the same name
        // that is in scope in this module, not to this associated function
        // (which would have to be spelled `Self::iso_2022_jp_ascii_valid_up_to`).
        iso_2022_jp_ascii_valid_up_to(bytes)
    }
3448}
3449
3450impl PartialEq for Encoding {
3451    #[inline]
3452    fn eq(&self, other: &Encoding) -> bool {
3453        (self as *const Encoding) == (other as *const Encoding)
3454    }
3455}
3456
// The `PartialEq` impl for `Encoding` compares addresses, which is reflexive,
// symmetric and transitive, so the marker trait `Eq` can be implemented.
impl Eq for Encoding {}
3458
// Test-only ordering: encodings are ordered by the numeric address of the
// `Encoding` object.
#[cfg(test)]
impl PartialOrd for Encoding {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        let lhs = self as *const Encoding as usize;
        let rhs = other as *const Encoding as usize;
        lhs.partial_cmp(&rhs)
    }
}
3465
// Test-only total ordering by the numeric address of the `Encoding` object.
#[cfg(test)]
impl Ord for Encoding {
    fn cmp(&self, other: &Self) -> Ordering {
        let lhs = self as *const Encoding as usize;
        let rhs = other as *const Encoding as usize;
        lhs.cmp(&rhs)
    }
}
3472
3473impl Hash for Encoding {
3474    #[inline]
3475    fn hash<H: Hasher>(&self, state: &mut H) {
3476        (self as *const Encoding).hash(state);
3477    }
3478}
3479
3480impl core::fmt::Debug for Encoding {
3481    #[inline]
3482    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
3483        write!(f, "Encoding {{ {} }}", self.name)
3484    }
3485}
3486
// Serializes an encoding as a string holding its `name` field.
#[cfg(feature = "serde")]
impl Serialize for Encoding {
    #[inline]
    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        let name = self.name;
        serializer.serialize_str(name)
    }
}
3497
/// Serde visitor that produces a `&'static Encoding` from a string by looking
/// the string up with `Encoding::for_label`.
#[cfg(feature = "serde")]
struct EncodingVisitor;
3500
#[cfg(feature = "serde")]
impl<'de> Visitor<'de> for EncodingVisitor {
    type Value = &'static Encoding;

    /// Describes the expected input for serde's error messages.
    fn expecting(&self, formatter: &mut core::fmt::Formatter) -> core::fmt::Result {
        formatter.write_str("a valid encoding label")
    }

    /// Resolves `value` as an encoding label; an unknown label becomes a
    /// custom deserialization error that quotes the offending label.
    fn visit_str<E: serde::de::Error>(self, value: &str) -> Result<&'static Encoding, E> {
        Encoding::for_label(value.as_bytes())
            .ok_or_else(|| E::custom(alloc::format!("invalid encoding label: {}", value)))
    }
}
3523
// Deserializes a string and maps it to an encoding via `EncodingVisitor`.
#[cfg(feature = "serde")]
impl<'de> Deserialize<'de> for &'static Encoding {
    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<&'static Encoding, D::Error> {
        let visitor = EncodingVisitor;
        deserializer.deserialize_str(visitor)
    }
}
3533
/// Tracks the life cycle of a decoder from BOM sniffing to conversion to end.
///
/// `Decoder::new` picks the initial state from the requested `BomHandling`:
/// `Converting` when BOM handling is off, `AtStart` when sniffing, and for
/// BOM removal one of `AtUtf8Start` / `AtUtf16BeStart` / `AtUtf16LeStart`
/// when the encoding is UTF-8 / UTF-16BE / UTF-16LE (`Converting` otherwise).
#[derive(PartialEq, Debug, Copy, Clone)]
enum DecoderLifeCycle {
    /// The decoder has seen no input yet.
    AtStart,
    /// The decoder has seen no input yet but expects UTF-8.
    AtUtf8Start,
    /// The decoder has seen no input yet but expects UTF-16BE.
    AtUtf16BeStart,
    /// The decoder has seen no input yet but expects UTF-16LE.
    AtUtf16LeStart,
    /// The decoder has seen EF.
    SeenUtf8First,
    /// The decoder has seen EF, BB.
    SeenUtf8Second,
    /// The decoder has seen FE.
    SeenUtf16BeFirst,
    /// The decoder has seen FF.
    SeenUtf16LeFirst,
    /// Saw EF, BB but not BF, there was a buffer boundary after BB and the
    /// underlying decoder reported EF as an error, so we need to remember to
    /// push BB before the next buffer.
    ConvertingWithPendingBB,
    /// No longer looking for a BOM and EOF not yet seen.
    Converting,
    /// EOF has been seen.
    Finished,
}
3562
/// Communicates the BOM handling mode to `Decoder::new`, which uses it to
/// choose the decoder's initial `DecoderLifeCycle` state.
#[derive(Debug, Copy, Clone)]
enum BomHandling {
    /// Don't handle the BOM.
    Off,
    /// Sniff for a UTF-8, UTF-16BE or UTF-16LE BOM.
    Sniff,
    /// Remove the BOM only if it's the BOM for this encoding.
    Remove,
}
3573
/// Result of a (potentially partial) decode or encode operation with
/// replacement.
///
/// This is the status type returned by, for example,
/// `Decoder::decode_to_utf8`.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum CoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// conversion process has completed. Otherwise, the caller should call a
    /// decode or encode method again with more input.
    InputEmpty,

    /// The converter cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the converter.
    OutputFull,
}
3593
/// Result of a (potentially partial) decode operation without replacement.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum DecoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// decoding process has completed. Otherwise, the caller should call a
    /// decode method again with more input.
    InputEmpty,

    /// The decoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the decoder.
    OutputFull,

    /// The decoder encountered a malformed byte sequence.
    ///
    /// The caller must either treat this as a fatal error or must append one
    /// REPLACEMENT CHARACTER (U+FFFD) to the output and then re-push the
    /// remaining input to the decoder.
    ///
    /// The first wrapped integer indicates the length of the malformed byte
    /// sequence. The second wrapped integer indicates the number of bytes
    /// that were consumed after the malformed sequence. If the second
    /// integer is zero, the last byte that was consumed is the last byte of
    /// the malformed sequence. Note that the malformed bytes may have been part
    /// of an earlier input buffer.
    ///
    /// The first wrapped integer can have values 1, 2, 3 or 4. The second
    /// wrapped integer can have values 0, 1, 2 or 3. The worst-case sum
    /// of the two is 6, which happens with ISO-2022-JP.
    Malformed(u8, u8), // u8 instead of usize to avoid useless bloat
}
3630
/// A converter that decodes a byte stream into Unicode according to a
/// character encoding in a streaming (incremental) manner.
///
/// The various `decode_*` methods take an input buffer (`src`) and an output
/// buffer `dst` both of which are caller-allocated. There are variants for
/// both UTF-8 and UTF-16 output buffers.
///
/// A `decode_*` method decodes bytes from `src` into Unicode characters stored
/// into `dst` until one of the following three things happens:
///
/// 1. A malformed byte sequence is encountered (`*_without_replacement`
///    variants only).
///
/// 2. The output buffer has been filled so near capacity that the decoder
///    cannot be sure that processing an additional byte of input wouldn't
///    cause so much output that the output buffer would overflow.
///
/// 3. All the input bytes have been processed.
///
/// The `decode_*` method then returns a tuple of a status indicating which one
/// of the three reasons to return happened, how many input bytes were read,
/// how many output code units (`u8` when decoding into UTF-8 and `u16`
/// when decoding to UTF-16) were written (except when decoding into `String`,
/// whose length change indicates this), and in the case of the
/// variants performing replacement, a boolean indicating whether an error was
/// replaced with the REPLACEMENT CHARACTER during the call.
///
/// The number of bytes "written" is what's logically written. Garbage may be
/// written in the output buffer beyond the point logically written to.
/// Therefore, if you wish to decode into an `&mut str`, you should use the
/// methods that take an `&mut str` argument instead of the ones that take an
/// `&mut [u8]` argument. The former take care of overwriting the trailing
/// garbage to ensure the UTF-8 validity of the `&mut str` as a whole, but the
/// latter don't.
///
/// In the case of the `*_without_replacement` variants, the status is a
/// [`DecoderResult`][1] enumeration (possibilities `Malformed`, `OutputFull` and
/// `InputEmpty` corresponding to the three cases listed above).
///
/// In the case of methods whose name does not end with
/// `*_without_replacement`, malformed sequences are automatically replaced
/// with the REPLACEMENT CHARACTER and errors do not cause the methods to
/// return early.
///
/// When decoding to UTF-8, the output buffer must have at least 4 bytes of
/// space. When decoding to UTF-16, the output buffer must have at least two
/// UTF-16 code units (`u16`) of space.
///
/// When decoding to UTF-8 without replacement, the methods are guaranteed
/// not to return indicating that more output space is needed if the length
/// of the output buffer is at least the length returned by
/// [`max_utf8_buffer_length_without_replacement()`][2]. When decoding to UTF-8
/// with replacement, the length of the output buffer that guarantees the
/// methods not to return indicating that more output space is needed is given
/// by [`max_utf8_buffer_length()`][3]. When decoding to UTF-16 with
/// or without replacement, the length of the output buffer that guarantees
/// the methods not to return indicating that more output space is needed is
/// given by [`max_utf16_buffer_length()`][4].
///
/// The output written into `dst` is guaranteed to be valid UTF-8 or UTF-16,
/// and the output after each `decode_*` call is guaranteed to consist of
/// complete characters. (I.e. the code unit sequence for the last character is
/// guaranteed not to be split across output buffers.)
///
/// The boolean argument `last` indicates that the end of the stream is reached
/// when all the bytes in `src` have been consumed.
///
/// A `Decoder` object can be used to incrementally decode a byte stream.
///
/// During the processing of a single stream, the caller must call `decode_*`
/// zero or more times with `last` set to `false` and then call `decode_*` at
/// least once with `last` set to `true`. If `decode_*` returns `InputEmpty`,
/// the processing of the stream has ended. Otherwise, the caller must call
/// `decode_*` again with `last` set to `true` (or treat a `Malformed` result as
/// a fatal error).
///
/// Once the stream has ended, the `Decoder` object must not be used anymore.
/// That is, you need to create another one to process another stream.
///
/// When the decoder returns `OutputFull` or the decoder returns `Malformed` and
/// the caller does not wish to treat it as a fatal error, the input buffer
/// `src` may not have been completely consumed. In that case, the caller must
/// pass the unconsumed contents of `src` to `decode_*` again upon the next
/// call.
///
/// [1]: enum.DecoderResult.html
/// [2]: #method.max_utf8_buffer_length_without_replacement
/// [3]: #method.max_utf8_buffer_length
/// [4]: #method.max_utf16_buffer_length
///
/// # Infinite loops
///
/// When converting with a fixed-size output buffer whose size is too small to
/// accommodate one character or (when applicable) one numeric character
/// reference of output, an infinite loop ensues. When converting with a
/// fixed-size output buffer, it generally makes sense to make the buffer
/// fairly large (e.g. a couple of kilobytes).
pub struct Decoder {
    // The encoding reported by `encoding()`.
    encoding: &'static Encoding,
    // The encoding-specific decoder that the buffer-length queries and the
    // actual decoding are delegated to.
    variant: VariantDecoder,
    // Where the decoder currently is between BOM handling, conversion and
    // end of stream.
    life_cycle: DecoderLifeCycle,
}
3733
3734impl Decoder {
3735    fn new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder {
3736        Decoder {
3737            encoding: enc,
3738            variant: decoder,
3739            life_cycle: match sniffing {
3740                BomHandling::Off => DecoderLifeCycle::Converting,
3741                BomHandling::Sniff => DecoderLifeCycle::AtStart,
3742                BomHandling::Remove => {
3743                    if enc == UTF_8 {
3744                        DecoderLifeCycle::AtUtf8Start
3745                    } else if enc == UTF_16BE {
3746                        DecoderLifeCycle::AtUtf16BeStart
3747                    } else if enc == UTF_16LE {
3748                        DecoderLifeCycle::AtUtf16LeStart
3749                    } else {
3750                        DecoderLifeCycle::Converting
3751                    }
3752                }
3753            },
3754        }
3755    }
3756
    /// The `Encoding` this `Decoder` is for.
    ///
    /// BOM sniffing can change the return value of this method during the life
    /// of the decoder.
    ///
    /// Available via the C wrapper.
    #[inline]
    pub fn encoding(&self) -> &'static Encoding {
        // Plain read of the stored reference; no computation happens here.
        self.encoding
    }
3767
3768    /// Query the worst-case UTF-8 output size _with replacement_.
3769    ///
3770    /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3771    /// that will not overflow given the current state of the decoder and
3772    /// `byte_length` number of additional input bytes when decoding with
3773    /// errors handled by outputting a REPLACEMENT CHARACTER for each malformed
3774    /// sequence or `None` if `usize` would overflow.
3775    ///
3776    /// Available via the C wrapper.
3777    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
3778        // Need to consider a) the decoder morphing due to the BOM and b) a partial
3779        // BOM getting pushed to the underlying decoder.
3780        match self.life_cycle {
3781            DecoderLifeCycle::Converting
3782            | DecoderLifeCycle::AtUtf8Start
3783            | DecoderLifeCycle::AtUtf16LeStart
3784            | DecoderLifeCycle::AtUtf16BeStart => {
3785                return self.variant.max_utf8_buffer_length(byte_length);
3786            }
3787            DecoderLifeCycle::AtStart => {
3788                if let Some(utf8_bom) = checked_add(3, byte_length.checked_mul(3)) {
3789                    if let Some(utf16_bom) = checked_add(
3790                        1,
3791                        checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3792                    ) {
3793                        let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
3794                        let encoding = self.encoding();
3795                        if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3796                            // No need to consider the internal state of the underlying decoder,
3797                            // because it is at start, because no data has reached it yet.
3798                            return Some(utf_bom);
3799                        } else if let Some(non_bom) =
3800                            self.variant.max_utf8_buffer_length(byte_length)
3801                        {
3802                            return Some(core::cmp::max(utf_bom, non_bom));
3803                        }
3804                    }
3805                }
3806            }
3807            DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3808                // Add two bytes even when only one byte has been seen,
3809                // because the one byte can become a lead byte in multibyte
3810                // decoders, but only after the decoder has been queried
3811                // for max length, so the decoder's own logic for adding
3812                // one for a pending lead cannot work.
3813                if let Some(sum) = byte_length.checked_add(2) {
3814                    if let Some(utf8_bom) = checked_add(3, sum.checked_mul(3)) {
3815                        if self.encoding() == UTF_8 {
3816                            // No need to consider the internal state of the underlying decoder,
3817                            // because it is at start, because no data has reached it yet.
3818                            return Some(utf8_bom);
3819                        } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3820                            return Some(core::cmp::max(utf8_bom, non_bom));
3821                        }
3822                    }
3823                }
3824            }
3825            DecoderLifeCycle::ConvertingWithPendingBB => {
3826                if let Some(sum) = byte_length.checked_add(2) {
3827                    return self.variant.max_utf8_buffer_length(sum);
3828                }
3829            }
3830            DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3831                // Add two bytes even when only one byte has been seen,
3832                // because the one byte can become a lead byte in multibyte
3833                // decoders, but only after the decoder has been queried
3834                // for max length, so the decoder's own logic for adding
3835                // one for a pending lead cannot work.
3836                if let Some(sum) = byte_length.checked_add(2) {
3837                    if let Some(utf16_bom) =
3838                        checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3839                    {
3840                        let encoding = self.encoding();
3841                        if encoding == UTF_16LE || encoding == UTF_16BE {
3842                            // No need to consider the internal state of the underlying decoder,
3843                            // because it is at start, because no data has reached it yet.
3844                            return Some(utf16_bom);
3845                        } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3846                            return Some(core::cmp::max(utf16_bom, non_bom));
3847                        }
3848                    }
3849                }
3850            }
3851            DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3852        }
3853        None
3854    }
3855
3856    /// Query the worst-case UTF-8 output size _without replacement_.
3857    ///
3858    /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3859    /// that will not overflow given the current state of the decoder and
3860    /// `byte_length` number of additional input bytes when decoding without
3861    /// replacement error handling or `None` if `usize` would overflow.
3862    ///
3863    /// Note that this value may be too small for the `_with_replacement` case.
3864    /// Use `max_utf8_buffer_length()` for that case.
3865    ///
3866    /// Available via the C wrapper.
3867    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
3868        // Need to consider a) the decoder morphing due to the BOM and b) a partial
3869        // BOM getting pushed to the underlying decoder.
3870        match self.life_cycle {
3871            DecoderLifeCycle::Converting
3872            | DecoderLifeCycle::AtUtf8Start
3873            | DecoderLifeCycle::AtUtf16LeStart
3874            | DecoderLifeCycle::AtUtf16BeStart => {
3875                return self
3876                    .variant
3877                    .max_utf8_buffer_length_without_replacement(byte_length);
3878            }
3879            DecoderLifeCycle::AtStart => {
3880                if let Some(utf8_bom) = byte_length.checked_add(3) {
3881                    if let Some(utf16_bom) = checked_add(
3882                        1,
3883                        checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3884                    ) {
3885                        let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
3886                        let encoding = self.encoding();
3887                        if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3888                            // No need to consider the internal state of the underlying decoder,
3889                            // because it is at start, because no data has reached it yet.
3890                            return Some(utf_bom);
3891                        } else if let Some(non_bom) = self
3892                            .variant
3893                            .max_utf8_buffer_length_without_replacement(byte_length)
3894                        {
3895                            return Some(core::cmp::max(utf_bom, non_bom));
3896                        }
3897                    }
3898                }
3899            }
3900            DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3901                // Add two bytes even when only one byte has been seen,
3902                // because the one byte can become a lead byte in multibyte
3903                // decoders, but only after the decoder has been queried
3904                // for max length, so the decoder's own logic for adding
3905                // one for a pending lead cannot work.
3906                if let Some(sum) = byte_length.checked_add(2) {
3907                    if let Some(utf8_bom) = sum.checked_add(3) {
3908                        if self.encoding() == UTF_8 {
3909                            // No need to consider the internal state of the underlying decoder,
3910                            // because it is at start, because no data has reached it yet.
3911                            return Some(utf8_bom);
3912                        } else if let Some(non_bom) =
3913                            self.variant.max_utf8_buffer_length_without_replacement(sum)
3914                        {
3915                            return Some(core::cmp::max(utf8_bom, non_bom));
3916                        }
3917                    }
3918                }
3919            }
3920            DecoderLifeCycle::ConvertingWithPendingBB => {
3921                if let Some(sum) = byte_length.checked_add(2) {
3922                    return self.variant.max_utf8_buffer_length_without_replacement(sum);
3923                }
3924            }
3925            DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3926                // Add two bytes even when only one byte has been seen,
3927                // because the one byte can become a lead byte in multibyte
3928                // decoders, but only after the decoder has been queried
3929                // for max length, so the decoder's own logic for adding
3930                // one for a pending lead cannot work.
3931                if let Some(sum) = byte_length.checked_add(2) {
3932                    if let Some(utf16_bom) =
3933                        checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3934                    {
3935                        let encoding = self.encoding();
3936                        if encoding == UTF_16LE || encoding == UTF_16BE {
3937                            // No need to consider the internal state of the underlying decoder,
3938                            // because it is at start, because no data has reached it yet.
3939                            return Some(utf16_bom);
3940                        } else if let Some(non_bom) =
3941                            self.variant.max_utf8_buffer_length_without_replacement(sum)
3942                        {
3943                            return Some(core::cmp::max(utf16_bom, non_bom));
3944                        }
3945                    }
3946                }
3947            }
3948            DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3949        }
3950        None
3951    }
3952
3953    /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3954    /// replaced with the REPLACEMENT CHARACTER.
3955    ///
3956    /// See the documentation of the struct for documentation for `decode_*`
3957    /// methods collectively.
3958    ///
3959    /// Available via the C wrapper.
3960    pub fn decode_to_utf8(
3961        &mut self,
3962        src: &[u8],
3963        dst: &mut [u8],
3964        last: bool,
3965    ) -> (CoderResult, usize, usize, bool) {
3966        let mut had_errors = false;
3967        let mut total_read = 0usize;
3968        let mut total_written = 0usize;
3969        loop {
3970            let (result, read, written) = self.decode_to_utf8_without_replacement(
3971                &src[total_read..],
3972                &mut dst[total_written..],
3973                last,
3974            );
3975            total_read += read;
3976            total_written += written;
3977            match result {
3978                DecoderResult::InputEmpty => {
3979                    return (
3980                        CoderResult::InputEmpty,
3981                        total_read,
3982                        total_written,
3983                        had_errors,
3984                    );
3985                }
3986                DecoderResult::OutputFull => {
3987                    return (
3988                        CoderResult::OutputFull,
3989                        total_read,
3990                        total_written,
3991                        had_errors,
3992                    );
3993                }
3994                DecoderResult::Malformed(_, _) => {
3995                    had_errors = true;
3996                    // There should always be space for the U+FFFD, because
3997                    // otherwise we'd have gotten OutputFull already.
3998                    // XXX: is the above comment actually true for UTF-8 itself?
3999                    // TODO: Consider having fewer bound checks here.
4000                    dst[total_written] = 0xEFu8;
4001                    total_written += 1;
4002                    dst[total_written] = 0xBFu8;
4003                    total_written += 1;
4004                    dst[total_written] = 0xBDu8;
4005                    total_written += 1;
4006                }
4007            }
4008        }
4009    }
4010
4011    /// Incrementally decode a byte stream into UTF-8 with malformed sequences
4012    /// replaced with the REPLACEMENT CHARACTER with type system signaling
4013    /// of UTF-8 validity.
4014    ///
4015    /// This methods calls `decode_to_utf8` and then zeroes
4016    /// out up to three bytes that aren't logically part of the write in order
4017    /// to retain the UTF-8 validity even for the unwritten part of the buffer.
4018    ///
4019    /// See the documentation of the struct for documentation for `decode_*`
4020    /// methods collectively.
4021    ///
4022    /// Available to Rust only.
4023    pub fn decode_to_str(
4024        &mut self,
4025        src: &[u8],
4026        dst: &mut str,
4027        last: bool,
4028    ) -> (CoderResult, usize, usize, bool) {
4029        let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
4030        let (result, read, written, replaced) = self.decode_to_utf8(src, bytes, last);
4031        let len = bytes.len();
4032        let mut trail = written;
4033        // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
4034        // bytes of trailing garbage. No need to optimize non-ASCII-compatible
4035        // encodings to avoid overwriting here.
4036        if self.encoding != UTF_8 {
4037            let max = core::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
4038            while trail < max {
4039                bytes[trail] = 0;
4040                trail += 1;
4041            }
4042        }
4043        while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
4044            bytes[trail] = 0;
4045            trail += 1;
4046        }
4047        (result, read, written, replaced)
4048    }
4049
4050    /// Incrementally decode a byte stream into UTF-8 with malformed sequences
4051    /// replaced with the REPLACEMENT CHARACTER using a `String` receiver.
4052    ///
4053    /// Like the others, this method follows the logic that the output buffer is
4054    /// caller-allocated. This method treats the capacity of the `String` as
4055    /// the output limit. That is, this method guarantees not to cause a
4056    /// reallocation of the backing buffer of `String`.
4057    ///
4058    /// The return value is a tuple that contains the `DecoderResult`, the
4059    /// number of bytes read and a boolean indicating whether replacements
4060    /// were done. The number of bytes written is signaled via the length of
4061    /// the `String` changing.
4062    ///
4063    /// See the documentation of the struct for documentation for `decode_*`
4064    /// methods collectively.
4065    ///
4066    /// Available to Rust only and only with the `alloc` feature enabled (enabled
4067    /// by default).
4068    #[cfg(feature = "alloc")]
4069    pub fn decode_to_string(
4070        &mut self,
4071        src: &[u8],
4072        dst: &mut String,
4073        last: bool,
4074    ) -> (CoderResult, usize, bool) {
4075        unsafe {
4076            let vec = dst.as_mut_vec();
4077            let old_len = vec.len();
4078            let capacity = vec.capacity();
4079            vec.set_len(capacity);
4080            let (result, read, written, replaced) =
4081                self.decode_to_utf8(src, &mut vec[old_len..], last);
4082            vec.set_len(old_len + written);
4083            (result, read, replaced)
4084        }
4085    }
4086
    // NOTE(review): `public_decode_function!` is defined outside this part of
    // the file. Presumably the doc comment passed as the first argument ends
    // up on a generated public method `decode_to_utf8_without_replacement`
    // (which `decode_to_utf8` calls), the following identifiers name helper
    // methods, and `u8` is the output code unit type. Confirm against the
    // macro definition.
    public_decode_function!(/// Incrementally decode a byte stream into UTF-8
                            /// _without replacement_.
                            ///
                            /// See the documentation of the struct for
                            /// documentation for `decode_*` methods
                            /// collectively.
                            ///
                            /// Available via the C wrapper.
                            ,
                            decode_to_utf8_without_replacement,
                            decode_to_utf8_raw,
                            decode_to_utf8_checking_end,
                            decode_to_utf8_after_one_potential_bom_byte,
                            decode_to_utf8_after_two_potential_bom_bytes,
                            decode_to_utf8_checking_end_with_offset,
                            u8);
4103
4104    /// Incrementally decode a byte stream into UTF-8 with type system signaling
4105    /// of UTF-8 validity.
4106    ///
4107    /// This methods calls `decode_to_utf8` and then zeroes out up to three
4108    /// bytes that aren't logically part of the write in order to retain the
4109    /// UTF-8 validity even for the unwritten part of the buffer.
4110    ///
4111    /// See the documentation of the struct for documentation for `decode_*`
4112    /// methods collectively.
4113    ///
4114    /// Available to Rust only.
4115    pub fn decode_to_str_without_replacement(
4116        &mut self,
4117        src: &[u8],
4118        dst: &mut str,
4119        last: bool,
4120    ) -> (DecoderResult, usize, usize) {
4121        let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
4122        let (result, read, written) = self.decode_to_utf8_without_replacement(src, bytes, last);
4123        let len = bytes.len();
4124        let mut trail = written;
4125        // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
4126        // bytes of trailing garbage. No need to optimize non-ASCII-compatible
4127        // encodings to avoid overwriting here.
4128        if self.encoding != UTF_8 {
4129            let max = core::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
4130            while trail < max {
4131                bytes[trail] = 0;
4132                trail += 1;
4133            }
4134        }
4135        while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
4136            bytes[trail] = 0;
4137            trail += 1;
4138        }
4139        (result, read, written)
4140    }
4141
/// Incrementally decode a byte stream into UTF-8 using a `String` receiver.
///
/// Like the others, this method follows the logic that the output buffer is
/// caller-allocated. This method treats the capacity of the `String` as
/// the output limit. That is, this method guarantees not to cause a
/// reallocation of the backing buffer of `String`.
///
/// The return value is a pair that contains the `DecoderResult` and the
/// number of bytes read. The number of bytes written is signaled via
/// the length of the `String` changing.
///
/// The length of the `String` is only extended after the decoder has
/// returned, so the `String` never covers bytes the decoder has not
/// produced, even if the decoder panics.
///
/// See the documentation of the struct for documentation for `decode_*`
/// methods collectively.
///
/// Available to Rust only and only with the `alloc` feature enabled (enabled
/// by default).
#[cfg(feature = "alloc")]
pub fn decode_to_string_without_replacement(
    &mut self,
    src: &[u8],
    dst: &mut String,
    last: bool,
) -> (DecoderResult, usize) {
    // SAFETY:
    // * `old_len <= capacity`, so the pointer offset and the slice length
    //   stay inside the allocation owned by the `String`'s buffer, and
    //   nothing else borrows that spare region while `spare` is alive.
    // * The length is bumped by exactly `written`, i.e. only over bytes the
    //   decoder reported as written. This relies on
    //   `decode_to_utf8_without_replacement` emitting valid UTF-8 and never
    //   reporting more than `spare.len()` bytes.
    // NOTE(review): as before this change, the decoder is handed a
    // `&mut [u8]` over spare capacity that may be uninitialized; it is
    // expected to only write to it.
    unsafe {
        let vec = dst.as_mut_vec();
        let old_len = vec.len();
        let spare_len = vec.capacity() - old_len;
        let spare = core::slice::from_raw_parts_mut(vec.as_mut_ptr().add(old_len), spare_len);
        let (result, read, written) =
            self.decode_to_utf8_without_replacement(src, spare, last);
        vec.set_len(old_len + written);
        (result, read)
    }
}
4176
4177    /// Query the worst-case UTF-16 output size (with or without replacement).
4178    ///
4179    /// Returns the size of the output buffer in UTF-16 code units (`u16`)
4180    /// that will not overflow given the current state of the decoder and
4181    /// `byte_length` number of additional input bytes or `None` if `usize`
4182    /// would overflow.
4183    ///
4184    /// Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
4185    /// return value of this method applies also in the
4186    /// `_without_replacement` case.
4187    ///
4188    /// Available via the C wrapper.
4189    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
4190        // Need to consider a) the decoder morphing due to the BOM and b) a partial
4191        // BOM getting pushed to the underlying decoder.
4192        match self.life_cycle {
4193            DecoderLifeCycle::Converting
4194            | DecoderLifeCycle::AtUtf8Start
4195            | DecoderLifeCycle::AtUtf16LeStart
4196            | DecoderLifeCycle::AtUtf16BeStart => {
4197                return self.variant.max_utf16_buffer_length(byte_length);
4198            }
4199            DecoderLifeCycle::AtStart => {
4200                if let Some(utf8_bom) = byte_length.checked_add(1) {
4201                    if let Some(utf16_bom) =
4202                        checked_add(1, checked_div(byte_length.checked_add(1), 2))
4203                    {
4204                        let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
4205                        let encoding = self.encoding();
4206                        if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
4207                            // No need to consider the internal state of the underlying decoder,
4208                            // because it is at start, because no data has reached it yet.
4209                            return Some(utf_bom);
4210                        } else if let Some(non_bom) =
4211                            self.variant.max_utf16_buffer_length(byte_length)
4212                        {
4213                            return Some(core::cmp::max(utf_bom, non_bom));
4214                        }
4215                    }
4216                }
4217            }
4218            DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
4219                // Add two bytes even when only one byte has been seen,
4220                // because the one byte can become a lead byte in multibyte
4221                // decoders, but only after the decoder has been queried
4222                // for max length, so the decoder's own logic for adding
4223                // one for a pending lead cannot work.
4224                if let Some(sum) = byte_length.checked_add(2) {
4225                    if let Some(utf8_bom) = sum.checked_add(1) {
4226                        if self.encoding() == UTF_8 {
4227                            // No need to consider the internal state of the underlying decoder,
4228                            // because it is at start, because no data has reached it yet.
4229                            return Some(utf8_bom);
4230                        } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4231                            return Some(core::cmp::max(utf8_bom, non_bom));
4232                        }
4233                    }
4234                }
4235            }
4236            DecoderLifeCycle::ConvertingWithPendingBB => {
4237                if let Some(sum) = byte_length.checked_add(2) {
4238                    return self.variant.max_utf16_buffer_length(sum);
4239                }
4240            }
4241            DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
4242                // Add two bytes even when only one byte has been seen,
4243                // because the one byte can become a lead byte in multibyte
4244                // decoders, but only after the decoder has been queried
4245                // for max length, so the decoder's own logic for adding
4246                // one for a pending lead cannot work.
4247                if let Some(sum) = byte_length.checked_add(2) {
4248                    if let Some(utf16_bom) = checked_add(1, checked_div(sum.checked_add(1), 2)) {
4249                        let encoding = self.encoding();
4250                        if encoding == UTF_16LE || encoding == UTF_16BE {
4251                            // No need to consider the internal state of the underlying decoder,
4252                            // because it is at start, because no data has reached it yet.
4253                            return Some(utf16_bom);
4254                        } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4255                            return Some(core::cmp::max(utf16_bom, non_bom));
4256                        }
4257                    }
4258                }
4259            }
4260            DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4261        }
4262        None
4263    }
4264
4265    /// Incrementally decode a byte stream into UTF-16 with malformed sequences
4266    /// replaced with the REPLACEMENT CHARACTER.
4267    ///
4268    /// See the documentation of the struct for documentation for `decode_*`
4269    /// methods collectively.
4270    ///
4271    /// Available via the C wrapper.
4272    pub fn decode_to_utf16(
4273        &mut self,
4274        src: &[u8],
4275        dst: &mut [u16],
4276        last: bool,
4277    ) -> (CoderResult, usize, usize, bool) {
4278        let mut had_errors = false;
4279        let mut total_read = 0usize;
4280        let mut total_written = 0usize;
4281        loop {
4282            let (result, read, written) = self.decode_to_utf16_without_replacement(
4283                &src[total_read..],
4284                &mut dst[total_written..],
4285                last,
4286            );
4287            total_read += read;
4288            total_written += written;
4289            match result {
4290                DecoderResult::InputEmpty => {
4291                    return (
4292                        CoderResult::InputEmpty,
4293                        total_read,
4294                        total_written,
4295                        had_errors,
4296                    );
4297                }
4298                DecoderResult::OutputFull => {
4299                    return (
4300                        CoderResult::OutputFull,
4301                        total_read,
4302                        total_written,
4303                        had_errors,
4304                    );
4305                }
4306                DecoderResult::Malformed(_, _) => {
4307                    had_errors = true;
4308                    // There should always be space for the U+FFFD, because
4309                    // otherwise we'd have gotten OutputFull already.
4310                    dst[total_written] = 0xFFFD;
4311                    total_written += 1;
4312                }
4313            }
4314        }
4315    }
4316
// Expands `public_decode_function!` (defined outside this part of the file)
// to generate the public `decode_to_utf16_without_replacement` method. Its
// caller `decode_to_utf16` passes `(src: &[u8], dst: &mut [u16], last: bool)`
// and destructures the result as `(DecoderResult, bytes read, code units
// written)`.
// NOTE(review): the identifiers after the method name are presumably the
// names of the helper methods the macro generates or calls for BOM handling,
// and `u16` the output code unit type -- confirm against the macro definition.
public_decode_function!(/// Incrementally decode a byte stream into UTF-16
                        /// _without replacement_.
                        ///
                        /// See the documentation of the struct for
                        /// documentation for `decode_*` methods
                        /// collectively.
                        ///
                        /// Available via the C wrapper.
                        ,
                        decode_to_utf16_without_replacement,
                        decode_to_utf16_raw,
                        decode_to_utf16_checking_end,
                        decode_to_utf16_after_one_potential_bom_byte,
                        decode_to_utf16_after_two_potential_bom_bytes,
                        decode_to_utf16_checking_end_with_offset,
                        u16);
4333
4334    /// Checks for compatibility with storing Unicode scalar values as unsigned
4335    /// bytes taking into account the state of the decoder.
4336    ///
4337    /// Returns `None` if the decoder is not in a neutral state, including waiting
4338    /// for the BOM, or if the encoding is never Latin1-byte-compatible.
4339    ///
4340    /// Otherwise returns the index of the first byte whose unsigned value doesn't
4341    /// directly correspond to the decoded Unicode scalar value, or the length
4342    /// of the input if all bytes in the input decode directly to scalar values
4343    /// corresponding to the unsigned byte values.
4344    ///
4345    /// Does not change the state of the decoder.
4346    ///
4347    /// Do not use this unless you are supporting SpiderMonkey/V8-style string
4348    /// storage optimizations.
4349    ///
4350    /// Available via the C wrapper.
4351    pub fn latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize> {
4352        match self.life_cycle {
4353            DecoderLifeCycle::Converting => {
4354                return self.variant.latin1_byte_compatible_up_to(bytes);
4355            }
4356            DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4357            _ => None,
4358        }
4359    }
4360}
4361
/// Result of a (potentially partial) encode operation without replacement.
#[derive(Debug, PartialEq, Eq)]
#[must_use]
pub enum EncoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// encoding process has completed. Otherwise, the caller should call an
    /// encode method again with more input.
    InputEmpty,

    /// The encoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the encoder.
    OutputFull,

    /// The encoder encountered an unmappable character.
    ///
    /// The caller must either treat this as a fatal error or must append
    /// a placeholder to the output and then re-push the remaining input to the
    /// encoder.
    Unmappable(char),
}

impl EncoderResult {
    /// Wraps the Basic Multilingual Plane code unit `bmp` in
    /// `EncoderResult::Unmappable`.
    ///
    /// Panics (via `unwrap`) if `bmp` is not a Unicode scalar value, i.e. if
    /// it is a surrogate.
    fn unmappable_from_bmp(bmp: u16) -> Self {
        let scalar = u32::from(bmp);
        let unmappable = ::core::char::from_u32(scalar).unwrap();
        EncoderResult::Unmappable(unmappable)
    }
}
4393
/// A converter that encodes a Unicode stream into bytes according to a
/// character encoding in a streaming (incremental) manner.
///
/// The various `encode_*` methods take an input buffer (`src`) and an output
/// buffer `dst` both of which are caller-allocated. There are variants for
/// both UTF-8 and UTF-16 input buffers.
///
/// An `encode_*` method encodes characters from `src` into bytes stored
/// into `dst` until one of the following three things happens:
///
/// 1. An unmappable character is encountered (`*_without_replacement` variants
///    only).
///
/// 2. The output buffer has been filled so near capacity that the encoder
///    cannot be sure that processing an additional character of input wouldn't
///    cause so much output that the output buffer would overflow.
///
/// 3. All the input characters have been processed.
///
/// The `encode_*` method then returns a tuple of a status indicating which one
/// of the three reasons to return happened, how many input code units (`u8`
/// when encoding from UTF-8 and `u16` when encoding from UTF-16) were read,
/// how many output bytes were written (except when encoding into `Vec<u8>`,
/// whose length change indicates this), and in the case of the variants that
/// perform replacement, a boolean indicating whether an unmappable
/// character was replaced with a numeric character reference during the call.
///
/// The number of bytes "written" is what's logically written. Garbage may be
/// written in the output buffer beyond the point logically written to.
///
/// In the case of the methods whose name ends with
/// `*_without_replacement`, the status is an [`EncoderResult`][1] enumeration
/// (possibilities `Unmappable`, `OutputFull` and `InputEmpty` corresponding to
/// the three cases listed above).
///
/// In the case of methods whose name does not end with
/// `*_without_replacement`, unmappable characters are automatically replaced
/// with the corresponding numeric character references and unmappable
/// characters do not cause the methods to return early.
///
/// When encoding from UTF-8 without replacement, the methods are guaranteed
/// not to return indicating that more output space is needed if the length
/// of the output buffer is at least the length returned by
/// [`max_buffer_length_from_utf8_without_replacement()`][2]. When encoding from
/// UTF-8 with replacement, the length of the output buffer that guarantees the
/// methods not to return indicating that more output space is needed in the
/// absence of unmappable characters is given by
/// [`max_buffer_length_from_utf8_if_no_unmappables()`][3]. When encoding from
/// UTF-16 without replacement, the methods are guaranteed not to return
/// indicating that more output space is needed if the length of the output
/// buffer is at least the length returned by
/// [`max_buffer_length_from_utf16_without_replacement()`][4]. When encoding
/// from UTF-16 with replacement, the length of the output buffer that
/// guarantees the methods not to return indicating that more output space is
/// needed in the absence of unmappable characters is given by
/// [`max_buffer_length_from_utf16_if_no_unmappables()`][5].
/// When encoding with replacement, applications are not expected to size the
/// buffer for the worst case ahead of time but to resize the buffer if there
/// are unmappable characters. This is why max length queries are only available
/// for the case where there are no unmappable characters.
///
/// When encoding from UTF-8, each `src` buffer _must_ be valid UTF-8. (When
/// calling from Rust, the type system takes care of this.) When encoding from
/// UTF-16, unpaired surrogates in the input are treated as U+FFFD REPLACEMENT
/// CHARACTERS. Therefore, in order for astral characters not to turn into a
/// pair of REPLACEMENT CHARACTERS, the caller must ensure that surrogate pairs
/// are not split across input buffer boundaries.
///
/// After an `encode_*` call returns, the output produced so far, taken as a
/// whole from the start of the stream, is guaranteed to consist of a valid
/// byte sequence in the target encoding. (I.e. the code unit sequence for a
/// character is guaranteed not to be split across output buffers. However, due
/// to the stateful nature of ISO-2022-JP, the stream needs to be considered
/// from the start for it to be valid. For other encodings, the validity holds
/// on a per-output buffer basis.)
///
/// The boolean argument `last` indicates that the end of the stream is reached
/// when all the characters in `src` have been consumed. This argument is needed
/// for ISO-2022-JP and is ignored for other encodings.
///
/// An `Encoder` object can be used to incrementally encode a Unicode stream
/// into bytes.
///
/// During the processing of a single stream, the caller must call `encode_*`
/// zero or more times with `last` set to `false` and then call `encode_*` at
/// least once with `last` set to `true`. If `encode_*` returns `InputEmpty`,
/// the processing of the stream has ended. Otherwise, the caller must call
/// `encode_*` again with `last` set to `true` (or treat an `Unmappable` result
/// as a fatal error).
///
/// Once the stream has ended, the `Encoder` object must not be used anymore.
/// That is, you need to create another one to process another stream.
///
/// When the encoder returns `OutputFull` or the encoder returns `Unmappable`
/// and the caller does not wish to treat it as a fatal error, the input buffer
/// `src` may not have been completely consumed. In that case, the caller must
/// pass the unconsumed contents of `src` to `encode_*` again upon the next
/// call.
///
/// [1]: enum.EncoderResult.html
/// [2]: #method.max_buffer_length_from_utf8_without_replacement
/// [3]: #method.max_buffer_length_from_utf8_if_no_unmappables
/// [4]: #method.max_buffer_length_from_utf16_without_replacement
/// [5]: #method.max_buffer_length_from_utf16_if_no_unmappables
///
/// # Infinite loops
///
/// When converting with a fixed-size output buffer whose size is too small to
/// accommodate one character of output, an infinite loop ensues. When
/// converting with a fixed-size output buffer, it generally makes sense to
/// make the buffer fairly large (e.g. a couple of kilobytes).
pub struct Encoder {
    /// The encoding this encoder is for; handed out by `encoding()`.
    encoding: &'static Encoding,
    /// The encoding-specific encoder that the methods of this type delegate to.
    variant: VariantEncoder,
}
4508
4509impl Encoder {
4510    fn new(enc: &'static Encoding, encoder: VariantEncoder) -> Encoder {
4511        Encoder {
4512            encoding: enc,
4513            variant: encoder,
4514        }
4515    }
4516
/// The `Encoding` this `Encoder` is for.
///
/// Returns the `&'static Encoding` reference stored in this encoder when it
/// was constructed.
#[inline]
pub fn encoding(&self) -> &'static Encoding {
    self.encoding
}
4522
/// Returns `true` if this is an ISO-2022-JP encoder that's not in the
/// ASCII state and `false` otherwise.
///
/// The answer comes from the encoding-specific encoder. The replacing
/// `encode_from_utf8` and `encode_from_utf16` methods consult it together
/// with `last` to decide whether running out of output space with no input
/// left counts as `InputEmpty` or `OutputFull`.
#[inline]
pub fn has_pending_state(&self) -> bool {
    self.variant.has_pending_state()
}
4529
4530    /// Query the worst-case output size when encoding from UTF-8 with
4531    /// replacement.
4532    ///
4533    /// Returns the size of the output buffer in bytes that will not overflow
4534    /// given the current state of the encoder and `byte_length` number of
4535    /// additional input code units if there are no unmappable characters in
4536    /// the input or `None` if `usize` would overflow.
4537    ///
4538    /// Available via the C wrapper.
4539    pub fn max_buffer_length_from_utf8_if_no_unmappables(
4540        &self,
4541        byte_length: usize,
4542    ) -> Option<usize> {
4543        checked_add(
4544            if self.encoding().can_encode_everything() {
4545                0
4546            } else {
4547                NCR_EXTRA
4548            },
4549            self.max_buffer_length_from_utf8_without_replacement(byte_length),
4550        )
4551    }
4552
/// Query the worst-case output size when encoding from UTF-8 without
/// replacement.
///
/// Returns the size of the output buffer in bytes that will not overflow
/// given the current state of the encoder and `byte_length` number of
/// additional input code units or `None` if `usize` would overflow.
///
/// The computation is delegated to the encoding-specific encoder.
///
/// Available via the C wrapper.
pub fn max_buffer_length_from_utf8_without_replacement(
    &self,
    byte_length: usize,
) -> Option<usize> {
    self.variant
        .max_buffer_length_from_utf8_without_replacement(byte_length)
}
4568
4569    /// Incrementally encode into byte stream from UTF-8 with unmappable
4570    /// characters replaced with HTML (decimal) numeric character references.
4571    ///
4572    /// See the documentation of the struct for documentation for `encode_*`
4573    /// methods collectively.
4574    ///
4575    /// Available via the C wrapper.
4576    pub fn encode_from_utf8(
4577        &mut self,
4578        src: &str,
4579        dst: &mut [u8],
4580        last: bool,
4581    ) -> (CoderResult, usize, usize, bool) {
4582        let dst_len = dst.len();
4583        let effective_dst_len = if self.encoding().can_encode_everything() {
4584            dst_len
4585        } else {
4586            if dst_len < NCR_EXTRA {
4587                if src.is_empty() && !(last && self.has_pending_state()) {
4588                    return (CoderResult::InputEmpty, 0, 0, false);
4589                }
4590                return (CoderResult::OutputFull, 0, 0, false);
4591            }
4592            dst_len - NCR_EXTRA
4593        };
4594        let mut had_unmappables = false;
4595        let mut total_read = 0usize;
4596        let mut total_written = 0usize;
4597        loop {
4598            let (result, read, written) = self.encode_from_utf8_without_replacement(
4599                &src[total_read..],
4600                &mut dst[total_written..effective_dst_len],
4601                last,
4602            );
4603            total_read += read;
4604            total_written += written;
4605            match result {
4606                EncoderResult::InputEmpty => {
4607                    return (
4608                        CoderResult::InputEmpty,
4609                        total_read,
4610                        total_written,
4611                        had_unmappables,
4612                    );
4613                }
4614                EncoderResult::OutputFull => {
4615                    return (
4616                        CoderResult::OutputFull,
4617                        total_read,
4618                        total_written,
4619                        had_unmappables,
4620                    );
4621                }
4622                EncoderResult::Unmappable(unmappable) => {
4623                    had_unmappables = true;
4624                    debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4625                    debug_assert_ne!(self.encoding(), UTF_16BE);
4626                    debug_assert_ne!(self.encoding(), UTF_16LE);
4627                    // Additionally, Iso2022JpEncoder is responsible for
4628                    // transitioning to ASCII when returning with Unmappable.
4629                    total_written += write_ncr(unmappable, &mut dst[total_written..]);
4630                    if total_written >= effective_dst_len {
4631                        if total_read == src.len() && !(last && self.has_pending_state()) {
4632                            return (
4633                                CoderResult::InputEmpty,
4634                                total_read,
4635                                total_written,
4636                                had_unmappables,
4637                            );
4638                        }
4639                        return (
4640                            CoderResult::OutputFull,
4641                            total_read,
4642                            total_written,
4643                            had_unmappables,
4644                        );
4645                    }
4646                }
4647            }
4648        }
4649    }
4650
/// Incrementally encode into byte stream from UTF-8 with unmappable
/// characters replaced with HTML (decimal) numeric character references.
///
/// The spare capacity of `dst` (between its length and its capacity) is
/// used as the output buffer; `dst` is never reallocated. The number of
/// bytes written is signaled via the length of `dst` changing. The return
/// value holds the status, the number of bytes read from `src` and whether
/// any unmappable character was replaced.
///
/// The length of `dst` is only extended after the encoder has returned, so
/// `dst` never covers bytes the encoder has not produced, even if the
/// encoder panics.
///
/// See the documentation of the struct for documentation for `encode_*`
/// methods collectively.
///
/// Available to Rust only and only with the `alloc` feature enabled (enabled
/// by default).
#[cfg(feature = "alloc")]
pub fn encode_from_utf8_to_vec(
    &mut self,
    src: &str,
    dst: &mut Vec<u8>,
    last: bool,
) -> (CoderResult, usize, bool) {
    let old_len = dst.len();
    let spare_len = dst.capacity() - old_len;
    // SAFETY:
    // * `old_len <= capacity`, so the pointer offset and the slice length
    //   stay inside the allocation owned by `dst`, and nothing else borrows
    //   that spare region while `spare` is alive.
    // * The length is bumped by exactly `written`, i.e. only over bytes
    //   `encode_from_utf8` reported as written into `spare`.
    // NOTE(review): as before this change, the encoder is handed a
    // `&mut [u8]` over spare capacity that may be uninitialized; it is
    // expected to only write to it.
    unsafe {
        let spare = core::slice::from_raw_parts_mut(dst.as_mut_ptr().add(old_len), spare_len);
        let (result, read, written, replaced) = self.encode_from_utf8(src, spare, last);
        dst.set_len(old_len + written);
        (result, read, replaced)
    }
}
4676
/// Incrementally encode into byte stream from UTF-8 _without replacement_.
///
/// Forwards `src`, `dst` and `last` unchanged to the encoding-specific
/// encoder. As consumed by the callers in this file, the returned tuple is
/// the status, the number of bytes read from `src` and the number of bytes
/// written to `dst`.
///
/// See the documentation of the struct for documentation for `encode_*`
/// methods collectively.
///
/// Available via the C wrapper.
pub fn encode_from_utf8_without_replacement(
    &mut self,
    src: &str,
    dst: &mut [u8],
    last: bool,
) -> (EncoderResult, usize, usize) {
    self.variant.encode_from_utf8_raw(src, dst, last)
}
4691
/// Incrementally encode into byte stream from UTF-8 _without replacement_.
///
/// The spare capacity of `dst` (between its length and its capacity) is
/// used as the output buffer; `dst` is never reallocated. The number of
/// bytes written is signaled via the length of `dst` changing. The return
/// value holds the status and the number of bytes read from `src`.
///
/// The length of `dst` is only extended after the encoder has returned, so
/// `dst` never covers bytes the encoder has not produced, even if the
/// encoder panics.
///
/// See the documentation of the struct for documentation for `encode_*`
/// methods collectively.
///
/// Available to Rust only and only with the `alloc` feature enabled (enabled
/// by default).
#[cfg(feature = "alloc")]
pub fn encode_from_utf8_to_vec_without_replacement(
    &mut self,
    src: &str,
    dst: &mut Vec<u8>,
    last: bool,
) -> (EncoderResult, usize) {
    let old_len = dst.len();
    let spare_len = dst.capacity() - old_len;
    // SAFETY:
    // * `old_len <= capacity`, so the pointer offset and the slice length
    //   stay inside the allocation owned by `dst`, and nothing else borrows
    //   that spare region while `spare` is alive.
    // * The length is bumped by exactly `written`, i.e. only over bytes
    //   `encode_from_utf8_without_replacement` reported as written into
    //   `spare`.
    // NOTE(review): as before this change, the encoder is handed a
    // `&mut [u8]` over spare capacity that may be uninitialized; it is
    // expected to only write to it.
    unsafe {
        let spare = core::slice::from_raw_parts_mut(dst.as_mut_ptr().add(old_len), spare_len);
        let (result, read, written) =
            self.encode_from_utf8_without_replacement(src, spare, last);
        dst.set_len(old_len + written);
        (result, read)
    }
}
4716
4717    /// Query the worst-case output size when encoding from UTF-16 with
4718    /// replacement.
4719    ///
4720    /// Returns the size of the output buffer in bytes that will not overflow
4721    /// given the current state of the encoder and `u16_length` number of
4722    /// additional input code units if there are no unmappable characters in
4723    /// the input or `None` if `usize` would overflow.
4724    ///
4725    /// Available via the C wrapper.
4726    pub fn max_buffer_length_from_utf16_if_no_unmappables(
4727        &self,
4728        u16_length: usize,
4729    ) -> Option<usize> {
4730        checked_add(
4731            if self.encoding().can_encode_everything() {
4732                0
4733            } else {
4734                NCR_EXTRA
4735            },
4736            self.max_buffer_length_from_utf16_without_replacement(u16_length),
4737        )
4738    }
4739
/// Query the worst-case output size when encoding from UTF-16 without
/// replacement.
///
/// Returns the size of the output buffer in bytes that will not overflow
/// given the current state of the encoder and `u16_length` number of
/// additional input code units or `None` if `usize` would overflow.
///
/// The computation is delegated to the encoding-specific encoder.
///
/// Available via the C wrapper.
pub fn max_buffer_length_from_utf16_without_replacement(
    &self,
    u16_length: usize,
) -> Option<usize> {
    self.variant
        .max_buffer_length_from_utf16_without_replacement(u16_length)
}
4755
4756    /// Incrementally encode into byte stream from UTF-16 with unmappable
4757    /// characters replaced with HTML (decimal) numeric character references.
4758    ///
4759    /// See the documentation of the struct for documentation for `encode_*`
4760    /// methods collectively.
4761    ///
4762    /// Available via the C wrapper.
4763    pub fn encode_from_utf16(
4764        &mut self,
4765        src: &[u16],
4766        dst: &mut [u8],
4767        last: bool,
4768    ) -> (CoderResult, usize, usize, bool) {
4769        let dst_len = dst.len();
4770        let effective_dst_len = if self.encoding().can_encode_everything() {
4771            dst_len
4772        } else {
4773            if dst_len < NCR_EXTRA {
4774                if src.is_empty() && !(last && self.has_pending_state()) {
4775                    return (CoderResult::InputEmpty, 0, 0, false);
4776                }
4777                return (CoderResult::OutputFull, 0, 0, false);
4778            }
4779            dst_len - NCR_EXTRA
4780        };
4781        let mut had_unmappables = false;
4782        let mut total_read = 0usize;
4783        let mut total_written = 0usize;
4784        loop {
4785            let (result, read, written) = self.encode_from_utf16_without_replacement(
4786                &src[total_read..],
4787                &mut dst[total_written..effective_dst_len],
4788                last,
4789            );
4790            total_read += read;
4791            total_written += written;
4792            match result {
4793                EncoderResult::InputEmpty => {
4794                    return (
4795                        CoderResult::InputEmpty,
4796                        total_read,
4797                        total_written,
4798                        had_unmappables,
4799                    );
4800                }
4801                EncoderResult::OutputFull => {
4802                    return (
4803                        CoderResult::OutputFull,
4804                        total_read,
4805                        total_written,
4806                        had_unmappables,
4807                    );
4808                }
4809                EncoderResult::Unmappable(unmappable) => {
4810                    had_unmappables = true;
4811                    debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4812                    // There are no UTF-16 encoders and even if there were,
4813                    // they'd never have unmappables.
4814                    debug_assert_ne!(self.encoding(), UTF_16BE);
4815                    debug_assert_ne!(self.encoding(), UTF_16LE);
4816                    // Additionally, Iso2022JpEncoder is responsible for
4817                    // transitioning to ASCII when returning with Unmappable
4818                    // from the jis0208 state. That is, when we encode
4819                    // ISO-2022-JP and come here, the encoder is in either the
4820                    // ASCII or the Roman state. We are allowed to generate any
4821                    // printable ASCII excluding \ and ~.
4822                    total_written += write_ncr(unmappable, &mut dst[total_written..]);
4823                    if total_written >= effective_dst_len {
4824                        if total_read == src.len() && !(last && self.has_pending_state()) {
4825                            return (
4826                                CoderResult::InputEmpty,
4827                                total_read,
4828                                total_written,
4829                                had_unmappables,
4830                            );
4831                        }
4832                        return (
4833                            CoderResult::OutputFull,
4834                            total_read,
4835                            total_written,
4836                            had_unmappables,
4837                        );
4838                    }
4839                }
4840            }
4841        }
4842    }
4843
    /// Incrementally encode into byte stream from UTF-16 _without replacement_.
    ///
    /// Unmappable characters are not replaced here; they are reported to the
    /// caller through the returned `EncoderResult` (see
    /// `EncoderResult::Unmappable`).
    ///
    /// The two `usize` values in the returned tuple are treated by
    /// `encode_from_utf16` as the number of `src` code units read and the
    /// number of bytes written to `dst`, in that order.
    ///
    /// See the documentation of the struct for documentation for `encode_*`
    /// methods collectively.
    ///
    /// Available via the C wrapper.
    pub fn encode_from_utf16_without_replacement(
        &mut self,
        src: &[u16],
        dst: &mut [u8],
        last: bool,
    ) -> (EncoderResult, usize, usize) {
        // Pure delegation: the encoding-specific variant does all the work.
        self.variant.encode_from_utf16_raw(src, dst, last)
    }
4858}
4859
/// Writes the decimal numeric character reference (`&#...;`) for
/// `unmappable` to the start of `dst` without heap allocation.
///
/// Returns the number of bytes written. `dst` must be at least that long
/// and the code point is expected to be at least 10 (both are checked with
/// `debug_assert!`).
fn write_ncr(unmappable: char, dst: &mut [u8]) -> usize {
    let code_point = unmappable as u32;
    // Total length: the decimal digits plus 3 for "&#" and ";".
    let len = match code_point {
        // Review the outcome of https://github.com/whatwg/encoding/issues/15
        // to see if this case is possible
        0..=99 => 5usize,
        100..=999 => 6usize,
        1_000..=9_999 => 7usize,
        10_000..=99_999 => 8usize,
        100_000..=999_999 => 9usize,
        _ => 10usize,
    };
    debug_assert!(code_point >= 10u32);
    debug_assert!(len <= dst.len());

    // Fill from the right: terminator first, then the digits starting with
    // the least significant one.
    dst[len - 1] = b';';
    let mut remaining = code_point;
    let mut idx = len - 2;
    while remaining >= 10 {
        dst[idx] = b'0' + (remaining % 10) as u8;
        remaining /= 10;
        idx -= 1;
    }
    // Most significant digit.
    dst[idx] = b'0' + remaining as u8;
    dst[1] = b'#';
    dst[0] = b'&';
    len
}
4898
/// Returns `true` if `i` lies in the half-open range `start..end`.
///
/// The test is a single comparison on the wrapping distance from `start`.
/// `start` must not exceed `end`, otherwise `end - start` overflows.
#[inline(always)]
fn in_range16(i: u16, start: u16, end: u16) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset < width
}
4903
/// Returns `true` if `i` lies in the half-open range `start..end`.
///
/// The test is a single comparison on the wrapping distance from `start`.
/// `start` must not exceed `end`, otherwise `end - start` overflows.
#[inline(always)]
fn in_range32(i: u32, start: u32, end: u32) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset < width
}
4908
/// Returns `true` if `i` lies in the closed range `start..=end`.
///
/// The test is a single comparison on the wrapping distance from `start`.
/// `start` must not exceed `end`, otherwise `end - start` overflows.
#[inline(always)]
fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool {
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset <= span
}
4913
/// Returns `true` if `i` lies in the closed range `start..=end`.
///
/// The test is a single comparison on the wrapping distance from `start`.
/// `start` must not exceed `end`, otherwise `end - start` overflows.
#[inline(always)]
fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset <= span
}
4918
/// Returns `true` if `i` lies in the closed range `start..=end`.
///
/// The test is a single comparison on the wrapping distance from `start`.
/// `start` must not exceed `end`, otherwise `end - start` overflows.
#[inline(always)]
fn in_inclusive_range32(i: u32, start: u32, end: u32) -> bool {
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset <= span
}
4923
/// Returns `true` if `i` lies in the closed range `start..=end`.
///
/// The test is a single comparison on the wrapping distance from `start`.
/// `start` must not exceed `end`, otherwise `end - start` overflows.
#[inline(always)]
fn in_inclusive_range(i: usize, start: usize, end: usize) -> bool {
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset <= span
}
4928
/// Adds `num` to the value in `opt`.
///
/// Yields `None` when `opt` is `None` or when the sum overflows `usize`.
#[inline(always)]
fn checked_add(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_add(num))
}
4937
/// Adds two optional values.
///
/// Yields `None` when either operand is `None` or when the sum overflows
/// `usize`.
#[inline(always)]
fn checked_add_opt(one: Option<usize>, other: Option<usize>) -> Option<usize> {
    match (one, other) {
        (Some(a), Some(b)) => b.checked_add(a),
        _ => None,
    }
}
4946
/// Multiplies the value in `opt` by `num`.
///
/// Yields `None` when `opt` is `None` or when the product overflows `usize`.
#[inline(always)]
fn checked_mul(num: usize, opt: Option<usize>) -> Option<usize> {
    match opt {
        Some(factor) => factor.checked_mul(num),
        None => None,
    }
}
4955
/// Divides the value in `opt` by `num`.
///
/// Yields `None` when `opt` is `None` or when `num` is zero.
#[inline(always)]
fn checked_div(opt: Option<usize>, num: usize) -> Option<usize> {
    opt.and_then(|dividend| dividend.checked_div(num))
}
4964
4965#[cfg(feature = "alloc")]
/// Rounds the value in `opt` up to the next power of two.
///
/// Yields `None` when `opt` is `None` or when the next power of two does
/// not fit in `usize`. Plain `usize::next_power_of_two` would instead panic
/// in debug builds and return 0 in release builds on overflow, which is not
/// what a `checked_*` helper should hand to its callers.
#[inline(always)]
fn checked_next_power_of_two(opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_next_power_of_two())
}
4970
4971#[cfg(feature = "alloc")]
/// Returns the smaller of two optional values.
///
/// A missing operand is ignored: when only one side is `Some`, that side is
/// returned, and `None` is returned only when both are `None`.
#[inline(always)]
fn checked_min(one: Option<usize>, other: Option<usize>) -> Option<usize> {
    match (one, other) {
        (Some(a), Some(b)) => Some(a.min(b)),
        (Some(a), None) => Some(a),
        (None, rest) => rest,
    }
}
4984
4985// ############## TESTS ###############
4986
/// Record type that pairs plain fields with a `&'static Encoding` field and
/// derives `Serialize`/`Deserialize`.
///
/// Only compiled for test builds with the `serde` feature enabled.
/// NOTE(review): presumably used by serde round-trip tests further down in
/// the file; those are not visible here, so confirm before relying on it.
#[cfg(all(test, feature = "serde"))]
#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct Demo {
    /// Plain numeric field.
    num: u32,
    /// Plain owned string field.
    name: String,
    /// Reference to an encoding with `'static` lifetime.
    enc: &'static Encoding,
}
4994
4995#[cfg(test)]
4996mod test_labels_names;
4997
4998#[cfg(all(test, feature = "alloc"))]
4999mod tests {
5000    use super::*;
5001    use alloc::borrow::Cow;
5002
5003    fn sniff_to_utf16(
5004        initial_encoding: &'static Encoding,
5005        expected_encoding: &'static Encoding,
5006        bytes: &[u8],
5007        expect: &[u16],
5008        breaks: &[usize],
5009    ) {
5010        let mut decoder = initial_encoding.new_decoder();
5011
5012        let mut dest: Vec<u16> =
5013            Vec::with_capacity(decoder.max_utf16_buffer_length(bytes.len()).unwrap());
5014        let capacity = dest.capacity();
5015        dest.resize(capacity, 0u16);
5016
5017        let mut total_written = 0usize;
5018        let mut start = 0usize;
5019        for br in breaks {
5020            let (result, read, written, _) =
5021                decoder.decode_to_utf16(&bytes[start..*br], &mut dest[total_written..], false);
5022            total_written += written;
5023            assert_eq!(read, *br - start);
5024            match result {
5025                CoderResult::InputEmpty => {}
5026                CoderResult::OutputFull => {
5027                    unreachable!();
5028                }
5029            }
5030            start = *br;
5031        }
5032        let (result, read, written, _) =
5033            decoder.decode_to_utf16(&bytes[start..], &mut dest[total_written..], true);
5034        total_written += written;
5035        match result {
5036            CoderResult::InputEmpty => {}
5037            CoderResult::OutputFull => {
5038                unreachable!();
5039            }
5040        }
5041        assert_eq!(read, bytes.len() - start);
5042        assert_eq!(total_written, expect.len());
5043        assert_eq!(&dest[..total_written], expect);
5044        assert_eq!(decoder.encoding(), expected_encoding);
5045    }
5046
5047    // Any copyright to the test code below this comment is dedicated to the
5048    // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
5049
5050    #[test]
5051    fn test_bom_sniffing() {
5052        // ASCII
5053        sniff_to_utf16(
5054            WINDOWS_1252,
5055            WINDOWS_1252,
5056            b"\x61\x62",
5057            &[0x0061u16, 0x0062u16],
5058            &[],
5059        );
5060        // UTF-8
5061        sniff_to_utf16(
5062            WINDOWS_1252,
5063            UTF_8,
5064            b"\xEF\xBB\xBF\x61\x62",
5065            &[0x0061u16, 0x0062u16],
5066            &[],
5067        );
5068        sniff_to_utf16(
5069            WINDOWS_1252,
5070            UTF_8,
5071            b"\xEF\xBB\xBF\x61\x62",
5072            &[0x0061u16, 0x0062u16],
5073            &[1],
5074        );
5075        sniff_to_utf16(
5076            WINDOWS_1252,
5077            UTF_8,
5078            b"\xEF\xBB\xBF\x61\x62",
5079            &[0x0061u16, 0x0062u16],
5080            &[2],
5081        );
5082        sniff_to_utf16(
5083            WINDOWS_1252,
5084            UTF_8,
5085            b"\xEF\xBB\xBF\x61\x62",
5086            &[0x0061u16, 0x0062u16],
5087            &[3],
5088        );
5089        sniff_to_utf16(
5090            WINDOWS_1252,
5091            UTF_8,
5092            b"\xEF\xBB\xBF\x61\x62",
5093            &[0x0061u16, 0x0062u16],
5094            &[4],
5095        );
5096        sniff_to_utf16(
5097            WINDOWS_1252,
5098            UTF_8,
5099            b"\xEF\xBB\xBF\x61\x62",
5100            &[0x0061u16, 0x0062u16],
5101            &[2, 3],
5102        );
5103        sniff_to_utf16(
5104            WINDOWS_1252,
5105            UTF_8,
5106            b"\xEF\xBB\xBF\x61\x62",
5107            &[0x0061u16, 0x0062u16],
5108            &[1, 2],
5109        );
5110        sniff_to_utf16(
5111            WINDOWS_1252,
5112            UTF_8,
5113            b"\xEF\xBB\xBF\x61\x62",
5114            &[0x0061u16, 0x0062u16],
5115            &[1, 3],
5116        );
5117        sniff_to_utf16(
5118            WINDOWS_1252,
5119            UTF_8,
5120            b"\xEF\xBB\xBF\x61\x62",
5121            &[0x0061u16, 0x0062u16],
5122            &[1, 2, 3, 4],
5123        );
5124        sniff_to_utf16(WINDOWS_1252, UTF_8, b"\xEF\xBB\xBF", &[], &[]);
5125        // Not UTF-8
5126        sniff_to_utf16(
5127            WINDOWS_1252,
5128            WINDOWS_1252,
5129            b"\xEF\xBB\x61\x62",
5130            &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
5131            &[],
5132        );
5133        sniff_to_utf16(
5134            WINDOWS_1252,
5135            WINDOWS_1252,
5136            b"\xEF\xBB\x61\x62",
5137            &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
5138            &[1],
5139        );
5140        sniff_to_utf16(
5141            WINDOWS_1252,
5142            WINDOWS_1252,
5143            b"\xEF\x61\x62",
5144            &[0x00EFu16, 0x0061u16, 0x0062u16],
5145            &[],
5146        );
5147        sniff_to_utf16(
5148            WINDOWS_1252,
5149            WINDOWS_1252,
5150            b"\xEF\x61\x62",
5151            &[0x00EFu16, 0x0061u16, 0x0062u16],
5152            &[1],
5153        );
5154        sniff_to_utf16(
5155            WINDOWS_1252,
5156            WINDOWS_1252,
5157            b"\xEF\xBB",
5158            &[0x00EFu16, 0x00BBu16],
5159            &[],
5160        );
5161        sniff_to_utf16(
5162            WINDOWS_1252,
5163            WINDOWS_1252,
5164            b"\xEF\xBB",
5165            &[0x00EFu16, 0x00BBu16],
5166            &[1],
5167        );
5168        sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xEF", &[0x00EFu16], &[]);
5169        // Not UTF-16
5170        sniff_to_utf16(
5171            WINDOWS_1252,
5172            WINDOWS_1252,
5173            b"\xFE\x61\x62",
5174            &[0x00FEu16, 0x0061u16, 0x0062u16],
5175            &[],
5176        );
5177        sniff_to_utf16(
5178            WINDOWS_1252,
5179            WINDOWS_1252,
5180            b"\xFE\x61\x62",
5181            &[0x00FEu16, 0x0061u16, 0x0062u16],
5182            &[1],
5183        );
5184        sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xFE", &[0x00FEu16], &[]);
5185        sniff_to_utf16(
5186            WINDOWS_1252,
5187            WINDOWS_1252,
5188            b"\xFF\x61\x62",
5189            &[0x00FFu16, 0x0061u16, 0x0062u16],
5190            &[],
5191        );
5192        sniff_to_utf16(
5193            WINDOWS_1252,
5194            WINDOWS_1252,
5195            b"\xFF\x61\x62",
5196            &[0x00FFu16, 0x0061u16, 0x0062u16],
5197            &[1],
5198        );
5199        sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xFF", &[0x00FFu16], &[]);
5200        // UTF-16
5201        sniff_to_utf16(WINDOWS_1252, UTF_16BE, b"\xFE\xFF", &[], &[]);
5202        sniff_to_utf16(WINDOWS_1252, UTF_16BE, b"\xFE\xFF", &[], &[1]);
5203        sniff_to_utf16(WINDOWS_1252, UTF_16LE, b"\xFF\xFE", &[], &[]);
5204        sniff_to_utf16(WINDOWS_1252, UTF_16LE, b"\xFF\xFE", &[], &[1]);
5205    }
5206
5207    #[test]
5208    fn test_output_encoding() {
5209        assert_eq!(REPLACEMENT.output_encoding(), UTF_8);
5210        assert_eq!(UTF_16BE.output_encoding(), UTF_8);
5211        assert_eq!(UTF_16LE.output_encoding(), UTF_8);
5212        assert_eq!(UTF_8.output_encoding(), UTF_8);
5213        assert_eq!(WINDOWS_1252.output_encoding(), WINDOWS_1252);
5214        assert_eq!(REPLACEMENT.new_encoder().encoding(), UTF_8);
5215        assert_eq!(UTF_16BE.new_encoder().encoding(), UTF_8);
5216        assert_eq!(UTF_16LE.new_encoder().encoding(), UTF_8);
5217        assert_eq!(UTF_8.new_encoder().encoding(), UTF_8);
5218        assert_eq!(WINDOWS_1252.new_encoder().encoding(), WINDOWS_1252);
5219    }
5220
5221    #[test]
5222    fn test_label_resolution() {
5223        assert_eq!(Encoding::for_label(b"utf-8"), Some(UTF_8));
5224        assert_eq!(Encoding::for_label(b"UTF-8"), Some(UTF_8));
5225        assert_eq!(
5226            Encoding::for_label(b" \t \n \x0C \n utf-8 \r \n \t \x0C "),
5227            Some(UTF_8)
5228        );
5229        assert_eq!(Encoding::for_label(b"utf-8 _"), None);
5230        assert_eq!(Encoding::for_label(b"bogus"), None);
5231        assert_eq!(Encoding::for_label(b"bogusbogusbogusbogus"), None);
5232    }
5233
5234    #[test]
5235    fn test_decode_valid_windows_1257_to_cow() {
5236        let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xE4");
5237        match cow {
5238            Cow::Borrowed(_) => unreachable!(),
5239            Cow::Owned(s) => {
5240                assert_eq!(s, "abc\u{20AC}\u{00E4}");
5241            }
5242        }
5243        assert_eq!(encoding, WINDOWS_1257);
5244        assert!(!had_errors);
5245    }
5246
5247    #[test]
5248    fn test_decode_invalid_windows_1257_to_cow() {
5249        let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xA1\xE4");
5250        match cow {
5251            Cow::Borrowed(_) => unreachable!(),
5252            Cow::Owned(s) => {
5253                assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5254            }
5255        }
5256        assert_eq!(encoding, WINDOWS_1257);
5257        assert!(had_errors);
5258    }
5259
5260    #[test]
5261    fn test_decode_ascii_only_windows_1257_to_cow() {
5262        let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc");
5263        match cow {
5264            Cow::Borrowed(s) => {
5265                assert_eq!(s, "abc");
5266            }
5267            Cow::Owned(_) => unreachable!(),
5268        }
5269        assert_eq!(encoding, WINDOWS_1257);
5270        assert!(!had_errors);
5271    }
5272
5273    #[test]
5274    fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow() {
5275        let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5276        match cow {
5277            Cow::Borrowed(s) => {
5278                assert_eq!(s, "\u{20AC}\u{00E4}");
5279            }
5280            Cow::Owned(_) => unreachable!(),
5281        }
5282        assert_eq!(encoding, UTF_8);
5283        assert!(!had_errors);
5284    }
5285
5286    #[test]
5287    fn test_decode_bomful_invalid_utf8_as_windows_1257_to_cow() {
5288        let (cow, encoding, had_errors) =
5289            WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5290        match cow {
5291            Cow::Borrowed(_) => unreachable!(),
5292            Cow::Owned(s) => {
5293                assert_eq!(s, "\u{20AC}\u{FFFD}\u{00E4}");
5294            }
5295        }
5296        assert_eq!(encoding, UTF_8);
5297        assert!(had_errors);
5298    }
5299
5300    #[test]
5301    fn test_decode_bomful_valid_utf8_as_utf_8_to_cow() {
5302        let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5303        match cow {
5304            Cow::Borrowed(s) => {
5305                assert_eq!(s, "\u{20AC}\u{00E4}");
5306            }
5307            Cow::Owned(_) => unreachable!(),
5308        }
5309        assert_eq!(encoding, UTF_8);
5310        assert!(!had_errors);
5311    }
5312
5313    #[test]
5314    fn test_decode_bomful_invalid_utf8_as_utf_8_to_cow() {
5315        let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5316        match cow {
5317            Cow::Borrowed(_) => unreachable!(),
5318            Cow::Owned(s) => {
5319                assert_eq!(s, "\u{20AC}\u{FFFD}\u{00E4}");
5320            }
5321        }
5322        assert_eq!(encoding, UTF_8);
5323        assert!(had_errors);
5324    }
5325
5326    #[test]
5327    fn test_decode_bomful_valid_utf8_as_utf_8_to_cow_with_bom_removal() {
5328        let (cow, had_errors) = UTF_8.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5329        match cow {
5330            Cow::Borrowed(s) => {
5331                assert_eq!(s, "\u{20AC}\u{00E4}");
5332            }
5333            Cow::Owned(_) => unreachable!(),
5334        }
5335        assert!(!had_errors);
5336    }
5337
5338    #[test]
5339    fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow_with_bom_removal() {
5340        let (cow, had_errors) =
5341            WINDOWS_1257.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5342        match cow {
5343            Cow::Borrowed(_) => unreachable!(),
5344            Cow::Owned(s) => {
5345                assert_eq!(
5346                    s,
5347                    "\u{013C}\u{00BB}\u{00E6}\u{0101}\u{201A}\u{00AC}\u{0106}\u{00A4}"
5348                );
5349            }
5350        }
5351        assert!(!had_errors);
5352    }
5353
5354    #[test]
5355    fn test_decode_valid_windows_1257_to_cow_with_bom_removal() {
5356        let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xE4");
5357        match cow {
5358            Cow::Borrowed(_) => unreachable!(),
5359            Cow::Owned(s) => {
5360                assert_eq!(s, "abc\u{20AC}\u{00E4}");
5361            }
5362        }
5363        assert!(!had_errors);
5364    }
5365
5366    #[test]
5367    fn test_decode_invalid_windows_1257_to_cow_with_bom_removal() {
5368        let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xA1\xE4");
5369        match cow {
5370            Cow::Borrowed(_) => unreachable!(),
5371            Cow::Owned(s) => {
5372                assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5373            }
5374        }
5375        assert!(had_errors);
5376    }
5377
5378    #[test]
5379    fn test_decode_ascii_only_windows_1257_to_cow_with_bom_removal() {
5380        let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc");
5381        match cow {
5382            Cow::Borrowed(s) => {
5383                assert_eq!(s, "abc");
5384            }
5385            Cow::Owned(_) => unreachable!(),
5386        }
5387        assert!(!had_errors);
5388    }
5389
5390    #[test]
5391    fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling() {
5392        let (cow, had_errors) =
5393            UTF_8.decode_without_bom_handling(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5394        match cow {
5395            Cow::Borrowed(s) => {
5396                assert_eq!(s, "\u{FEFF}\u{20AC}\u{00E4}");
5397            }
5398            Cow::Owned(_) => unreachable!(),
5399        }
5400        assert!(!had_errors);
5401    }
5402
5403    #[test]
5404    fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling() {
5405        let (cow, had_errors) =
5406            UTF_8.decode_without_bom_handling(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5407        match cow {
5408            Cow::Borrowed(_) => unreachable!(),
5409            Cow::Owned(s) => {
5410                assert_eq!(s, "\u{FEFF}\u{20AC}\u{FFFD}\u{00E4}");
5411            }
5412        }
5413        assert!(had_errors);
5414    }
5415
5416    #[test]
5417    fn test_decode_valid_windows_1257_to_cow_without_bom_handling() {
5418        let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xE4");
5419        match cow {
5420            Cow::Borrowed(_) => unreachable!(),
5421            Cow::Owned(s) => {
5422                assert_eq!(s, "abc\u{20AC}\u{00E4}");
5423            }
5424        }
5425        assert!(!had_errors);
5426    }
5427
5428    #[test]
5429    fn test_decode_invalid_windows_1257_to_cow_without_bom_handling() {
5430        let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xA1\xE4");
5431        match cow {
5432            Cow::Borrowed(_) => unreachable!(),
5433            Cow::Owned(s) => {
5434                assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5435            }
5436        }
5437        assert!(had_errors);
5438    }
5439
5440    #[test]
5441    fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling() {
5442        let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc");
5443        match cow {
5444            Cow::Borrowed(s) => {
5445                assert_eq!(s, "abc");
5446            }
5447            Cow::Owned(_) => unreachable!(),
5448        }
5449        assert!(!had_errors);
5450    }
5451
5452    #[test]
5453    fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling_and_without_replacement() {
5454        match UTF_8.decode_without_bom_handling_and_without_replacement(
5455            b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4",
5456        ) {
5457            Some(cow) => match cow {
5458                Cow::Borrowed(s) => {
5459                    assert_eq!(s, "\u{FEFF}\u{20AC}\u{00E4}");
5460                }
5461                Cow::Owned(_) => unreachable!(),
5462            },
5463            None => unreachable!(),
5464        }
5465    }
5466
5467    #[test]
5468    fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling_and_without_replacement() {
5469        assert!(UTF_8
5470            .decode_without_bom_handling_and_without_replacement(
5471                b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4"
5472            )
5473            .is_none());
5474    }
5475
5476    #[test]
5477    fn test_decode_valid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5478        match WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc\x80\xE4") {
5479            Some(cow) => match cow {
5480                Cow::Borrowed(_) => unreachable!(),
5481                Cow::Owned(s) => {
5482                    assert_eq!(s, "abc\u{20AC}\u{00E4}");
5483                }
5484            },
5485            None => unreachable!(),
5486        }
5487    }
5488
5489    #[test]
5490    fn test_decode_invalid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5491        assert!(WINDOWS_1257
5492            .decode_without_bom_handling_and_without_replacement(b"abc\x80\xA1\xE4")
5493            .is_none());
5494    }
5495
5496    #[test]
5497    fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5498        match WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc") {
5499            Some(cow) => match cow {
5500                Cow::Borrowed(s) => {
5501                    assert_eq!(s, "abc");
5502                }
5503                Cow::Owned(_) => unreachable!(),
5504            },
5505            None => unreachable!(),
5506        }
5507    }
5508
5509    #[test]
5510    fn test_encode_ascii_only_windows_1257_to_cow() {
5511        let (cow, encoding, had_errors) = WINDOWS_1257.encode("abc");
5512        match cow {
5513            Cow::Borrowed(s) => {
5514                assert_eq!(s, b"abc");
5515            }
5516            Cow::Owned(_) => unreachable!(),
5517        }
5518        assert_eq!(encoding, WINDOWS_1257);
5519        assert!(!had_errors);
5520    }
5521
5522    #[test]
5523    fn test_encode_valid_windows_1257_to_cow() {
5524        let (cow, encoding, had_errors) = WINDOWS_1257.encode("abc\u{20AC}\u{00E4}");
5525        match cow {
5526            Cow::Borrowed(_) => unreachable!(),
5527            Cow::Owned(s) => {
5528                assert_eq!(s, b"abc\x80\xE4");
5529            }
5530        }
5531        assert_eq!(encoding, WINDOWS_1257);
5532        assert!(!had_errors);
5533    }
5534
5535    #[test]
5536    fn test_utf16_space_with_one_bom_byte() {
5537        let mut decoder = UTF_16LE.new_decoder();
5538        let mut dst = [0u16; 12];
5539        {
5540            let needed = decoder.max_utf16_buffer_length(1).unwrap();
5541            let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], false);
5542            assert_eq!(result, CoderResult::InputEmpty);
5543        }
5544        {
5545            let needed = decoder.max_utf16_buffer_length(1).unwrap();
5546            let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5547            assert_eq!(result, CoderResult::InputEmpty);
5548        }
5549    }
5550
5551    #[test]
5552    fn test_utf8_space_with_one_bom_byte() {
5553        let mut decoder = UTF_8.new_decoder();
5554        let mut dst = [0u16; 12];
5555        {
5556            let needed = decoder.max_utf16_buffer_length(1).unwrap();
5557            let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], false);
5558            assert_eq!(result, CoderResult::InputEmpty);
5559        }
5560        {
5561            let needed = decoder.max_utf16_buffer_length(1).unwrap();
5562            let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5563            assert_eq!(result, CoderResult::InputEmpty);
5564        }
5565    }
5566
5567    #[test]
5568    fn test_utf16_space_with_two_bom_bytes() {
5569        let mut decoder = UTF_16LE.new_decoder();
5570        let mut dst = [0u16; 12];
5571        {
5572            let needed = decoder.max_utf16_buffer_length(1).unwrap();
5573            let (result, _, _, _) = decoder.decode_to_utf16(b"\xEF", &mut dst[..needed], false);
5574            assert_eq!(result, CoderResult::InputEmpty);
5575        }
5576        {
5577            let needed = decoder.max_utf16_buffer_length(1).unwrap();
5578            let (result, _, _, _) = decoder.decode_to_utf16(b"\xBB", &mut dst[..needed], false);
5579            assert_eq!(result, CoderResult::InputEmpty);
5580        }
5581        {
5582            let needed = decoder.max_utf16_buffer_length(1).unwrap();
5583            let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5584            assert_eq!(result, CoderResult::InputEmpty);
5585        }
5586    }
5587
5588    #[test]
5589    fn test_utf8_space_with_two_bom_bytes() {
5590        let mut decoder = UTF_8.new_decoder();
5591        let mut dst = [0u16; 12];
5592        {
5593            let needed = decoder.max_utf16_buffer_length(1).unwrap();
5594            let (result, _, _, _) = decoder.decode_to_utf16(b"\xEF", &mut dst[..needed], false);
5595            assert_eq!(result, CoderResult::InputEmpty);
5596        }
5597        {
5598            let needed = decoder.max_utf16_buffer_length(1).unwrap();
5599            let (result, _, _, _) = decoder.decode_to_utf16(b"\xBB", &mut dst[..needed], false);
5600            assert_eq!(result, CoderResult::InputEmpty);
5601        }
5602        {
5603            let needed = decoder.max_utf16_buffer_length(1).unwrap();
5604            let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], true);
5605            assert_eq!(result, CoderResult::InputEmpty);
5606        }
5607    }
5608
5609    #[test]
5610    fn test_utf16_space_with_one_bom_byte_and_a_second_byte_in_same_call() {
5611        let mut decoder = UTF_16LE.new_decoder();
5612        let mut dst = [0u16; 12];
5613        {
5614            let needed = decoder.max_utf16_buffer_length(2).unwrap();
5615            let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF\xFF", &mut dst[..needed], true);
5616            assert_eq!(result, CoderResult::InputEmpty);
5617        }
5618    }
5619
5620    #[test]
5621    fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf8() {
5622        let mut dst = [0u8; 8];
5623        let mut encoder = ISO_2022_JP.new_encoder();
5624        {
5625            let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..], false);
5626            assert_eq!(result, CoderResult::InputEmpty);
5627        }
5628        {
5629            let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..], true);
5630            assert_eq!(result, CoderResult::InputEmpty);
5631        }
5632    }
5633
5634    #[test]
5635    fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf8() {
5636        let mut dst = [0u8; 16];
5637        let mut encoder = ISO_2022_JP.new_encoder();
5638        {
5639            let (result, _, _, _) = encoder.encode_from_utf8("\u{A5}", &mut dst[..], false);
5640            assert_eq!(result, CoderResult::InputEmpty);
5641        }
5642        {
5643            let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..8], false);
5644            assert_eq!(result, CoderResult::InputEmpty);
5645        }
5646        {
5647            let (result, _, _, _) = encoder.encode_from_utf8("", &mut dst[..8], true);
5648            assert_eq!(result, CoderResult::OutputFull);
5649        }
5650    }
5651
5652    #[test]
5653    fn test_buffer_end_iso_2022_jp_from_utf8() {
5654        let mut dst = [0u8; 18];
5655        {
5656            let mut encoder = ISO_2022_JP.new_encoder();
5657            let (result, _, _, _) =
5658                encoder.encode_from_utf8("\u{A5}\u{1F4A9}", &mut dst[..], false);
5659            assert_eq!(result, CoderResult::InputEmpty);
5660        }
5661        {
5662            let mut encoder = ISO_2022_JP.new_encoder();
5663            let (result, _, _, _) = encoder.encode_from_utf8("\u{A5}\u{1F4A9}", &mut dst[..], true);
5664            assert_eq!(result, CoderResult::OutputFull);
5665        }
5666        {
5667            let mut encoder = ISO_2022_JP.new_encoder();
5668            let (result, _, _, _) = encoder.encode_from_utf8("\u{1F4A9}", &mut dst[..13], false);
5669            assert_eq!(result, CoderResult::InputEmpty);
5670        }
5671        {
5672            let mut encoder = ISO_2022_JP.new_encoder();
5673            let (result, _, _, _) = encoder.encode_from_utf8("\u{1F4A9}", &mut dst[..13], true);
5674            assert_eq!(result, CoderResult::InputEmpty);
5675        }
5676    }
5677
5678    #[test]
5679    fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf16() {
5680        let mut dst = [0u8; 8];
5681        let mut encoder = ISO_2022_JP.new_encoder();
5682        {
5683            let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..], false);
5684            assert_eq!(result, CoderResult::InputEmpty);
5685        }
5686        {
5687            let (result, _, _, _) = encoder.encode_from_utf16(&[0u16; 0], &mut dst[..], true);
5688            assert_eq!(result, CoderResult::InputEmpty);
5689        }
5690    }
5691
/// Drives a single ISO-2022-JP encoder through three UTF-16 calls:
/// U+00A5 into the full 16-byte buffer, then two empty inputs into only
/// the first 8 bytes. Only the final call, made with `last == true`, is
/// expected to report `OutputFull`.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf16() {
    let mut encoder = ISO_2022_JP.new_encoder();
    let mut dst = [0u8; 16];
    let nothing = [0u16; 0];

    // Step 1: encode U+00A5 with the whole buffer available.
    let first = encoder.encode_from_utf16(&[0xA5u16], &mut dst[..], false).0;
    assert_eq!(first, CoderResult::InputEmpty);

    // Step 2: no new input, more input may follow, 8 bytes of space.
    let second = encoder.encode_from_utf16(&nothing, &mut dst[..8], false).0;
    assert_eq!(second, CoderResult::InputEmpty);

    // Step 3: no new input, end of stream, still only 8 bytes of space.
    let third = encoder.encode_from_utf16(&nothing, &mut dst[..8], true).0;
    assert_eq!(third, CoderResult::OutputFull);
}
5709
/// UTF-16 counterpart of the buffer-end check: encodes U+00A5 followed by
/// the surrogate pair 0xD83D 0xDCA9 (U+1F4A9), or the surrogate pair alone,
/// to ISO-2022-JP and checks the reported `CoderResult`.
///
/// Every call uses a freshly created encoder.
#[test]
fn test_buffer_end_iso_2022_jp_from_utf16() {
    let mut dst = [0u8; 18];
    let yen_then_astral = [0xA5u16, 0xD83Du16, 0xDCA9u16];
    let astral_only = [0xD83Du16, 0xDCA9u16];

    // Encodes `units` into the first `capacity` bytes of `dst` with a new
    // encoder and hands back only the status part of the result tuple.
    let mut encode_fresh = |units: &[u16], capacity: usize, last: bool| {
        let mut encoder = ISO_2022_JP.new_encoder();
        let (status, _, _, _) = encoder.encode_from_utf16(units, &mut dst[..capacity], last);
        status
    };

    assert_eq!(
        encode_fresh(&yen_then_astral[..], 18, false),
        CoderResult::InputEmpty
    );
    assert_eq!(
        encode_fresh(&yen_then_astral[..], 18, true),
        CoderResult::OutputFull
    );
    assert_eq!(
        encode_fresh(&astral_only[..], 13, false),
        CoderResult::InputEmpty
    );
    assert_eq!(
        encode_fresh(&astral_only[..], 13, true),
        CoderResult::InputEmpty
    );
}
5738
/// Feeds the bytes 0xD8 0x00 (a lone high surrogate in big-endian UTF-16)
/// to a UTF-16BE decoder twice, with a 4-byte UTF-8 destination.
///
/// The first call (`last == false`) must consume both bytes without writing
/// anything or reporting replacements. The second call (`last == true`) is
/// made only for its effect; its result is not checked.
#[test]
fn test_buffer_end_utf16be() {
    let lone_high_surrogate = [0xD8u8, 0x00u8];
    let mut dest = [0u8; 4];
    let mut decoder = UTF_16BE.new_decoder_without_bom_handling();

    let (result, read, written, had_replacements) =
        decoder.decode_to_utf8(&lone_high_surrogate, &mut dest, false);
    assert_eq!(result, CoderResult::InputEmpty);
    assert_eq!(read, 2);
    assert_eq!(written, 0);
    assert!(!had_replacements);

    // NOTE(review): presumably this only guards against a panic at end of
    // stream; the returned tuple is intentionally discarded.
    let _ = decoder.decode_to_utf8(&lone_high_surrogate, &mut dest, true);
}
5751
/// Verifies that encodings can be stored in, looked up in, and removed from
/// a set. Despite the test's name, the set used here is a `BTreeSet`.
#[test]
fn test_hash() {
    let mut seen = ::alloc::collections::btree_set::BTreeSet::new();
    for &encoding in [UTF_8, ISO_2022_JP].iter() {
        seen.insert(encoding);
    }

    // Inserted encodings are found; one that was never inserted is not.
    assert!(seen.contains(UTF_8));
    assert!(seen.contains(ISO_2022_JP));
    assert!(!seen.contains(WINDOWS_1252));

    // Removal makes the encoding disappear from the set.
    seen.remove(ISO_2022_JP);
    assert!(!seen.contains(ISO_2022_JP));
}
5763
/// Encodes U+3041 followed by U+FFFF from UTF-16 to ISO-2022-JP as the
/// final input, with a 17-byte destination, and expects `OutputFull`.
#[test]
fn test_iso_2022_jp_ncr_extra_from_utf16() {
    let input = [0x3041u16, 0xFFFFu16];
    let mut dst = [0u8; 17];
    let mut encoder = ISO_2022_JP.new_encoder();

    let status = encoder.encode_from_utf16(&input, &mut dst[..], true).0;
    assert_eq!(status, CoderResult::OutputFull);
}
5774
/// Encodes U+3041 followed by U+FFFF from UTF-8 to ISO-2022-JP as the
/// final input, with a 17-byte destination, and expects `OutputFull`.
#[test]
fn test_iso_2022_jp_ncr_extra_from_utf8() {
    let text = "\u{3041}\u{FFFF}";
    let mut dst = [0u8; 17];
    let mut encoder = ISO_2022_JP.new_encoder();

    let outcome = encoder.encode_from_utf8(text, &mut dst[..], true);
    assert_eq!(outcome.0, CoderResult::OutputFull);
}
5785
/// Decodes the bytes EF BB BF followed by `A` with a decoder obtained from
/// `REPLACEMENT.new_decoder()`, sizing the output slice with
/// `max_utf8_buffer_length_without_replacement`.
///
/// All four input bytes must be consumed, while exactly one byte (`A`)
/// is written to the output.
#[test]
fn test_max_length_with_bom_to_utf8() {
    let input: &[u8] = b"\xEF\xBB\xBFA";
    let mut decoder = REPLACEMENT.new_decoder();
    let mut output = [0u8; 20];

    // Use exactly the length the decoder says is sufficient.
    let capacity = decoder
        .max_utf8_buffer_length_without_replacement(input.len())
        .unwrap();
    let window = &mut output[..capacity];

    let (result, read, written) = decoder.decode_to_utf8_without_replacement(input, window, true);
    assert_eq!(result, DecoderResult::InputEmpty);
    assert_eq!(read, input.len());
    assert_eq!(written, 1);
    assert_eq!(output[0], b'A');
}
5803
/// Round-trips a `Demo` value holding `UTF_8` through both `serde_json`
/// and `bincode`, expecting each deserialized value to equal the original.
/// Only compiled when the `serde` feature is enabled.
#[cfg(feature = "serde")]
#[test]
fn test_serde() {
    let original = Demo {
        enc: UTF_8,
        name: "foo".into(),
        num: 42,
    };

    // JSON round trip.
    let json = serde_json::to_string(&original).unwrap();
    let from_json: Demo = serde_json::from_str(&json).unwrap();
    assert_eq!(from_json, original);

    // bincode round trip.
    let bytes = bincode::serialize(&original).unwrap();
    let from_bincode: Demo = bincode::deserialize(bytes.as_slice()).unwrap();
    assert_eq!(from_bincode, original);
}
5822
/// Checks `is_single_byte()` for every encoding constant listed below:
/// the first group must report `false`, the second group `true`.
#[test]
fn test_is_single_byte() {
    // Encodings for which `is_single_byte()` must be false.
    let not_single_byte = [
        BIG5,
        EUC_JP,
        EUC_KR,
        GB18030,
        GBK,
        REPLACEMENT,
        SHIFT_JIS,
        UTF_8,
        UTF_16BE,
        UTF_16LE,
        ISO_2022_JP,
    ];
    for encoding in not_single_byte.iter() {
        assert!(!encoding.is_single_byte(), "{:?}", encoding);
    }

    // Encodings for which `is_single_byte()` must be true.
    let single_byte = [
        IBM866,
        ISO_8859_2,
        ISO_8859_3,
        ISO_8859_4,
        ISO_8859_5,
        ISO_8859_6,
        ISO_8859_7,
        ISO_8859_8,
        ISO_8859_10,
        ISO_8859_13,
        ISO_8859_14,
        ISO_8859_15,
        ISO_8859_16,
        ISO_8859_8_I,
        KOI8_R,
        KOI8_U,
        MACINTOSH,
        WINDOWS_874,
        WINDOWS_1250,
        WINDOWS_1251,
        WINDOWS_1252,
        WINDOWS_1253,
        WINDOWS_1254,
        WINDOWS_1255,
        WINDOWS_1256,
        WINDOWS_1257,
        WINDOWS_1258,
        X_MAC_CYRILLIC,
        X_USER_DEFINED,
    ];
    for encoding in single_byte.iter() {
        assert!(encoding.is_single_byte(), "{:?}", encoding);
    }
}
5867
/// Checks `Decoder::latin1_byte_compatible_up_to` on the 7-byte input
/// `a 81 B6 F6 F0 82 B4` for a decoder of every encoding constant listed
/// below, then for a BOM-sniffing UTF-8 decoder in several states.
#[test]
fn test_latin1_byte_compatible_up_to() {
    let buffer: &[u8] = b"a\x81\xB6\xF6\xF0\x82\xB4";

    // (encoding, expected result from a fresh decoder without BOM handling)
    let expectations = [
        (BIG5, Some(1usize)),
        (EUC_JP, Some(1)),
        (EUC_KR, Some(1)),
        (GB18030, Some(1)),
        (GBK, Some(1)),
        (REPLACEMENT, None),
        (SHIFT_JIS, Some(1)),
        (UTF_8, Some(1)),
        (UTF_16BE, None),
        (UTF_16LE, None),
        (ISO_2022_JP, Some(1)),
        (IBM866, Some(1)),
        (ISO_8859_2, Some(2)),
        (ISO_8859_3, Some(2)),
        (ISO_8859_4, Some(2)),
        (ISO_8859_5, Some(2)),
        (ISO_8859_6, Some(2)),
        (ISO_8859_7, Some(2)),
        (ISO_8859_8, Some(3)),
        (ISO_8859_10, Some(2)),
        (ISO_8859_13, Some(4)),
        (ISO_8859_14, Some(4)),
        (ISO_8859_15, Some(6)),
        (ISO_8859_16, Some(4)),
        (ISO_8859_8_I, Some(3)),
        (KOI8_R, Some(1)),
        (KOI8_U, Some(1)),
        (MACINTOSH, Some(1)),
        (WINDOWS_874, Some(2)),
        (WINDOWS_1250, Some(4)),
        (WINDOWS_1251, Some(1)),
        (WINDOWS_1252, Some(5)),
        (WINDOWS_1253, Some(3)),
        (WINDOWS_1254, Some(4)),
        (WINDOWS_1255, Some(3)),
        (WINDOWS_1256, Some(1)),
        (WINDOWS_1257, Some(4)),
        (WINDOWS_1258, Some(4)),
        (X_MAC_CYRILLIC, Some(1)),
        (X_USER_DEFINED, Some(1)),
    ];
    for (encoding, expected) in expectations.iter() {
        let actual = encoding
            .new_decoder_without_bom_handling()
            .latin1_byte_compatible_up_to(buffer);
        assert_eq!(actual, *expected, "{:?}", encoding);
    }

    // A BOM-sniffing UTF-8 decoder that has seen no input reports `None`.
    let untouched = UTF_8.new_decoder();
    assert!(untouched.latin1_byte_compatible_up_to(buffer).is_none());

    // Feed one BOM-sniffing UTF-8 decoder step by step and check the answer
    // after each chunk: part of a BOM, the rest of the BOM, then a lone 0xEF.
    let mut decoder = UTF_8.new_decoder();
    let mut scratch = [0u16; 4];
    let steps = [
        (&b"\xEF"[..], None),
        (&b"\xBB\xBF"[..], Some(1usize)),
        (&b"\xEF"[..], None),
    ];
    for (chunk, expected) in steps.iter() {
        let _ = decoder.decode_to_utf16(chunk, &mut scratch, false);
        assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), *expected);
    }
}
6156}
encoding_rs/lib.rs

encoding_rs/
lib.rs