encoding_rs/lib.rs
1// Copyright Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10// The above license applies to code in this file. The label data in
11// this file is generated from WHATWG's encodings.json, which came under
12// the following license:
13
14// Copyright © WHATWG (Apple, Google, Mozilla, Microsoft).
15//
16// Redistribution and use in source and binary forms, with or without
17// modification, are permitted provided that the following conditions are met:
18//
19// 1. Redistributions of source code must retain the above copyright notice, this
20// list of conditions and the following disclaimer.
21//
22// 2. Redistributions in binary form must reproduce the above copyright notice,
23// this list of conditions and the following disclaimer in the documentation
24// and/or other materials provided with the distribution.
25//
26// 3. Neither the name of the copyright holder nor the names of its
27// contributors may be used to endorse or promote products derived from
28// this software without specific prior written permission.
29//
30// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
31// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
33// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
34// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
36// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
37// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
38// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
39// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40
41#![cfg_attr(
42 feature = "cargo-clippy",
43 allow(doc_markdown, inline_always, new_ret_no_self)
44)]
45
46//! encoding_rs is a Gecko-oriented Free Software / Open Source implementation
47//! of the [Encoding Standard](https://encoding.spec.whatwg.org/) in Rust.
48//! Gecko-oriented means that converting to and from UTF-16 is supported in
49//! addition to converting to and from UTF-8, that the performance and
50//! streamability goals are browser-oriented, and that FFI-friendliness is a
51//! goal.
52//!
53//! Additionally, the `mem` module provides functions that are useful for
54//! applications that need to be able to deal with legacy in-memory
55//! representations of Unicode.
56//!
57//! For expectation setting, please be sure to read the sections
58//! [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes),
59//! [_ISO-8859-1_](#iso-8859-1) and [_Web / Browser Focus_](#web--browser-focus) below.
60//!
61//! There is a [long-form write-up](https://hsivonen.fi/encoding_rs/) about the
62//! design and internals of the crate.
63//!
64//! # Availability
65//!
66//! The code is available under the
67//! [Apache license, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)
68//! or the [MIT license](https://opensource.org/licenses/MIT), at your option.
69//! See the
70//! [`COPYRIGHT`](https://github.com/hsivonen/encoding_rs/blob/master/COPYRIGHT)
71//! file for details.
72//! The [repository is on GitHub](https://github.com/hsivonen/encoding_rs). The
73//! [crate is available on crates.io](https://crates.io/crates/encoding_rs).
74//!
75//! # Integration with `std::io`
76//!
77//! This crate doesn't implement traits from `std::io`. However, the case of
78//! wrapping a `std::io::Read` in a decoder that implements `std::io::Read` and
79//! presents the data from the wrapped `std::io::Read` as UTF-8 is addressed by
80//! the [`encoding_rs_io`](https://docs.rs/encoding_rs_io/) crate.
81//!
82//! # Examples
83//!
84//! Example programs:
85//!
86//! * [Rust](https://github.com/hsivonen/recode_rs)
87//! * [C](https://github.com/hsivonen/recode_c)
88//! * [C++](https://github.com/hsivonen/recode_cpp)
89//!
90//! Decode using the non-streaming API:
91//!
92//! ```
93//! #[cfg(feature = "alloc")] {
94//! use encoding_rs::*;
95//!
96//! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
97//! let bytes = b"\x83n\x83\x8D\x81[\x81E\x83\x8F\x81[\x83\x8B\x83h";
98//!
99//! let (cow, encoding_used, had_errors) = SHIFT_JIS.decode(bytes);
100//! assert_eq!(&cow[..], expectation);
101//! assert_eq!(encoding_used, SHIFT_JIS);
102//! assert!(!had_errors);
103//! }
104//! ```
105//!
106//! Decode using the streaming API with minimal `unsafe`:
107//!
108//! ```
109//! use encoding_rs::*;
110//!
111//! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
112//!
113//! // Use an array of byte slices to demonstrate content arriving piece by
114//! // piece from the network.
115//! let bytes: [&'static [u8]; 4] = [b"\x83",
116//! b"n\x83\x8D\x81",
117//! b"[\x81E\x83\x8F\x81[\x83",
118//! b"\x8B\x83h"];
119//!
120//! // Very short output buffer to demonstrate the output buffer getting full.
121//! // Normally, you'd use something like `[0u8; 2048]`.
122//! let mut buffer_bytes = [0u8; 8];
123//! let mut buffer: &mut str = std::str::from_utf8_mut(&mut buffer_bytes[..]).unwrap();
124//!
125//! // How many bytes in the buffer currently hold significant data.
126//! let mut bytes_in_buffer = 0usize;
127//!
128//! // Collect the output to a string for demonstration purposes.
129//! let mut output = String::new();
130//!
131//! // The `Decoder`
132//! let mut decoder = SHIFT_JIS.new_decoder();
133//!
134//! // Track whether we see errors.
135//! let mut total_had_errors = false;
136//!
137//! // Decode using a fixed-size intermediate buffer (for demonstrating the
138//! // use of a fixed-size buffer; normally when the output of an incremental
139//! // decode goes to a `String` one would use `Decoder.decode_to_string()` to
140//! // avoid the intermediate buffer).
141//! for input in &bytes[..] {
142//! // The number of bytes already read from current `input` in total.
143//! let mut total_read_from_current_input = 0usize;
144//!
145//! loop {
146//! let (result, read, written, had_errors) =
147//! decoder.decode_to_str(&input[total_read_from_current_input..],
148//! &mut buffer[bytes_in_buffer..],
149//! false);
150//! total_read_from_current_input += read;
151//! bytes_in_buffer += written;
152//! total_had_errors |= had_errors;
153//! match result {
154//! CoderResult::InputEmpty => {
155//! // We have consumed the current input buffer. Break out of
156//! // the inner loop to get the next input buffer from the
157//! // outer loop.
158//! break;
159//! },
160//! CoderResult::OutputFull => {
161//! // Write the current buffer out and consider the buffer
162//! // empty.
163//! output.push_str(&buffer[..bytes_in_buffer]);
164//! bytes_in_buffer = 0usize;
165//! continue;
166//! }
167//! }
168//! }
169//! }
170//!
171//! // Process EOF
172//! loop {
173//! let (result, _, written, had_errors) =
174//! decoder.decode_to_str(b"",
175//! &mut buffer[bytes_in_buffer..],
176//! true);
177//! bytes_in_buffer += written;
178//! total_had_errors |= had_errors;
179//! // Write the current buffer out and consider the buffer empty.
180//! // Need to do this here for both `match` arms, because we exit the
181//! // loop on `CoderResult::InputEmpty`.
182//! output.push_str(&buffer[..bytes_in_buffer]);
183//! bytes_in_buffer = 0usize;
184//! match result {
185//! CoderResult::InputEmpty => {
186//! // Done!
187//! break;
188//! },
189//! CoderResult::OutputFull => {
190//! continue;
191//! }
192//! }
193//! }
194//!
195//! assert_eq!(&output[..], expectation);
196//! assert!(!total_had_errors);
197//! ```
198//!
199//! ## UTF-16LE, UTF-16BE and Unicode Encoding Schemes
200//!
201//! The Encoding Standard doesn't specify encoders for UTF-16LE and UTF-16BE,
202//! __so this crate does not provide encoders for those encodings__!
203//! Along with the replacement encoding, their _output encoding_ (i.e. the
204//! encoding used for form submission and error handling in the query string
205//! of URLs) is UTF-8, so you get a UTF-8 encoder if you request an encoder
206//! for them.
207//!
208//! Additionally, the Encoding Standard factors BOM handling into wrapper
209//! algorithms so that BOM handling isn't part of the definition of the
210//! encodings themselves. The Unicode _encoding schemes_ in the Unicode
211//! Standard define BOM handling or lack thereof as part of the encoding
212//! scheme.
213//!
214//! When used with the `_without_bom_handling` entry points, the UTF-16LE
215//! and UTF-16BE _encodings_ match the same-named _encoding schemes_ from
216//! the Unicode Standard.
217//!
218//! When used with the `_with_bom_removal` entry points, the UTF-8
219//! _encoding_ matches the UTF-8 _encoding scheme_ from the Unicode
220//! Standard.
221//!
222//! This crate does not provide a mode that matches the UTF-16 _encoding
223//! scheme_ from the Unicode Standard. The UTF-16BE encoding used with
224//! the entry points without `_bom_` qualifiers is the closest match,
225//! but in that case, the UTF-8 BOM triggers UTF-8 decoding, which is
226//! not part of the behavior of the UTF-16 _encoding scheme_ per the
227//! Unicode Standard.
228//!
229//! The UTF-32 family of Unicode encoding schemes is not supported
230//! by this crate. The Encoding Standard doesn't define any UTF-32
231//! family encodings, since they aren't necessary for consuming Web
232//! content.
233//!
234//! While gb18030 is capable of representing U+FEFF, the Encoding
235//! Standard does not treat the gb18030 byte representation of U+FEFF
236//! as a BOM, so neither does this crate.
237//!
238//! ## ISO-8859-1
239//!
240//! ISO-8859-1 does not exist as a distinct encoding from windows-1252 in
241//! the Encoding Standard. Therefore, an encoding that maps the unsigned
242//! byte value to the same Unicode scalar value is not available via
243//! `Encoding` in this crate.
244//!
245//! However, the functions whose name starts with `convert` and contains
246//! `latin1` in the `mem` module support such conversions, which are known as
247//! [_isomorphic decode_](https://infra.spec.whatwg.org/#isomorphic-decode)
248//! and [_isomorphic encode_](https://infra.spec.whatwg.org/#isomorphic-encode)
249//! in the [Infra Standard](https://infra.spec.whatwg.org/).
250//!
251//! ## Web / Browser Focus
252//!
253//! Both in terms of scope and performance, the focus is on the Web. For scope,
254//! this means that encoding_rs implements the Encoding Standard fully and
255//! doesn't implement encodings that are not specified in the Encoding
256//! Standard. For performance, this means that decoding performance is
257//! important as well as performance for encoding into UTF-8 or encoding the
258//! Basic Latin range (ASCII) into legacy encodings. Non-Basic Latin needs to
259//! be encoded into legacy encodings in only two places in the Web platform: in
260//! the query part of URLs, in which case it's a matter of relatively rare
261//! error handling, and in form submission, in which case the user action and
262//! networking tend to hide the performance of the encoder.
263//!
264//! Deemphasizing performance of encoding non-Basic Latin text into legacy
265//! encodings enables smaller code size thanks to the encoder side using the
266//! decode-optimized data tables without having encode-optimized data tables at
267//! all. Even in decoders, smaller lookup table size is preferred over avoiding
268//! multiplication operations.
269//!
270//! Additionally, performance is a non-goal for the ASCII-incompatible
271//! ISO-2022-JP encoding, which is rarely used on the Web. Instead of
272//! performance, the decoder for ISO-2022-JP optimizes for ease/clarity
273//! of implementation.
274//!
275//! Despite the browser focus, the hope is that non-browser applications
276//! that wish to consume Web content or submit Web forms in a Web-compatible
277//! way will find encoding_rs useful. While encoding_rs does not try to match
278//! Windows behavior, many of the encodings are close enough to legacy
279//! encodings implemented by Windows that applications that need to consume
280//! data in legacy Windows encodings may find encoding_rs useful. The
281//! [codepage](https://crates.io/crates/codepage) crate maps from Windows
282//! code page identifiers onto encoding_rs `Encoding`s and vice versa.
283//!
284//! For decoding email, UTF-7 support is needed (unfortunately) in addition
285//! to the encodings defined in the Encoding Standard. The
286//! [charset](https://crates.io/crates/charset) crate wraps encoding_rs and adds
287//! UTF-7 decoding for email purposes.
288//!
289//! For single-byte DOS encodings beyond the ones supported by the Encoding
290//! Standard, there is the [`oem_cp`](https://crates.io/crates/oem_cp) crate.
291//!
292//! # Preparing Text for the Encoders
293//!
294//! Normalizing text into Unicode Normalization Form C prior to encoding text
295//! into a legacy encoding minimizes unmappable characters. Text can be
296//! normalized to Unicode Normalization Form C using the
297//! [`icu_normalizer`](https://crates.io/crates/icu_normalizer) crate, which
298//! is part of [ICU4X](https://icu4x.unicode.org/).
299//!
300//! The exception is windows-1258, which after normalizing to Unicode
301//! Normalization Form C requires tone marks to be decomposed in order to
302//! minimize unmappable characters. Vietnamese tone marks can be decomposed
303//! using the [`detone`](https://crates.io/crates/detone) crate.
304//!
305//! # Streaming & Non-Streaming; Rust & C/C++
306//!
307//! The API in Rust has two modes of operation: streaming and non-streaming.
308//! The streaming API is the foundation of the implementation and should be
309//! used when processing data that arrives piecemeal from an i/o stream. The
310//! streaming API has an FFI wrapper (as a [separate crate][1]) that exposes it
311//! to C callers. The non-streaming part of the API is for Rust callers only and
312//! is smart about borrowing instead of copying when possible. When
313//! streamability is not needed, the non-streaming API should be preferred in
314//! order to avoid copying data when a borrow suffices.
315//!
316//! There is no analogous C API exposed via FFI, mainly because C doesn't have
317//! standard types for growable byte buffers and Unicode strings that know
318//! their length.
319//!
320//! The C API (header file generated at `target/include/encoding_rs.h` when
321//! building encoding_rs) can, in turn, be wrapped for use from C++. Such a
322//! C++ wrapper can re-create the non-streaming API in C++ for C++ callers.
323//! The C binding comes with a [C++17 wrapper][2] that uses standard library +
324//! [GSL][3] types and that recreates the non-streaming API in C++ on top of
325//! the streaming API. A C++ wrapper with XPCOM/MFBT types is available as
326//! [`mozilla::Encoding`][4].
327//!
328//! The `Encoding` type is common to both the streaming and non-streaming
329//! modes. In the streaming mode, decoding operations are performed with a
330//! `Decoder` and encoding operations with an `Encoder` object obtained via
331//! `Encoding`. In the non-streaming mode, decoding and encoding operations are
332//! performed using methods on `Encoding` objects themselves, so the `Decoder`
333//! and `Encoder` objects are not used at all.
334//!
335//! [1]: https://github.com/hsivonen/encoding_c
336//! [2]: https://github.com/hsivonen/encoding_c/blob/master/include/encoding_rs_cpp.h
337//! [3]: https://github.com/Microsoft/GSL/
338//! [4]: https://searchfox.org/mozilla-central/source/intl/Encoding.h
339//!
340//! # Memory management
341//!
342//! The streaming mode never performs heap allocations (even the methods
343//! that write into a `Vec<u8>` or a `String` by taking them as arguments do
344//! not reallocate the backing buffer of the `Vec<u8>` or the `String`). That
345//! is, the streaming mode uses caller-allocated buffers exclusively.
346//!
347//! The methods of the non-streaming mode that return a `Vec<u8>` or a `String`
348//! perform heap allocations but only to allocate the backing buffer of the
349//! `Vec<u8>` or the `String`.
350//!
351//! `Encoding` is always statically allocated. `Decoder` and `Encoder` need no
352//! `Drop` cleanup.
353//!
354//! # Buffer reading and writing behavior
355//!
356//! Based on experience gained with the `java.nio.charset` encoding converter
357//! API and with the Gecko uconv encoding converter API, the buffer reading
358//! and writing behaviors of encoding_rs are asymmetric: input buffers are
359//! fully drained but output buffers are not always fully filled.
360//!
361//! When reading from an input buffer, encoding_rs always consumes all input
362//! up to the next error or to the end of the buffer. In particular, when
363//! decoding, even if the input buffer ends in the middle of a byte sequence
364//! for a character, the decoder consumes all input. This has the benefit that
365//! the caller of the API can always fill the next buffer from the start from
366//! whatever source the bytes come from and never has to first copy the last
367//! bytes of the previous buffer to the start of the next buffer. However, when
368//! encoding, the UTF-8 input buffers have to end at a character boundary, which
369//! is a requirement for the Rust `str` type anyway, and UTF-16 input buffer
370//! boundaries falling in the middle of a surrogate pair result in both
371//! surrogates being treated individually as unpaired surrogates.
372//!
373//! Additionally, decoders guarantee that they can be fed even one byte at a
374//! time and encoders guarantee that they can be fed even one code point at a
375//! time. This has the benefit of not placing restrictions on the size of
376//! the chunks in which the content arrives, e.g. from the network.
377//!
378//! When writing into an output buffer, encoding_rs makes sure that the code
379//! unit sequence for a character is never split across output buffer
380//! boundaries. This may result in wasted space at the end of an output buffer,
381//! but the advantages are that the output side of both decoders and encoders
382//! is greatly simplified compared to designs that attempt to fill output
383//! buffers exactly even when that entails splitting a code unit sequence and
384//! when encoding_rs methods return to the caller, the output produced thus
385//! far is always valid taken as a whole. (In the case of encoding to ISO-2022-JP,
386//! the output needs to be considered as a whole, because the latest output
387//! buffer might not be valid taken alone if the transition away
388//! from the ASCII state occurred in an earlier output buffer. However, since
389//! the ISO-2022-JP decoder doesn't treat streams that don't end in the ASCII
390//! state as being in error despite the encoder generating a transition to the
391//! ASCII state at the end, the claim about the partial output taken as a whole
392//! being valid is true even for ISO-2022-JP.)
393//!
394//! # Error Reporting
395//!
396//! Based on experience gained with the `java.nio.charset` encoding converter
397//! API and with the Gecko uconv encoding converter API, the error reporting
398//! behaviors of encoding_rs are asymmetric: decoder errors include offsets
399//! that leave it up to the caller to extract the erroneous bytes from the
400//! input stream if the caller wishes to do so but encoder errors provide the
401//! code point associated with the error without requiring the caller to
402//! extract it from the input on its own.
403//!
404//! On the encoder side, an error is always triggered by the most recently
405//! pushed Unicode scalar, which makes it simple to pass the `char` to the
406//! caller. Also, it's very typical for the caller to wish to do something with
407//! this data: generate a numeric escape for the character. Additionally, the
408//! ISO-2022-JP encoder reports U+FFFD instead of the actual input character in
409//! certain cases, so requiring the caller to extract the character from the
410//! input buffer would require the caller to handle ISO-2022-JP details.
411//! Furthermore, requiring the caller to extract the character from the input
412//! buffer would require the caller to implement UTF-8 or UTF-16 math, which is
413//! the job of an encoding conversion library.
414//!
415//! On the decoder side, errors are triggered in more complex ways. For
416//! example, when decoding the sequence ESC, '$', _buffer boundary_, 'A' as
417//! ISO-2022-JP, the ESC byte is in error, but this is discovered only after
418//! the buffer boundary when processing 'A'. Thus, the bytes in error might not
419//! be the ones most recently pushed to the decoder and the error might not even
420//! be in the current buffer.
421//!
422//! Some encoding conversion APIs address the problem by not acknowledging
423//! trailing bytes of an input buffer as consumed if it's still possible for
424//! future bytes to cause the trailing bytes to be in error. This way, error
425//! reporting can always refer to the most recently pushed buffer. This has the
426//! problem that the caller of the API has to copy the unconsumed trailing
427//! bytes to the start of the next buffer before being able to fill the rest
428//! of the next buffer. This is annoying, error-prone and inefficient.
429//!
430//! A possible solution would be making the decoder remember recently consumed
431//! bytes in order to be able to include a copy of the erroneous bytes when
432//! reporting an error. This has two problems: First, callers are rarely
433//! interested in the erroneous bytes, so attempts to identify them are most
434//! often just overhead anyway. Second, the rare applications that are
435//! interested typically care about the location of the error in the input
436//! stream.
437//!
438//! To keep the API convenient for common uses and the overhead low while making
439//! it possible to develop applications, such as HTML validators, that care
440//! about which bytes were in error, encoding_rs reports the length of the
441//! erroneous sequence and the number of bytes consumed after the erroneous
442//! sequence. As long as the caller doesn't discard the 6 most recent bytes,
443//! this makes it possible for callers that care about the erroneous bytes to
444//! locate them.
445//!
446//! # No Convenience API for Custom Replacements
447//!
448//! The Web Platform and, therefore, the Encoding Standard supports only one
449//! error recovery mode for decoders and only one error recovery mode for
450//! encoders. The supported error recovery mode for decoders is emitting the
451//! REPLACEMENT CHARACTER on error. The supported error recovery mode for
452//! encoders is emitting an HTML decimal numeric character reference for
453//! unmappable characters.
454//!
455//! Since encoding_rs is Web-focused, these are the only error recovery modes
456//! for which convenient support is provided. Moreover, on the decoder side,
457//! there aren't really good alternatives for emitting the REPLACEMENT CHARACTER
458//! on error (other than treating errors as fatal). In particular, simply
459//! ignoring errors is a
460//! [security problem](http://www.unicode.org/reports/tr36/#Substituting_for_Ill_Formed_Subsequences),
461//! so it would be a bad idea for encoding_rs to provide a mode that encouraged
462//! callers to ignore errors.
463//!
464//! On the encoder side, there are plausible alternatives for HTML decimal
465//! numeric character references. For example, when outputting CSS, CSS-style
466//! escapes would seem to make sense. However, instead of facilitating the
467//! output of CSS, JS, etc. in non-UTF-8 encodings, encoding_rs takes the design
468//! position that you shouldn't generate output in encodings other than UTF-8,
469//! except where backward compatibility with interacting with the legacy Web
470//! requires it. The legacy Web requires it only when parsing the query strings
471//! of URLs and when submitting forms, and those two both use HTML decimal
472//! numeric character references.
473//!
474//! While encoding_rs doesn't make encoder replacements other than HTML decimal
475//! numeric character references easy, it does make them _possible_.
476//! `encode_from_utf8()`, which emits HTML decimal numeric character references
477//! for unmappable characters, is implemented on top of
478//! `encode_from_utf8_without_replacement()`. Applications that really, really
479//! want other replacement schemes for unmappable characters can likewise
480//! implement them on top of `encode_from_utf8_without_replacement()`.
481//!
482//! # No Extensibility by Design
483//!
484//! The set of encodings supported by encoding_rs is not extensible by design.
485//! That is, `Encoding`, `Decoder` and `Encoder` are intentionally `struct`s
486//! rather than `trait`s. encoding_rs takes the design position that all future
487//! text interchange should be done using UTF-8, which can represent all of
488//! Unicode. (It is, in fact, the only encoding supported by the Encoding
489//! Standard and encoding_rs that can represent all of Unicode and that has
490//! encoder support. UTF-16LE and UTF-16BE don't have encoder support, and
491//! gb18030 cannot encode U+E5E5.) The other encodings are supported merely for
492//! legacy compatibility and not due to non-UTF-8 encodings having benefits
493//! other than being able to consume legacy content.
494//!
495//! Considering that UTF-8 can represent all of Unicode and is already supported
496//! by all Web browsers, introducing a new encoding wouldn't add to the
497//! expressiveness but would add to compatibility problems. In that sense,
498//! adding new encodings to the Web Platform doesn't make sense, and, in fact,
499//! post-UTF-8 attempts at encodings, such as BOCU-1, have been rejected from
500//! the Web Platform. On the other hand, the set of legacy encodings that must
501//! be supported for a Web browser to be able to be successful is not going to
502//! expand. Empirically, the set of encodings specified in the Encoding Standard
503//! is already sufficient and the set of legacy encodings won't grow
504//! retroactively.
505//!
506//! Since extensibility doesn't make sense considering the Web focus of
507//! encoding_rs and adding encodings to Web clients would be actively harmful,
508//! it makes sense to make the set of encodings that encoding_rs supports
509//! non-extensible and to take the (admittedly small) benefits arising from
510//! that, such as the size of `Decoder` and `Encoder` objects being known ahead
511//! of time, which enables stack allocation thereof.
512//!
513//! This does have downsides for applications that might want to put encoding_rs
514//! to non-Web uses if those non-Web uses involve legacy encodings that aren't
515//! needed for Web uses. The needs of such applications should not complicate
516//! encoding_rs itself, though. It is up to those applications to provide a
517//! framework that delegates the operations with encodings that encoding_rs
518//! supports to encoding_rs and operations with other encodings to something
519//! else (as opposed to encoding_rs itself providing an extensibility
520//! framework).
521//!
522//! # Panics
523//!
524//! Methods in encoding_rs can panic if the API is used against the requirements
525//! stated in the documentation, if a state that's supposed to be impossible
526//! is reached due to an internal bug or on integer overflow. When used
527//! according to documentation with buffer sizes that stay below integer
528//! overflow, in the absence of internal bugs, encoding_rs does not panic.
529//!
530//! Panics arising from API misuse aren't documented beyond this on individual
531//! methods.
532//!
533//! # At-Risk Parts of the API
534//!
535//! The foreseeable source of partially backward-incompatible API change is the
536//! way the instances of `Encoding` are made available.
537//!
538//! If Rust changes to allow the entries of `[&'static Encoding; N]` to be
539//! initialized with `static`s of type `&'static Encoding`, the non-reference
540//! `FOO_INIT` public `Encoding` instances will be removed from the public API.
541//!
542//! If Rust changes to make the referent of `pub const FOO: &'static Encoding`
543//! unique when the constant is used in different crates, the reference-typed
544//! `static`s for the encoding instances will be changed from `static` to
545//! `const` and the non-reference-typed `_INIT` instances will be removed.
546//!
547//! # Mapping Spec Concepts onto the API
548//!
549//! <table>
550//! <thead>
551//! <tr><th>Spec Concept</th><th>Streaming</th><th>Non-Streaming</th></tr>
552//! </thead>
553//! <tbody>
554//! <tr><td><a href="https://encoding.spec.whatwg.org/#encoding">encoding</a></td><td><code>&'static Encoding</code></td><td><code>&'static Encoding</code></td></tr>
555//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8">UTF-8 encoding</a></td><td><code>UTF_8</code></td><td><code>UTF_8</code></td></tr>
556//! <tr><td><a href="https://encoding.spec.whatwg.org/#concept-encoding-get">get an encoding</a></td><td><code>Encoding::for_label(<var>label</var>)</code></td><td><code>Encoding::for_label(<var>label</var>)</code></td></tr>
557//! <tr><td><a href="https://encoding.spec.whatwg.org/#name">name</a></td><td><code><var>encoding</var>.name()</code></td><td><code><var>encoding</var>.name()</code></td></tr>
558//! <tr><td><a href="https://encoding.spec.whatwg.org/#get-an-output-encoding">get an output encoding</a></td><td><code><var>encoding</var>.output_encoding()</code></td><td><code><var>encoding</var>.output_encoding()</code></td></tr>
559//! <tr><td><a href="https://encoding.spec.whatwg.org/#decode">decode</a></td><td><code>let d = <var>encoding</var>.new_decoder();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.decode(<var>src</var>)</code></td></tr>
560//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode">UTF-8 decode</a></td><td><code>let d = UTF_8.new_decoder_with_bom_removal();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_with_bom_removal(<var>src</var>)</code></td></tr>
561//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom">UTF-8 decode without BOM</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_without_bom_handling(<var>src</var>)</code></td></tr>
562//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail">UTF-8 decode without BOM or fail</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, false);<br>// … (fail if malformed)<br>let last_res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, true);<br>// (fail if malformed)</code></td><td><code>UTF_8.decode_without_bom_handling_and_without_replacement(<var>src</var>)</code></td></tr>
563//! <tr><td><a href="https://encoding.spec.whatwg.org/#encode">encode</a></td><td><code>let e = <var>encoding</var>.new_encoder();<br>let res = e.encode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// …<br>let last_res = e.encode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.encode(<var>src</var>)</code></td></tr>
564//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-encode">UTF-8 encode</a></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// …</code></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>src</var>.as_bytes()</code></td></tr>
565//! </tbody>
566//! </table>
567//!
568//! # Compatibility with the rust-encoding API
569//!
570//! The crate
571//! [encoding_rs_compat](https://github.com/hsivonen/encoding_rs_compat/)
572//! is a drop-in replacement for rust-encoding 0.2.32 that implements (most of)
573//! the API of rust-encoding 0.2.32 on top of encoding_rs.
574//!
575//! # Mapping rust-encoding concepts to encoding_rs concepts
576//!
577//! The following table provides a mapping from rust-encoding constructs to
578//! encoding_rs ones.
579//!
580//! <table>
581//! <thead>
582//! <tr><th>rust-encoding</th><th>encoding_rs</th></tr>
583//! </thead>
584//! <tbody>
585//! <tr><td><code>encoding::EncodingRef</code></td><td><code>&'static encoding_rs::Encoding</code></td></tr>
586//! <tr><td><code>encoding::all::<var>WINDOWS_31J</var></code> (not based on the WHATWG name for some encodings)</td><td><code>encoding_rs::<var>SHIFT_JIS</var></code> (always the WHATWG name uppercased and hyphens replaced with underscores)</td></tr>
587//! <tr><td><code>encoding::all::ERROR</code></td><td>Not available because not in the Encoding Standard</td></tr>
588//! <tr><td><code>encoding::all::ASCII</code></td><td>Not available because not in the Encoding Standard</td></tr>
589//! <tr><td><code>encoding::all::ISO_8859_1</code></td><td>Not available because not in the Encoding Standard</td></tr>
590//! <tr><td><code>encoding::all::HZ</code></td><td>Not available because not in the Encoding Standard</td></tr>
591//! <tr><td><code>encoding::label::encoding_from_whatwg_label(<var>string</var>)</code></td><td><code>encoding_rs::Encoding::for_label(<var>string</var>)</code></td></tr>
592//! <tr><td><code><var>enc</var>.whatwg_name()</code> (always lower case)</td><td><code><var>enc</var>.name()</code> (potentially mixed case)</td></tr>
593//! <tr><td><code><var>enc</var>.name()</code></td><td>Not available because not in the Encoding Standard</td></tr>
594//! <tr><td><code>encoding::decode(<var>bytes</var>, encoding::DecoderTrap::Replace, <var>enc</var>)</code></td><td><code><var>enc</var>.decode(<var>bytes</var>)</code></td></tr>
595//! <tr><td><code><var>enc</var>.decode(<var>bytes</var>, encoding::DecoderTrap::Replace)</code></td><td><code><var>enc</var>.decode_without_bom_handling(<var>bytes</var>)</code></td></tr>
596//! <tr><td><code><var>enc</var>.encode(<var>string</var>, encoding::EncoderTrap::NcrEscape)</code></td><td><code><var>enc</var>.encode(<var>string</var>)</code></td></tr>
597//! <tr><td><code><var>enc</var>.raw_decoder()</code></td><td><code><var>enc</var>.new_decoder_without_bom_handling()</code></td></tr>
598//! <tr><td><code><var>enc</var>.raw_encoder()</code></td><td><code><var>enc</var>.new_encoder()</code></td></tr>
599//! <tr><td><code>encoding::RawDecoder</code></td><td><code>encoding_rs::Decoder</code></td></tr>
600//! <tr><td><code>encoding::RawEncoder</code></td><td><code>encoding_rs::Encoder</code></td></tr>
601//! <tr><td><code><var>raw_decoder</var>.raw_feed(<var>src</var>, <var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(<var>src</var>.len()));<br><var>decoder</var>.decode_to_string_without_replacement(<var>src</var>, <var>dst_string</var>, false)</code></td></tr>
602//! <tr><td><code><var>raw_encoder</var>.raw_feed(<var>src</var>, <var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(<var>src</var>.len()));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement(<var>src</var>, <var>dst_vec</var>, false)</code></td></tr>
//! <tr><td><code><var>raw_decoder</var>.raw_finish(<var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(0));<br><var>decoder</var>.decode_to_string_without_replacement(b"", <var>dst_string</var>, true)</code></td></tr>
//! <tr><td><code><var>raw_encoder</var>.raw_finish(<var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(0));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement("", <var>dst_vec</var>, true)</code></td></tr>
//! <tr><td><code>encoding::DecoderTrap::Strict</code></td><td><code>decode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Malformed</code> result as fatal).</td></tr>
606//! <tr><td><code>encoding::DecoderTrap::Replace</code></td><td><code>decode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
607//! <tr><td><code>encoding::DecoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
608//! <tr><td><code>encoding::DecoderTrap::Call(DecoderTrapFunc)</code></td><td>Can be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
//! <tr><td><code>encoding::EncoderTrap::Strict</code></td><td><code>encode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Unmappable</code> result as fatal).</td></tr>
610//! <tr><td><code>encoding::EncoderTrap::Replace</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
611//! <tr><td><code>encoding::EncoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
612//! <tr><td><code>encoding::EncoderTrap::NcrEscape</code></td><td><code>encode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
613//! <tr><td><code>encoding::EncoderTrap::Call(EncoderTrapFunc)</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
614//! </tbody>
615//! </table>
616//!
617//! # Relationship with Windows Code Pages
618//!
//! Despite the Web and browser focus, the encodings defined by the Encoding
//! Standard and implemented by this crate may be useful for decoding legacy
//! data that uses Windows code pages. The following table names the
//! encodings that have a closely related Windows code page, the number of
//! the closest code page, a column indicating whether Windows maps unassigned
//! code points to the Unicode Private Use Area instead of U+FFFD and a remark
//! number indicating remarks in the list after the table.
627//!
628//! <table>
629//! <thead>
630//! <tr><th>Encoding</th><th>Code Page</th><th>PUA</th><th>Remarks</th></tr>
631//! </thead>
632//! <tbody>
633//! <tr><td>Shift_JIS</td><td>932</td><td></td><td></td></tr>
634//! <tr><td>GBK</td><td>936</td><td></td><td></td></tr>
635//! <tr><td>EUC-KR</td><td>949</td><td></td><td></td></tr>
636//! <tr><td>Big5</td><td>950</td><td></td><td></td></tr>
637//! <tr><td>IBM866</td><td>866</td><td></td><td></td></tr>
638//! <tr><td>windows-874</td><td>874</td><td>•</td><td></td></tr>
639//! <tr><td>UTF-16LE</td><td>1200</td><td></td><td></td></tr>
640//! <tr><td>UTF-16BE</td><td>1201</td><td></td><td></td></tr>
641//! <tr><td>windows-1250</td><td>1250</td><td></td><td></td></tr>
642//! <tr><td>windows-1251</td><td>1251</td><td></td><td></td></tr>
643//! <tr><td>windows-1252</td><td>1252</td><td></td><td></td></tr>
644//! <tr><td>windows-1253</td><td>1253</td><td>•</td><td></td></tr>
645//! <tr><td>windows-1254</td><td>1254</td><td></td><td></td></tr>
646//! <tr><td>windows-1255</td><td>1255</td><td>•</td><td></td></tr>
647//! <tr><td>windows-1256</td><td>1256</td><td></td><td></td></tr>
648//! <tr><td>windows-1257</td><td>1257</td><td>•</td><td></td></tr>
649//! <tr><td>windows-1258</td><td>1258</td><td></td><td></td></tr>
650//! <tr><td>macintosh</td><td>10000</td><td></td><td>1</td></tr>
651//! <tr><td>x-mac-cyrillic</td><td>10017</td><td></td><td>2</td></tr>
652//! <tr><td>KOI8-R</td><td>20866</td><td></td><td></td></tr>
653//! <tr><td>EUC-JP</td><td>20932</td><td></td><td></td></tr>
654//! <tr><td>KOI8-U</td><td>21866</td><td></td><td></td></tr>
655//! <tr><td>ISO-8859-2</td><td>28592</td><td></td><td></td></tr>
656//! <tr><td>ISO-8859-3</td><td>28593</td><td></td><td></td></tr>
657//! <tr><td>ISO-8859-4</td><td>28594</td><td></td><td></td></tr>
658//! <tr><td>ISO-8859-5</td><td>28595</td><td></td><td></td></tr>
659//! <tr><td>ISO-8859-6</td><td>28596</td><td>•</td><td></td></tr>
660//! <tr><td>ISO-8859-7</td><td>28597</td><td>•</td><td>3</td></tr>
661//! <tr><td>ISO-8859-8</td><td>28598</td><td>•</td><td>4</td></tr>
662//! <tr><td>ISO-8859-13</td><td>28603</td><td>•</td><td></td></tr>
663//! <tr><td>ISO-8859-15</td><td>28605</td><td></td><td></td></tr>
664//! <tr><td>ISO-8859-8-I</td><td>38598</td><td></td><td>5</td></tr>
665//! <tr><td>ISO-2022-JP</td><td>50220</td><td></td><td></td></tr>
666//! <tr><td>gb18030</td><td>54936</td><td></td><td></td></tr>
667//! <tr><td>UTF-8</td><td>65001</td><td></td><td></td></tr>
668//! </tbody>
669//! </table>
670//!
671//! 1. Windows decodes 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
672//! 2. Windows decodes 0xFF to U+00A4 CURRENCY SIGN instead of U+20AC EURO SIGN.
673//! 3. Windows decodes the currency signs at 0xA4 and 0xA5 as well as 0xAA,
674//! which should be U+037A GREEK YPOGEGRAMMENI, to PUA code points. Windows
675//! decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA instead of U+2018
676//! LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER LETTER APOSTROPHE
677//! instead of U+2019 RIGHT SINGLE QUOTATION MARK.
678//! 4. Windows decodes 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to PUA instead
679//! of LRM and RLM.
680//! 5. Remarks from the previous item apply.
681//!
682//! The differences between this crate and Windows in the case of multibyte encodings
683//! are not yet fully documented here. The lack of remarks above should not be taken
684//! as indication of lack of differences.
685//!
686//! # Notable Differences from IANA Naming
687//!
688//! In some cases, the Encoding Standard specifies the popular unextended encoding
689//! name where in IANA terms one of the other labels would be more precise considering
690//! the extensions that the Encoding Standard has unified into the encoding.
691//!
692//! <table>
693//! <thead>
694//! <tr><th>Encoding</th><th>IANA</th></tr>
695//! </thead>
696//! <tbody>
697//! <tr><td>Big5</td><td>Big5-HKSCS</td></tr>
698//! <tr><td>EUC-KR</td><td>windows-949</td></tr>
699//! <tr><td>Shift_JIS</td><td>windows-31j</td></tr>
700//! <tr><td>x-mac-cyrillic</td><td>x-mac-ukrainian</td></tr>
701//! </tbody>
702//! </table>
703//!
704//! In other cases where the Encoding Standard unifies unextended and extended
705//! variants of an encoding, the encoding gets the name of the extended
706//! variant.
707//!
708//! <table>
709//! <thead>
710//! <tr><th>IANA</th><th>Unified into Encoding</th></tr>
711//! </thead>
712//! <tbody>
713//! <tr><td>ISO-8859-1</td><td>windows-1252</td></tr>
714//! <tr><td>ISO-8859-9</td><td>windows-1254</td></tr>
715//! <tr><td>TIS-620</td><td>windows-874</td></tr>
716//! </tbody>
717//! </table>
718//!
719//! See the section [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes)
720//! for discussion about the UTF-16 family.
721
722#![no_std]
723#![cfg_attr(feature = "simd-accel", feature(core_intrinsics, portable_simd))]
724
725#[cfg(feature = "alloc")]
726#[cfg_attr(test, macro_use)]
727extern crate alloc;
728
729extern crate core;
730#[macro_use]
731extern crate cfg_if;
732
733#[cfg(feature = "serde")]
734extern crate serde;
735
736#[cfg(all(test, feature = "serde"))]
737extern crate bincode;
738#[cfg(all(test, feature = "serde"))]
739#[macro_use]
740extern crate serde_derive;
741#[cfg(all(test, feature = "serde"))]
742extern crate serde_json;
743
744#[macro_use]
745mod macros;
746
747#[cfg(all(
748 feature = "simd-accel",
749 any(
750 target_feature = "sse2",
751 all(target_endian = "little", target_arch = "aarch64"),
752 all(target_endian = "little", target_feature = "neon")
753 )
754))]
755mod simd_funcs;
756
757#[cfg(all(test, feature = "alloc"))]
758mod testing;
759
760mod big5;
761mod euc_jp;
762mod euc_kr;
763mod gb18030;
764mod gb18030_2022;
765mod iso_2022_jp;
766mod replacement;
767mod shift_jis;
768mod single_byte;
769mod utf_16;
770mod utf_8;
771mod x_user_defined;
772
773mod ascii;
774mod data;
775mod handles;
776mod variant;
777
778pub mod mem;
779
780use crate::ascii::ascii_valid_up_to;
781use crate::ascii::iso_2022_jp_ascii_valid_up_to;
782use crate::utf_8::utf8_valid_up_to;
783use crate::variant::*;
784
785#[cfg(feature = "alloc")]
786use alloc::borrow::Cow;
787#[cfg(feature = "alloc")]
788use alloc::string::String;
789#[cfg(feature = "alloc")]
790use alloc::vec::Vec;
791use core::cmp::Ordering;
792use core::hash::Hash;
793use core::hash::Hasher;
794
795#[cfg(feature = "serde")]
796use serde::de::Visitor;
797#[cfg(feature = "serde")]
798use serde::{Deserialize, Deserializer, Serialize, Serializer};
799
/// Number of bytes reserved for a numeric character reference (NCR).
///
/// This has to be the max length of an NCR instead of max
/// minus one, because we can't rely on getting the minus
/// one from the space reserved for the current unmappable,
/// because the ISO-2022-JP encoder can fill up that space
/// with a state transition escape.
///
/// The longest decimal NCR is the one for U+10FFFF, shown in the
/// trailing comment below; it is 10 bytes long.
const NCR_EXTRA: usize = 10; // &#1114111;
806
807// BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
808// Instead, please regenerate using generate-encoding-data.py
809
/// Length, in bytes, of the longest encoding label; the label in
/// question is given in the trailing comment.
const LONGEST_LABEL_LENGTH: usize = 19; // cseucpkdfmtjapanese
811
812/// The initializer for the [Big5](static.BIG5.html) encoding.
813///
814/// For use only for taking the address of this form when
815/// Rust prohibits the use of the non-`_INIT` form directly,
816/// such as in initializers of other `static`s. If in doubt,
817/// use the corresponding non-`_INIT` reference-typed `static`.
818///
819/// This part of the public API will go away if Rust changes
820/// to make the referent of `pub const FOO: &'static Encoding`
821/// unique cross-crate or if Rust starts allowing static arrays
822/// to be initialized with `pub static FOO: &'static Encoding`
823/// items.
824pub static BIG5_INIT: Encoding = Encoding {
825 name: "Big5",
826 variant: VariantEncoding::Big5,
827};
828
829/// The Big5 encoding.
830///
831/// This is Big5 with HKSCS with mappings to more recent Unicode assignments
832/// instead of the Private Use Area code points that have been used historically.
833/// It is believed to be able to decode existing Web content in a way that makes
834/// sense.
835///
836/// To avoid form submissions generating data that Web servers don't understand,
837/// the encoder doesn't use the HKSCS byte sequences that precede the unextended
838/// Big5 in the lexical order.
839///
840/// [Index visualization](https://encoding.spec.whatwg.org/big5.html),
841/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/big5-bmp.html)
842///
843/// This encoding is designed to be suited for decoding the Windows code page 950
844/// and its HKSCS patched "951" variant such that the text makes sense, given
845/// assignments that Unicode has made after those encodings used Private Use
846/// Area characters.
847///
848/// This will change from `static` to `const` if Rust changes
849/// to make the referent of `pub const FOO: &'static Encoding`
850/// unique cross-crate, so don't take the address of this
851/// `static`.
852pub static BIG5: &'static Encoding = &BIG5_INIT;
853
854/// The initializer for the [EUC-JP](static.EUC_JP.html) encoding.
855///
856/// For use only for taking the address of this form when
857/// Rust prohibits the use of the non-`_INIT` form directly,
858/// such as in initializers of other `static`s. If in doubt,
859/// use the corresponding non-`_INIT` reference-typed `static`.
860///
861/// This part of the public API will go away if Rust changes
862/// to make the referent of `pub const FOO: &'static Encoding`
863/// unique cross-crate or if Rust starts allowing static arrays
864/// to be initialized with `pub static FOO: &'static Encoding`
865/// items.
866pub static EUC_JP_INIT: Encoding = Encoding {
867 name: "EUC-JP",
868 variant: VariantEncoding::EucJp,
869};
870
871/// The EUC-JP encoding.
872///
873/// This is the legacy Unix encoding for Japanese.
874///
875/// For compatibility with Web servers that don't expect three-byte sequences
876/// in form submissions, the encoder doesn't generate three-byte sequences.
877/// That is, the JIS X 0212 support is decode-only.
878///
879/// [Index visualization](https://encoding.spec.whatwg.org/euc-jp.html),
880/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-jp-bmp.html)
881///
882/// This encoding roughly matches the Windows code page 20932. There are error
883/// handling differences and a handful of 2-byte sequences that decode differently.
884/// Additionall, Windows doesn't support 3-byte sequences.
885///
886/// This will change from `static` to `const` if Rust changes
887/// to make the referent of `pub const FOO: &'static Encoding`
888/// unique cross-crate, so don't take the address of this
889/// `static`.
890pub static EUC_JP: &'static Encoding = &EUC_JP_INIT;
891
892/// The initializer for the [EUC-KR](static.EUC_KR.html) encoding.
893///
894/// For use only for taking the address of this form when
895/// Rust prohibits the use of the non-`_INIT` form directly,
896/// such as in initializers of other `static`s. If in doubt,
897/// use the corresponding non-`_INIT` reference-typed `static`.
898///
899/// This part of the public API will go away if Rust changes
900/// to make the referent of `pub const FOO: &'static Encoding`
901/// unique cross-crate or if Rust starts allowing static arrays
902/// to be initialized with `pub static FOO: &'static Encoding`
903/// items.
904pub static EUC_KR_INIT: Encoding = Encoding {
905 name: "EUC-KR",
906 variant: VariantEncoding::EucKr,
907};
908
909/// The EUC-KR encoding.
910///
911/// This is the Korean encoding for Windows. It extends the Unix legacy encoding
912/// for Korean, based on KS X 1001 (which also formed the base of MacKorean on Mac OS
913/// Classic), with all the characters from the Hangul Syllables block of Unicode.
914///
915/// [Index visualization](https://encoding.spec.whatwg.org/euc-kr.html),
916/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-kr-bmp.html)
917///
918/// This encoding matches the Windows code page 949, except Windows decodes byte 0x80
919/// to U+0080 and some byte sequences that are error per the Encoding Standard to
920/// the question mark or the Private Use Area.
921///
922/// This will change from `static` to `const` if Rust changes
923/// to make the referent of `pub const FOO: &'static Encoding`
924/// unique cross-crate, so don't take the address of this
925/// `static`.
926pub static EUC_KR: &'static Encoding = &EUC_KR_INIT;
927
928/// The initializer for the [GBK](static.GBK.html) encoding.
929///
930/// For use only for taking the address of this form when
931/// Rust prohibits the use of the non-`_INIT` form directly,
932/// such as in initializers of other `static`s. If in doubt,
933/// use the corresponding non-`_INIT` reference-typed `static`.
934///
935/// This part of the public API will go away if Rust changes
936/// to make the referent of `pub const FOO: &'static Encoding`
937/// unique cross-crate or if Rust starts allowing static arrays
938/// to be initialized with `pub static FOO: &'static Encoding`
939/// items.
940pub static GBK_INIT: Encoding = Encoding {
941 name: "GBK",
942 variant: VariantEncoding::Gbk,
943};
944
945/// The GBK encoding.
946///
947/// The decoder for this encoding is the same as the decoder for gb18030.
948/// The encoder side of this encoding is GBK with Windows code page 936 euro
949/// sign behavior and with the changes to two-byte sequences made in GB18030-2022.
950/// GBK extends GB2312-80 to cover the CJK Unified Ideographs Unicode block as
951/// well as a handful of ideographs from the CJK Unified Ideographs Extension A
952/// and CJK Compatibility Ideographs blocks.
953///
954/// Unlike e.g. in the case of ISO-8859-1 and windows-1252, GBK encoder wasn't
955/// unified with the gb18030 encoder in the Encoding Standard out of concern
956/// that servers that expect GBK form submissions might not be able to handle
957/// the four-byte sequences.
958///
959/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
960/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
961///
962/// The encoder of this encoding roughly matches the Windows code page 936.
963/// The decoder side is a superset.
964///
965/// This will change from `static` to `const` if Rust changes
966/// to make the referent of `pub const FOO: &'static Encoding`
967/// unique cross-crate, so don't take the address of this
968/// `static`.
969pub static GBK: &'static Encoding = &GBK_INIT;
970
971/// The initializer for the [IBM866](static.IBM866.html) encoding.
972///
973/// For use only for taking the address of this form when
974/// Rust prohibits the use of the non-`_INIT` form directly,
975/// such as in initializers of other `static`s. If in doubt,
976/// use the corresponding non-`_INIT` reference-typed `static`.
977///
978/// This part of the public API will go away if Rust changes
979/// to make the referent of `pub const FOO: &'static Encoding`
980/// unique cross-crate or if Rust starts allowing static arrays
981/// to be initialized with `pub static FOO: &'static Encoding`
982/// items.
983pub static IBM866_INIT: Encoding = Encoding {
984 name: "IBM866",
985 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.ibm866, 0x0440, 96, 16),
986};
987
988/// The IBM866 encoding.
989///
990/// This the most notable one of the DOS Cyrillic code pages. It has the same
991/// box drawing characters as code page 437, so it can be used for decoding
992/// DOS-era ASCII + box drawing data.
993///
994/// [Index visualization](https://encoding.spec.whatwg.org/ibm866.html),
995/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/ibm866-bmp.html)
996///
997/// This encoding matches the Windows code page 866.
998///
999/// This will change from `static` to `const` if Rust changes
1000/// to make the referent of `pub const FOO: &'static Encoding`
1001/// unique cross-crate, so don't take the address of this
1002/// `static`.
1003pub static IBM866: &'static Encoding = &IBM866_INIT;
1004
1005/// The initializer for the [ISO-2022-JP](static.ISO_2022_JP.html) encoding.
1006///
1007/// For use only for taking the address of this form when
1008/// Rust prohibits the use of the non-`_INIT` form directly,
1009/// such as in initializers of other `static`s. If in doubt,
1010/// use the corresponding non-`_INIT` reference-typed `static`.
1011///
1012/// This part of the public API will go away if Rust changes
1013/// to make the referent of `pub const FOO: &'static Encoding`
1014/// unique cross-crate or if Rust starts allowing static arrays
1015/// to be initialized with `pub static FOO: &'static Encoding`
1016/// items.
1017pub static ISO_2022_JP_INIT: Encoding = Encoding {
1018 name: "ISO-2022-JP",
1019 variant: VariantEncoding::Iso2022Jp,
1020};
1021
1022/// The ISO-2022-JP encoding.
1023///
1024/// This the primary pre-UTF-8 encoding for Japanese email. It uses the ASCII
1025/// byte range to encode non-Basic Latin characters. It's the only encoding
1026/// supported by this crate whose encoder is stateful.
1027///
1028/// [Index visualization](https://encoding.spec.whatwg.org/jis0208.html),
1029/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/jis0208-bmp.html)
1030///
1031/// This encoding roughly matches the Windows code page 50220. Notably, Windows
1032/// uses U+30FB in place of the REPLACEMENT CHARACTER and otherwise differs in
1033/// error handling.
1034///
1035/// This will change from `static` to `const` if Rust changes
1036/// to make the referent of `pub const FOO: &'static Encoding`
1037/// unique cross-crate, so don't take the address of this
1038/// `static`.
1039pub static ISO_2022_JP: &'static Encoding = &ISO_2022_JP_INIT;
1040
1041/// The initializer for the [ISO-8859-10](static.ISO_8859_10.html) encoding.
1042///
1043/// For use only for taking the address of this form when
1044/// Rust prohibits the use of the non-`_INIT` form directly,
1045/// such as in initializers of other `static`s. If in doubt,
1046/// use the corresponding non-`_INIT` reference-typed `static`.
1047///
1048/// This part of the public API will go away if Rust changes
1049/// to make the referent of `pub const FOO: &'static Encoding`
1050/// unique cross-crate or if Rust starts allowing static arrays
1051/// to be initialized with `pub static FOO: &'static Encoding`
1052/// items.
1053pub static ISO_8859_10_INIT: Encoding = Encoding {
1054 name: "ISO-8859-10",
1055 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_10, 0x00DA, 90, 6),
1056};
1057
1058/// The ISO-8859-10 encoding.
1059///
1060/// This is the Nordic part of the ISO/IEC 8859 encoding family. This encoding
1061/// is also known as Latin 6.
1062///
1063/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-10.html),
1064/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-10-bmp.html)
1065///
1066/// The Windows code page number for this encoding is 28600, but kernel32.dll
1067/// does not support this encoding.
1068///
1069/// This will change from `static` to `const` if Rust changes
1070/// to make the referent of `pub const FOO: &'static Encoding`
1071/// unique cross-crate, so don't take the address of this
1072/// `static`.
1073pub static ISO_8859_10: &'static Encoding = &ISO_8859_10_INIT;
1074
1075/// The initializer for the [ISO-8859-13](static.ISO_8859_13.html) encoding.
1076///
1077/// For use only for taking the address of this form when
1078/// Rust prohibits the use of the non-`_INIT` form directly,
1079/// such as in initializers of other `static`s. If in doubt,
1080/// use the corresponding non-`_INIT` reference-typed `static`.
1081///
1082/// This part of the public API will go away if Rust changes
1083/// to make the referent of `pub const FOO: &'static Encoding`
1084/// unique cross-crate or if Rust starts allowing static arrays
1085/// to be initialized with `pub static FOO: &'static Encoding`
1086/// items.
1087pub static ISO_8859_13_INIT: Encoding = Encoding {
1088 name: "ISO-8859-13",
1089 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_13, 0x00DF, 95, 1),
1090};
1091
1092/// The ISO-8859-13 encoding.
1093///
1094/// This is the Baltic part of the ISO/IEC 8859 encoding family. This encoding
1095/// is also known as Latin 7.
1096///
1097/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-13.html),
1098/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-13-bmp.html)
1099///
1100/// This encoding matches the Windows code page 28603, except Windows decodes
1101/// unassigned code points to the Private Use Area of Unicode.
1102///
1103/// This will change from `static` to `const` if Rust changes
1104/// to make the referent of `pub const FOO: &'static Encoding`
1105/// unique cross-crate, so don't take the address of this
1106/// `static`.
1107pub static ISO_8859_13: &'static Encoding = &ISO_8859_13_INIT;
1108
1109/// The initializer for the [ISO-8859-14](static.ISO_8859_14.html) encoding.
1110///
1111/// For use only for taking the address of this form when
1112/// Rust prohibits the use of the non-`_INIT` form directly,
1113/// such as in initializers of other `static`s. If in doubt,
1114/// use the corresponding non-`_INIT` reference-typed `static`.
1115///
1116/// This part of the public API will go away if Rust changes
1117/// to make the referent of `pub const FOO: &'static Encoding`
1118/// unique cross-crate or if Rust starts allowing static arrays
1119/// to be initialized with `pub static FOO: &'static Encoding`
1120/// items.
1121pub static ISO_8859_14_INIT: Encoding = Encoding {
1122 name: "ISO-8859-14",
1123 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_14, 0x00DF, 95, 17),
1124};
1125
1126/// The ISO-8859-14 encoding.
1127///
1128/// This is the Celtic part of the ISO/IEC 8859 encoding family. This encoding
1129/// is also known as Latin 8.
1130///
1131/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-14.html),
1132/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-14-bmp.html)
1133///
1134/// The Windows code page number for this encoding is 28604, but kernel32.dll
1135/// does not support this encoding.
1136///
1137/// This will change from `static` to `const` if Rust changes
1138/// to make the referent of `pub const FOO: &'static Encoding`
1139/// unique cross-crate, so don't take the address of this
1140/// `static`.
1141pub static ISO_8859_14: &'static Encoding = &ISO_8859_14_INIT;
1142
1143/// The initializer for the [ISO-8859-15](static.ISO_8859_15.html) encoding.
1144///
1145/// For use only for taking the address of this form when
1146/// Rust prohibits the use of the non-`_INIT` form directly,
1147/// such as in initializers of other `static`s. If in doubt,
1148/// use the corresponding non-`_INIT` reference-typed `static`.
1149///
1150/// This part of the public API will go away if Rust changes
1151/// to make the referent of `pub const FOO: &'static Encoding`
1152/// unique cross-crate or if Rust starts allowing static arrays
1153/// to be initialized with `pub static FOO: &'static Encoding`
1154/// items.
1155pub static ISO_8859_15_INIT: Encoding = Encoding {
1156 name: "ISO-8859-15",
1157 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_15, 0x00BF, 63, 65),
1158};
1159
1160/// The ISO-8859-15 encoding.
1161///
1162/// This is the revised Western European part of the ISO/IEC 8859 encoding
1163/// family. This encoding is also known as Latin 9.
1164///
1165/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-15.html),
1166/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-15-bmp.html)
1167///
1168/// This encoding matches the Windows code page 28605.
1169///
1170/// This will change from `static` to `const` if Rust changes
1171/// to make the referent of `pub const FOO: &'static Encoding`
1172/// unique cross-crate, so don't take the address of this
1173/// `static`.
1174pub static ISO_8859_15: &'static Encoding = &ISO_8859_15_INIT;
1175
1176/// The initializer for the [ISO-8859-16](static.ISO_8859_16.html) encoding.
1177///
1178/// For use only for taking the address of this form when
1179/// Rust prohibits the use of the non-`_INIT` form directly,
1180/// such as in initializers of other `static`s. If in doubt,
1181/// use the corresponding non-`_INIT` reference-typed `static`.
1182///
1183/// This part of the public API will go away if Rust changes
1184/// to make the referent of `pub const FOO: &'static Encoding`
1185/// unique cross-crate or if Rust starts allowing static arrays
1186/// to be initialized with `pub static FOO: &'static Encoding`
1187/// items.
1188pub static ISO_8859_16_INIT: Encoding = Encoding {
1189 name: "ISO-8859-16",
1190 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_16, 0x00DF, 95, 4),
1191};
1192
1193/// The ISO-8859-16 encoding.
1194///
1195/// This is the South-Eastern European part of the ISO/IEC 8859 encoding
1196/// family. This encoding is also known as Latin 10.
1197///
1198/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-16.html),
1199/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-16-bmp.html)
1200///
1201/// The Windows code page number for this encoding is 28606, but kernel32.dll
1202/// does not support this encoding.
1203///
1204/// This will change from `static` to `const` if Rust changes
1205/// to make the referent of `pub const FOO: &'static Encoding`
1206/// unique cross-crate, so don't take the address of this
1207/// `static`.
1208pub static ISO_8859_16: &'static Encoding = &ISO_8859_16_INIT;
1209
1210/// The initializer for the [ISO-8859-2](static.ISO_8859_2.html) encoding.
1211///
1212/// For use only for taking the address of this form when
1213/// Rust prohibits the use of the non-`_INIT` form directly,
1214/// such as in initializers of other `static`s. If in doubt,
1215/// use the corresponding non-`_INIT` reference-typed `static`.
1216///
1217/// This part of the public API will go away if Rust changes
1218/// to make the referent of `pub const FOO: &'static Encoding`
1219/// unique cross-crate or if Rust starts allowing static arrays
1220/// to be initialized with `pub static FOO: &'static Encoding`
1221/// items.
1222pub static ISO_8859_2_INIT: Encoding = Encoding {
1223 name: "ISO-8859-2",
1224 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_2, 0x00DF, 95, 1),
1225};
1226
1227/// The ISO-8859-2 encoding.
1228///
1229/// This is the Central European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 2.
1230///
1231/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-2.html),
1232/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-2-bmp.html)
1233///
1234/// This encoding matches the Windows code page 28592.
1235///
1236/// This will change from `static` to `const` if Rust changes
1237/// to make the referent of `pub const FOO: &'static Encoding`
1238/// unique cross-crate, so don't take the address of this
1239/// `static`.
1240pub static ISO_8859_2: &'static Encoding = &ISO_8859_2_INIT;
1241
1242/// The initializer for the [ISO-8859-3](static.ISO_8859_3.html) encoding.
1243///
1244/// For use only for taking the address of this form when
1245/// Rust prohibits the use of the non-`_INIT` form directly,
1246/// such as in initializers of other `static`s. If in doubt,
1247/// use the corresponding non-`_INIT` reference-typed `static`.
1248///
1249/// This part of the public API will go away if Rust changes
1250/// to make the referent of `pub const FOO: &'static Encoding`
1251/// unique cross-crate or if Rust starts allowing static arrays
1252/// to be initialized with `pub static FOO: &'static Encoding`
1253/// items.
1254pub static ISO_8859_3_INIT: Encoding = Encoding {
1255 name: "ISO-8859-3",
1256 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_3, 0x00DF, 95, 4),
1257};
1258
1259/// The ISO-8859-3 encoding.
1260///
1261/// This is the South European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 3.
1262///
1263/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-3.html),
1264/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-3-bmp.html)
1265///
1266/// This encoding matches the Windows code page 28593.
1267///
1268/// This will change from `static` to `const` if Rust changes
1269/// to make the referent of `pub const FOO: &'static Encoding`
1270/// unique cross-crate, so don't take the address of this
1271/// `static`.
1272pub static ISO_8859_3: &'static Encoding = &ISO_8859_3_INIT;
1273
1274/// The initializer for the [ISO-8859-4](static.ISO_8859_4.html) encoding.
1275///
1276/// For use only for taking the address of this form when
1277/// Rust prohibits the use of the non-`_INIT` form directly,
1278/// such as in initializers of other `static`s. If in doubt,
1279/// use the corresponding non-`_INIT` reference-typed `static`.
1280///
1281/// This part of the public API will go away if Rust changes
1282/// to make the referent of `pub const FOO: &'static Encoding`
1283/// unique cross-crate or if Rust starts allowing static arrays
1284/// to be initialized with `pub static FOO: &'static Encoding`
1285/// items.
1286pub static ISO_8859_4_INIT: Encoding = Encoding {
1287 name: "ISO-8859-4",
1288 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_4, 0x00DF, 95, 1),
1289};
1290
1291/// The ISO-8859-4 encoding.
1292///
1293/// This is the North European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 4.
1294///
1295/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-4.html),
1296/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-4-bmp.html)
1297///
1298/// This encoding matches the Windows code page 28594.
1299///
1300/// This will change from `static` to `const` if Rust changes
1301/// to make the referent of `pub const FOO: &'static Encoding`
1302/// unique cross-crate, so don't take the address of this
1303/// `static`.
1304pub static ISO_8859_4: &'static Encoding = &ISO_8859_4_INIT;
1305
1306/// The initializer for the [ISO-8859-5](static.ISO_8859_5.html) encoding.
1307///
1308/// For use only for taking the address of this form when
1309/// Rust prohibits the use of the non-`_INIT` form directly,
1310/// such as in initializers of other `static`s. If in doubt,
1311/// use the corresponding non-`_INIT` reference-typed `static`.
1312///
1313/// This part of the public API will go away if Rust changes
1314/// to make the referent of `pub const FOO: &'static Encoding`
1315/// unique cross-crate or if Rust starts allowing static arrays
1316/// to be initialized with `pub static FOO: &'static Encoding`
1317/// items.
1318pub static ISO_8859_5_INIT: Encoding = Encoding {
1319 name: "ISO-8859-5",
1320 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_5, 0x040E, 46, 66),
1321};
1322
1323/// The ISO-8859-5 encoding.
1324///
1325/// This is the Cyrillic part of the ISO/IEC 8859 encoding family.
1326///
1327/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-5.html),
1328/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-5-bmp.html)
1329///
1330/// This encoding matches the Windows code page 28595.
1331///
1332/// This will change from `static` to `const` if Rust changes
1333/// to make the referent of `pub const FOO: &'static Encoding`
1334/// unique cross-crate, so don't take the address of this
1335/// `static`.
1336pub static ISO_8859_5: &'static Encoding = &ISO_8859_5_INIT;
1337
1338/// The initializer for the [ISO-8859-6](static.ISO_8859_6.html) encoding.
1339///
1340/// For use only for taking the address of this form when
1341/// Rust prohibits the use of the non-`_INIT` form directly,
1342/// such as in initializers of other `static`s. If in doubt,
1343/// use the corresponding non-`_INIT` reference-typed `static`.
1344///
1345/// This part of the public API will go away if Rust changes
1346/// to make the referent of `pub const FOO: &'static Encoding`
1347/// unique cross-crate or if Rust starts allowing static arrays
1348/// to be initialized with `pub static FOO: &'static Encoding`
1349/// items.
1350pub static ISO_8859_6_INIT: Encoding = Encoding {
1351 name: "ISO-8859-6",
1352 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_6, 0x0621, 65, 26),
1353};
1354
1355/// The ISO-8859-6 encoding.
1356///
1357/// This is the Arabic part of the ISO/IEC 8859 encoding family.
1358///
1359/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-6.html),
1360/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-6-bmp.html)
1361///
1362/// This encoding matches the Windows code page 28596, except Windows decodes
1363/// unassigned code points to the Private Use Area of Unicode.
1364///
1365/// This will change from `static` to `const` if Rust changes
1366/// to make the referent of `pub const FOO: &'static Encoding`
1367/// unique cross-crate, so don't take the address of this
1368/// `static`.
1369pub static ISO_8859_6: &'static Encoding = &ISO_8859_6_INIT;
1370
1371/// The initializer for the [ISO-8859-7](static.ISO_8859_7.html) encoding.
1372///
1373/// For use only for taking the address of this form when
1374/// Rust prohibits the use of the non-`_INIT` form directly,
1375/// such as in initializers of other `static`s. If in doubt,
1376/// use the corresponding non-`_INIT` reference-typed `static`.
1377///
1378/// This part of the public API will go away if Rust changes
1379/// to make the referent of `pub const FOO: &'static Encoding`
1380/// unique cross-crate or if Rust starts allowing static arrays
1381/// to be initialized with `pub static FOO: &'static Encoding`
1382/// items.
1383pub static ISO_8859_7_INIT: Encoding = Encoding {
1384 name: "ISO-8859-7",
1385 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_7, 0x03A3, 83, 44),
1386};
1387
1388/// The ISO-8859-7 encoding.
1389///
1390/// This is the Greek part of the ISO/IEC 8859 encoding family.
1391///
1392/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-7.html),
1393/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-7-bmp.html)
1394///
1395/// This encoding roughly matches the Windows code page 28597. Windows decodes
1396/// unassigned code points, the currency signs at 0xA4 and 0xA5 as well as
1397/// 0xAA, which should be U+037A GREEK YPOGEGRAMMENI, to the Private Use Area
1398/// of Unicode. Windows decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA
1399/// instead of U+2018 LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER
1400/// LETTER APOSTROPHE instead of U+2019 RIGHT SINGLE QUOTATION MARK.
1401///
1402/// This will change from `static` to `const` if Rust changes
1403/// to make the referent of `pub const FOO: &'static Encoding`
1404/// unique cross-crate, so don't take the address of this
1405/// `static`.
1406pub static ISO_8859_7: &'static Encoding = &ISO_8859_7_INIT;
1407
1408/// The initializer for the [ISO-8859-8](static.ISO_8859_8.html) encoding.
1409///
1410/// For use only for taking the address of this form when
1411/// Rust prohibits the use of the non-`_INIT` form directly,
1412/// such as in initializers of other `static`s. If in doubt,
1413/// use the corresponding non-`_INIT` reference-typed `static`.
1414///
1415/// This part of the public API will go away if Rust changes
1416/// to make the referent of `pub const FOO: &'static Encoding`
1417/// unique cross-crate or if Rust starts allowing static arrays
1418/// to be initialized with `pub static FOO: &'static Encoding`
1419/// items.
1420pub static ISO_8859_8_INIT: Encoding = Encoding {
1421 name: "ISO-8859-8",
1422 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
1423};
1424
1425/// The ISO-8859-8 encoding.
1426///
1427/// This is the Hebrew part of the ISO/IEC 8859 encoding family in visual order.
1428///
1429/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
1430/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
1431///
1432/// This encoding roughly matches the Windows code page 28598. Windows decodes
1433/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
1434/// Area instead of LRM and RLM. Windows decodes unassigned code points to
1435/// the private use area.
1436///
1437/// This will change from `static` to `const` if Rust changes
1438/// to make the referent of `pub const FOO: &'static Encoding`
1439/// unique cross-crate, so don't take the address of this
1440/// `static`.
1441pub static ISO_8859_8: &'static Encoding = &ISO_8859_8_INIT;
1442
1443/// The initializer for the [ISO-8859-8-I](static.ISO_8859_8_I.html) encoding.
1444///
1445/// For use only for taking the address of this form when
1446/// Rust prohibits the use of the non-`_INIT` form directly,
1447/// such as in initializers of other `static`s. If in doubt,
1448/// use the corresponding non-`_INIT` reference-typed `static`.
1449///
1450/// This part of the public API will go away if Rust changes
1451/// to make the referent of `pub const FOO: &'static Encoding`
1452/// unique cross-crate or if Rust starts allowing static arrays
1453/// to be initialized with `pub static FOO: &'static Encoding`
1454/// items.
1455pub static ISO_8859_8_I_INIT: Encoding = Encoding {
1456 name: "ISO-8859-8-I",
1457 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
1458};
1459
1460/// The ISO-8859-8-I encoding.
1461///
1462/// This is the Hebrew part of the ISO/IEC 8859 encoding family in logical order.
1463///
1464/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
1465/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
1466///
1467/// This encoding roughly matches the Windows code page 38598. Windows decodes
1468/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
1469/// Area instead of LRM and RLM. Windows decodes unassigned code points to
1470/// the private use area.
1471///
1472/// This will change from `static` to `const` if Rust changes
1473/// to make the referent of `pub const FOO: &'static Encoding`
1474/// unique cross-crate, so don't take the address of this
1475/// `static`.
1476pub static ISO_8859_8_I: &'static Encoding = &ISO_8859_8_I_INIT;
1477
1478/// The initializer for the [KOI8-R](static.KOI8_R.html) encoding.
1479///
1480/// For use only for taking the address of this form when
1481/// Rust prohibits the use of the non-`_INIT` form directly,
1482/// such as in initializers of other `static`s. If in doubt,
1483/// use the corresponding non-`_INIT` reference-typed `static`.
1484///
1485/// This part of the public API will go away if Rust changes
1486/// to make the referent of `pub const FOO: &'static Encoding`
1487/// unique cross-crate or if Rust starts allowing static arrays
1488/// to be initialized with `pub static FOO: &'static Encoding`
1489/// items.
1490pub static KOI8_R_INIT: Encoding = Encoding {
1491 name: "KOI8-R",
1492 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_r, 0x044E, 64, 1),
1493};
1494
1495/// The KOI8-R encoding.
1496///
1497/// This is an encoding for Russian from [RFC 1489](https://tools.ietf.org/html/rfc1489).
1498///
1499/// [Index visualization](https://encoding.spec.whatwg.org/koi8-r.html),
1500/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-r-bmp.html)
1501///
1502/// This encoding matches the Windows code page 20866.
1503///
1504/// This will change from `static` to `const` if Rust changes
1505/// to make the referent of `pub const FOO: &'static Encoding`
1506/// unique cross-crate, so don't take the address of this
1507/// `static`.
1508pub static KOI8_R: &'static Encoding = &KOI8_R_INIT;
1509
1510/// The initializer for the [KOI8-U](static.KOI8_U.html) encoding.
1511///
1512/// For use only for taking the address of this form when
1513/// Rust prohibits the use of the non-`_INIT` form directly,
1514/// such as in initializers of other `static`s. If in doubt,
1515/// use the corresponding non-`_INIT` reference-typed `static`.
1516///
1517/// This part of the public API will go away if Rust changes
1518/// to make the referent of `pub const FOO: &'static Encoding`
1519/// unique cross-crate or if Rust starts allowing static arrays
1520/// to be initialized with `pub static FOO: &'static Encoding`
1521/// items.
1522pub static KOI8_U_INIT: Encoding = Encoding {
1523 name: "KOI8-U",
1524 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_u, 0x044E, 64, 1),
1525};
1526
1527/// The KOI8-U encoding.
1528///
1529/// This is an encoding for Ukrainian adapted from KOI8-R.
1530///
1531/// [Index visualization](https://encoding.spec.whatwg.org/koi8-u.html),
1532/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-u-bmp.html)
1533///
1534/// This encoding matches the Windows code page 21866.
1535///
1536/// This will change from `static` to `const` if Rust changes
1537/// to make the referent of `pub const FOO: &'static Encoding`
1538/// unique cross-crate, so don't take the address of this
1539/// `static`.
1540pub static KOI8_U: &'static Encoding = &KOI8_U_INIT;
1541
1542/// The initializer for the [Shift_JIS](static.SHIFT_JIS.html) encoding.
1543///
1544/// For use only for taking the address of this form when
1545/// Rust prohibits the use of the non-`_INIT` form directly,
1546/// such as in initializers of other `static`s. If in doubt,
1547/// use the corresponding non-`_INIT` reference-typed `static`.
1548///
1549/// This part of the public API will go away if Rust changes
1550/// to make the referent of `pub const FOO: &'static Encoding`
1551/// unique cross-crate or if Rust starts allowing static arrays
1552/// to be initialized with `pub static FOO: &'static Encoding`
1553/// items.
1554pub static SHIFT_JIS_INIT: Encoding = Encoding {
1555 name: "Shift_JIS",
1556 variant: VariantEncoding::ShiftJis,
1557};
1558
1559/// The Shift_JIS encoding.
1560///
1561/// This is the Japanese encoding for Windows.
1562///
1563/// [Index visualization](https://encoding.spec.whatwg.org/shift_jis.html),
1564/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/shift_jis-bmp.html)
1565///
1566/// This encoding matches the Windows code page 932, except Windows decodes some byte
1567/// sequences that are error per the Encoding Standard to the question mark or the
1568/// Private Use Area and generally uses U+30FB in place of the REPLACEMENT CHARACTER.
1569///
1570/// This will change from `static` to `const` if Rust changes
1571/// to make the referent of `pub const FOO: &'static Encoding`
1572/// unique cross-crate, so don't take the address of this
1573/// `static`.
1574pub static SHIFT_JIS: &'static Encoding = &SHIFT_JIS_INIT;
1575
1576/// The initializer for the [UTF-16BE](static.UTF_16BE.html) encoding.
1577///
1578/// For use only for taking the address of this form when
1579/// Rust prohibits the use of the non-`_INIT` form directly,
1580/// such as in initializers of other `static`s. If in doubt,
1581/// use the corresponding non-`_INIT` reference-typed `static`.
1582///
1583/// This part of the public API will go away if Rust changes
1584/// to make the referent of `pub const FOO: &'static Encoding`
1585/// unique cross-crate or if Rust starts allowing static arrays
1586/// to be initialized with `pub static FOO: &'static Encoding`
1587/// items.
1588pub static UTF_16BE_INIT: Encoding = Encoding {
1589 name: "UTF-16BE",
1590 variant: VariantEncoding::Utf16Be,
1591};
1592
1593/// The UTF-16BE encoding.
1594///
1595/// This decode-only encoding uses 16-bit code units due to Unicode originally
1596/// having been designed as a 16-bit reportoire. In the absence of a byte order
1597/// mark the big endian byte order is assumed.
1598///
1599/// There is no corresponding encoder in this crate or in the Encoding
1600/// Standard. The output encoding of this encoding is UTF-8.
1601///
1602/// This encoding matches the Windows code page 1201.
1603///
1604/// This will change from `static` to `const` if Rust changes
1605/// to make the referent of `pub const FOO: &'static Encoding`
1606/// unique cross-crate, so don't take the address of this
1607/// `static`.
1608pub static UTF_16BE: &'static Encoding = &UTF_16BE_INIT;
1609
1610/// The initializer for the [UTF-16LE](static.UTF_16LE.html) encoding.
1611///
1612/// For use only for taking the address of this form when
1613/// Rust prohibits the use of the non-`_INIT` form directly,
1614/// such as in initializers of other `static`s. If in doubt,
1615/// use the corresponding non-`_INIT` reference-typed `static`.
1616///
1617/// This part of the public API will go away if Rust changes
1618/// to make the referent of `pub const FOO: &'static Encoding`
1619/// unique cross-crate or if Rust starts allowing static arrays
1620/// to be initialized with `pub static FOO: &'static Encoding`
1621/// items.
1622pub static UTF_16LE_INIT: Encoding = Encoding {
1623 name: "UTF-16LE",
1624 variant: VariantEncoding::Utf16Le,
1625};
1626
1627/// The UTF-16LE encoding.
1628///
1629/// This decode-only encoding uses 16-bit code units due to Unicode originally
1630/// having been designed as a 16-bit reportoire. In the absence of a byte order
1631/// mark the little endian byte order is assumed.
1632///
1633/// There is no corresponding encoder in this crate or in the Encoding
1634/// Standard. The output encoding of this encoding is UTF-8.
1635///
1636/// This encoding matches the Windows code page 1200.
1637///
1638/// This will change from `static` to `const` if Rust changes
1639/// to make the referent of `pub const FOO: &'static Encoding`
1640/// unique cross-crate, so don't take the address of this
1641/// `static`.
1642pub static UTF_16LE: &'static Encoding = &UTF_16LE_INIT;
1643
1644/// The initializer for the [UTF-8](static.UTF_8.html) encoding.
1645///
1646/// For use only for taking the address of this form when
1647/// Rust prohibits the use of the non-`_INIT` form directly,
1648/// such as in initializers of other `static`s. If in doubt,
1649/// use the corresponding non-`_INIT` reference-typed `static`.
1650///
1651/// This part of the public API will go away if Rust changes
1652/// to make the referent of `pub const FOO: &'static Encoding`
1653/// unique cross-crate or if Rust starts allowing static arrays
1654/// to be initialized with `pub static FOO: &'static Encoding`
1655/// items.
1656pub static UTF_8_INIT: Encoding = Encoding {
1657 name: "UTF-8",
1658 variant: VariantEncoding::Utf8,
1659};
1660
1661/// The UTF-8 encoding.
1662///
1663/// This is the encoding that should be used for all new development it can
1664/// represent all of Unicode.
1665///
1666/// This encoding matches the Windows code page 65001, except Windows differs
1667/// in the number of errors generated for some erroneous byte sequences.
1668///
1669/// This will change from `static` to `const` if Rust changes
1670/// to make the referent of `pub const FOO: &'static Encoding`
1671/// unique cross-crate, so don't take the address of this
1672/// `static`.
1673pub static UTF_8: &'static Encoding = &UTF_8_INIT;
1674
1675/// The initializer for the [gb18030](static.GB18030.html) encoding.
1676///
1677/// For use only for taking the address of this form when
1678/// Rust prohibits the use of the non-`_INIT` form directly,
1679/// such as in initializers of other `static`s. If in doubt,
1680/// use the corresponding non-`_INIT` reference-typed `static`.
1681///
1682/// This part of the public API will go away if Rust changes
1683/// to make the referent of `pub const FOO: &'static Encoding`
1684/// unique cross-crate or if Rust starts allowing static arrays
1685/// to be initialized with `pub static FOO: &'static Encoding`
1686/// items.
1687pub static GB18030_INIT: Encoding = Encoding {
1688 name: "gb18030",
1689 variant: VariantEncoding::Gb18030,
1690};
1691
1692/// The gb18030 encoding.
1693///
1694/// This encoding matches GB18030-2022 except the two-byte sequence 0xA3 0xA0
1695/// maps to U+3000 for compatibility with existing Web content and the four-byte
1696/// sequences for the non-PUA characters that got two-byte sequences still decode
1697/// to the same non-PUA characters as in GB18030-2005. As a result, this encoding
1698/// can represent all of Unicode except for 19 private-use characters.
1699///
1700/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
1701/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
1702///
1703/// This encoding matches the Windows code page 54936.
1704///
1705/// This will change from `static` to `const` if Rust changes
1706/// to make the referent of `pub const FOO: &'static Encoding`
1707/// unique cross-crate, so don't take the address of this
1708/// `static`.
1709pub static GB18030: &'static Encoding = &GB18030_INIT;
1710
1711/// The initializer for the [macintosh](static.MACINTOSH.html) encoding.
1712///
1713/// For use only for taking the address of this form when
1714/// Rust prohibits the use of the non-`_INIT` form directly,
1715/// such as in initializers of other `static`s. If in doubt,
1716/// use the corresponding non-`_INIT` reference-typed `static`.
1717///
1718/// This part of the public API will go away if Rust changes
1719/// to make the referent of `pub const FOO: &'static Encoding`
1720/// unique cross-crate or if Rust starts allowing static arrays
1721/// to be initialized with `pub static FOO: &'static Encoding`
1722/// items.
1723pub static MACINTOSH_INIT: Encoding = Encoding {
1724 name: "macintosh",
1725 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.macintosh, 0x00CD, 106, 3),
1726};
1727
1728/// The macintosh encoding.
1729///
1730/// This is the MacRoman encoding from Mac OS Classic.
1731///
1732/// [Index visualization](https://encoding.spec.whatwg.org/macintosh.html),
1733/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/macintosh-bmp.html)
1734///
1735/// This encoding matches the Windows code page 10000, except Windows decodes
1736/// 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
1737///
1738/// This will change from `static` to `const` if Rust changes
1739/// to make the referent of `pub const FOO: &'static Encoding`
1740/// unique cross-crate, so don't take the address of this
1741/// `static`.
1742pub static MACINTOSH: &'static Encoding = &MACINTOSH_INIT;
1743
1744/// The initializer for the [replacement](static.REPLACEMENT.html) encoding.
1745///
1746/// For use only for taking the address of this form when
1747/// Rust prohibits the use of the non-`_INIT` form directly,
1748/// such as in initializers of other `static`s. If in doubt,
1749/// use the corresponding non-`_INIT` reference-typed `static`.
1750///
1751/// This part of the public API will go away if Rust changes
1752/// to make the referent of `pub const FOO: &'static Encoding`
1753/// unique cross-crate or if Rust starts allowing static arrays
1754/// to be initialized with `pub static FOO: &'static Encoding`
1755/// items.
1756pub static REPLACEMENT_INIT: Encoding = Encoding {
1757 name: "replacement",
1758 variant: VariantEncoding::Replacement,
1759};
1760
1761/// The replacement encoding.
1762///
1763/// This decode-only encoding decodes all non-zero-length streams to a single
1764/// REPLACEMENT CHARACTER. Its purpose is to avoid the use of an
1765/// ASCII-compatible fallback encoding (typically windows-1252) for some
1766/// encodings that are no longer supported by the Web Platform and that
1767/// would be dangerous to treat as ASCII-compatible.
1768///
1769/// There is no corresponding encoder. The output encoding of this encoding
1770/// is UTF-8.
1771///
1772/// This encoding does not have a Windows code page number.
1773///
1774/// This will change from `static` to `const` if Rust changes
1775/// to make the referent of `pub const FOO: &'static Encoding`
1776/// unique cross-crate, so don't take the address of this
1777/// `static`.
1778pub static REPLACEMENT: &'static Encoding = &REPLACEMENT_INIT;
1779
1780/// The initializer for the [windows-1250](static.WINDOWS_1250.html) encoding.
1781///
1782/// For use only for taking the address of this form when
1783/// Rust prohibits the use of the non-`_INIT` form directly,
1784/// such as in initializers of other `static`s. If in doubt,
1785/// use the corresponding non-`_INIT` reference-typed `static`.
1786///
1787/// This part of the public API will go away if Rust changes
1788/// to make the referent of `pub const FOO: &'static Encoding`
1789/// unique cross-crate or if Rust starts allowing static arrays
1790/// to be initialized with `pub static FOO: &'static Encoding`
1791/// items.
1792pub static WINDOWS_1250_INIT: Encoding = Encoding {
1793 name: "windows-1250",
1794 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1250, 0x00DC, 92, 2),
1795};
1796
1797/// The windows-1250 encoding.
1798///
1799/// This is the Central European encoding for Windows.
1800///
1801/// [Index visualization](https://encoding.spec.whatwg.org/windows-1250.html),
1802/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1250-bmp.html)
1803///
1804/// This encoding matches the Windows code page 1250.
1805///
1806/// This will change from `static` to `const` if Rust changes
1807/// to make the referent of `pub const FOO: &'static Encoding`
1808/// unique cross-crate, so don't take the address of this
1809/// `static`.
1810pub static WINDOWS_1250: &'static Encoding = &WINDOWS_1250_INIT;
1811
1812/// The initializer for the [windows-1251](static.WINDOWS_1251.html) encoding.
1813///
1814/// For use only for taking the address of this form when
1815/// Rust prohibits the use of the non-`_INIT` form directly,
1816/// such as in initializers of other `static`s. If in doubt,
1817/// use the corresponding non-`_INIT` reference-typed `static`.
1818///
1819/// This part of the public API will go away if Rust changes
1820/// to make the referent of `pub const FOO: &'static Encoding`
1821/// unique cross-crate or if Rust starts allowing static arrays
1822/// to be initialized with `pub static FOO: &'static Encoding`
1823/// items.
1824pub static WINDOWS_1251_INIT: Encoding = Encoding {
1825 name: "windows-1251",
1826 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1251, 0x0410, 64, 64),
1827};
1828
1829/// The windows-1251 encoding.
1830///
1831/// This is the Cyrillic encoding for Windows.
1832///
1833/// [Index visualization](https://encoding.spec.whatwg.org/windows-1251.html),
1834/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1251-bmp.html)
1835///
1836/// This encoding matches the Windows code page 1251.
1837///
1838/// This will change from `static` to `const` if Rust changes
1839/// to make the referent of `pub const FOO: &'static Encoding`
1840/// unique cross-crate, so don't take the address of this
1841/// `static`.
1842pub static WINDOWS_1251: &'static Encoding = &WINDOWS_1251_INIT;
1843
1844/// The initializer for the [windows-1252](static.WINDOWS_1252.html) encoding.
1845///
1846/// For use only for taking the address of this form when
1847/// Rust prohibits the use of the non-`_INIT` form directly,
1848/// such as in initializers of other `static`s. If in doubt,
1849/// use the corresponding non-`_INIT` reference-typed `static`.
1850///
1851/// This part of the public API will go away if Rust changes
1852/// to make the referent of `pub const FOO: &'static Encoding`
1853/// unique cross-crate or if Rust starts allowing static arrays
1854/// to be initialized with `pub static FOO: &'static Encoding`
1855/// items.
1856pub static WINDOWS_1252_INIT: Encoding = Encoding {
1857 name: "windows-1252",
1858 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1252, 0x00A0, 32, 96),
1859};
1860
1861/// The windows-1252 encoding.
1862///
1863/// This is the Western encoding for Windows. It is an extension of ISO-8859-1,
1864/// which is known as Latin 1.
1865///
1866/// [Index visualization](https://encoding.spec.whatwg.org/windows-1252.html),
1867/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1252-bmp.html)
1868///
1869/// This encoding matches the Windows code page 1252.
1870///
1871/// This will change from `static` to `const` if Rust changes
1872/// to make the referent of `pub const FOO: &'static Encoding`
1873/// unique cross-crate, so don't take the address of this
1874/// `static`.
1875pub static WINDOWS_1252: &'static Encoding = &WINDOWS_1252_INIT;
1876
1877/// The initializer for the [windows-1253](static.WINDOWS_1253.html) encoding.
1878///
1879/// For use only for taking the address of this form when
1880/// Rust prohibits the use of the non-`_INIT` form directly,
1881/// such as in initializers of other `static`s. If in doubt,
1882/// use the corresponding non-`_INIT` reference-typed `static`.
1883///
1884/// This part of the public API will go away if Rust changes
1885/// to make the referent of `pub const FOO: &'static Encoding`
1886/// unique cross-crate or if Rust starts allowing static arrays
1887/// to be initialized with `pub static FOO: &'static Encoding`
1888/// items.
1889pub static WINDOWS_1253_INIT: Encoding = Encoding {
1890 name: "windows-1253",
1891 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1253, 0x03A3, 83, 44),
1892};
1893
1894/// The windows-1253 encoding.
1895///
1896/// This is the Greek encoding for Windows. It is mostly an extension of
1897/// ISO-8859-7, but U+0386 is mapped to a different byte.
1898///
1899/// [Index visualization](https://encoding.spec.whatwg.org/windows-1253.html),
1900/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1253-bmp.html)
1901///
1902/// This encoding matches the Windows code page 1253, except Windows decodes
1903/// unassigned code points to the Private Use Area of Unicode.
1904///
1905/// This will change from `static` to `const` if Rust changes
1906/// to make the referent of `pub const FOO: &'static Encoding`
1907/// unique cross-crate, so don't take the address of this
1908/// `static`.
1909pub static WINDOWS_1253: &'static Encoding = &WINDOWS_1253_INIT;
1910
1911/// The initializer for the [windows-1254](static.WINDOWS_1254.html) encoding.
1912///
1913/// For use only for taking the address of this form when
1914/// Rust prohibits the use of the non-`_INIT` form directly,
1915/// such as in initializers of other `static`s. If in doubt,
1916/// use the corresponding non-`_INIT` reference-typed `static`.
1917///
1918/// This part of the public API will go away if Rust changes
1919/// to make the referent of `pub const FOO: &'static Encoding`
1920/// unique cross-crate or if Rust starts allowing static arrays
1921/// to be initialized with `pub static FOO: &'static Encoding`
1922/// items.
1923pub static WINDOWS_1254_INIT: Encoding = Encoding {
1924 name: "windows-1254",
1925 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1254, 0x00DF, 95, 17),
1926};
1927
1928/// The windows-1254 encoding.
1929///
1930/// This is the Turkish encoding for Windows. It is an extension of ISO-8859-9,
1931/// which is known as Latin 5.
1932///
1933/// [Index visualization](https://encoding.spec.whatwg.org/windows-1254.html),
1934/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1254-bmp.html)
1935///
1936/// This encoding matches the Windows code page 1254.
1937///
1938/// This will change from `static` to `const` if Rust changes
1939/// to make the referent of `pub const FOO: &'static Encoding`
1940/// unique cross-crate, so don't take the address of this
1941/// `static`.
1942pub static WINDOWS_1254: &'static Encoding = &WINDOWS_1254_INIT;
1943
1944/// The initializer for the [windows-1255](static.WINDOWS_1255.html) encoding.
1945///
1946/// For use only for taking the address of this form when
1947/// Rust prohibits the use of the non-`_INIT` form directly,
1948/// such as in initializers of other `static`s. If in doubt,
1949/// use the corresponding non-`_INIT` reference-typed `static`.
1950///
1951/// This part of the public API will go away if Rust changes
1952/// to make the referent of `pub const FOO: &'static Encoding`
1953/// unique cross-crate or if Rust starts allowing static arrays
1954/// to be initialized with `pub static FOO: &'static Encoding`
1955/// items.
1956pub static WINDOWS_1255_INIT: Encoding = Encoding {
1957 name: "windows-1255",
1958 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1255, 0x05D0, 96, 27),
1959};
1960
1961/// The windows-1255 encoding.
1962///
1963/// This is the Hebrew encoding for Windows. It is an extension of ISO-8859-8-I,
1964/// except for a currency sign swap.
1965///
1966/// [Index visualization](https://encoding.spec.whatwg.org/windows-1255.html),
1967/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1255-bmp.html)
1968///
1969/// This encoding matches the Windows code page 1255, except Windows decodes
1970/// unassigned code points to the Private Use Area of Unicode.
1971///
1972/// This will change from `static` to `const` if Rust changes
1973/// to make the referent of `pub const FOO: &'static Encoding`
1974/// unique cross-crate, so don't take the address of this
1975/// `static`.
1976pub static WINDOWS_1255: &'static Encoding = &WINDOWS_1255_INIT;
1977
1978/// The initializer for the [windows-1256](static.WINDOWS_1256.html) encoding.
1979///
1980/// For use only for taking the address of this form when
1981/// Rust prohibits the use of the non-`_INIT` form directly,
1982/// such as in initializers of other `static`s. If in doubt,
1983/// use the corresponding non-`_INIT` reference-typed `static`.
1984///
1985/// This part of the public API will go away if Rust changes
1986/// to make the referent of `pub const FOO: &'static Encoding`
1987/// unique cross-crate or if Rust starts allowing static arrays
1988/// to be initialized with `pub static FOO: &'static Encoding`
1989/// items.
1990pub static WINDOWS_1256_INIT: Encoding = Encoding {
1991 name: "windows-1256",
1992 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1256, 0x0621, 65, 22),
1993};
1994
1995/// The windows-1256 encoding.
1996///
1997/// This is the Arabic encoding for Windows.
1998///
1999/// [Index visualization](https://encoding.spec.whatwg.org/windows-1256.html),
2000/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1256-bmp.html)
2001///
2002/// This encoding matches the Windows code page 1256.
2003///
2004/// This will change from `static` to `const` if Rust changes
2005/// to make the referent of `pub const FOO: &'static Encoding`
2006/// unique cross-crate, so don't take the address of this
2007/// `static`.
2008pub static WINDOWS_1256: &'static Encoding = &WINDOWS_1256_INIT;
2009
2010/// The initializer for the [windows-1257](static.WINDOWS_1257.html) encoding.
2011///
2012/// For use only for taking the address of this form when
2013/// Rust prohibits the use of the non-`_INIT` form directly,
2014/// such as in initializers of other `static`s. If in doubt,
2015/// use the corresponding non-`_INIT` reference-typed `static`.
2016///
2017/// This part of the public API will go away if Rust changes
2018/// to make the referent of `pub const FOO: &'static Encoding`
2019/// unique cross-crate or if Rust starts allowing static arrays
2020/// to be initialized with `pub static FOO: &'static Encoding`
2021/// items.
2022pub static WINDOWS_1257_INIT: Encoding = Encoding {
2023 name: "windows-1257",
2024 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1257, 0x00DF, 95, 1),
2025};
2026
2027/// The windows-1257 encoding.
2028///
2029/// This is the Baltic encoding for Windows.
2030///
2031/// [Index visualization](https://encoding.spec.whatwg.org/windows-1257.html),
2032/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1257-bmp.html)
2033///
2034/// This encoding matches the Windows code page 1257, except Windows decodes
2035/// unassigned code points to the Private Use Area of Unicode.
2036///
2037/// This will change from `static` to `const` if Rust changes
2038/// to make the referent of `pub const FOO: &'static Encoding`
2039/// unique cross-crate, so don't take the address of this
2040/// `static`.
2041pub static WINDOWS_1257: &'static Encoding = &WINDOWS_1257_INIT;
2042
2043/// The initializer for the [windows-1258](static.WINDOWS_1258.html) encoding.
2044///
2045/// For use only for taking the address of this form when
2046/// Rust prohibits the use of the non-`_INIT` form directly,
2047/// such as in initializers of other `static`s. If in doubt,
2048/// use the corresponding non-`_INIT` reference-typed `static`.
2049///
2050/// This part of the public API will go away if Rust changes
2051/// to make the referent of `pub const FOO: &'static Encoding`
2052/// unique cross-crate or if Rust starts allowing static arrays
2053/// to be initialized with `pub static FOO: &'static Encoding`
2054/// items.
2055pub static WINDOWS_1258_INIT: Encoding = Encoding {
2056 name: "windows-1258",
2057 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1258, 0x00DF, 95, 4),
2058};
2059
2060/// The windows-1258 encoding.
2061///
2062/// This is the Vietnamese encoding for Windows.
2063///
2064/// [Index visualization](https://encoding.spec.whatwg.org/windows-1258.html),
2065/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1258-bmp.html)
2066///
2067/// This encoding matches the Windows code page 1258 when used in the
2068/// non-normalizing mode. Unlike with the other single-byte encodings, the
2069/// result of decoding is not necessarily in Normalization Form C. On the
2070/// other hand, input in the Normalization Form C is not encoded without
2071/// replacement. In general, it's a bad idea to encode to encodings other
2072/// than UTF-8, but this encoding is especially hazardous to encode to.
2073///
2074/// This will change from `static` to `const` if Rust changes
2075/// to make the referent of `pub const FOO: &'static Encoding`
2076/// unique cross-crate, so don't take the address of this
2077/// `static`.
2078pub static WINDOWS_1258: &'static Encoding = &WINDOWS_1258_INIT;
2079
2080/// The initializer for the [windows-874](static.WINDOWS_874.html) encoding.
2081///
2082/// For use only for taking the address of this form when
2083/// Rust prohibits the use of the non-`_INIT` form directly,
2084/// such as in initializers of other `static`s. If in doubt,
2085/// use the corresponding non-`_INIT` reference-typed `static`.
2086///
2087/// This part of the public API will go away if Rust changes
2088/// to make the referent of `pub const FOO: &'static Encoding`
2089/// unique cross-crate or if Rust starts allowing static arrays
2090/// to be initialized with `pub static FOO: &'static Encoding`
2091/// items.
2092pub static WINDOWS_874_INIT: Encoding = Encoding {
2093 name: "windows-874",
2094 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_874, 0x0E01, 33, 58),
2095};
2096
2097/// The windows-874 encoding.
2098///
2099/// This is the Thai encoding for Windows. It is an extension of TIS-620 / ISO-8859-11.
2100///
2101/// [Index visualization](https://encoding.spec.whatwg.org/windows-874.html),
2102/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-874-bmp.html)
2103///
2104/// This encoding matches the Windows code page 874, except Windows decodes
2105/// unassigned code points to the Private Use Area of Unicode.
2106///
2107/// This will change from `static` to `const` if Rust changes
2108/// to make the referent of `pub const FOO: &'static Encoding`
2109/// unique cross-crate, so don't take the address of this
2110/// `static`.
2111pub static WINDOWS_874: &'static Encoding = &WINDOWS_874_INIT;
2112
2113/// The initializer for the [x-mac-cyrillic](static.X_MAC_CYRILLIC.html) encoding.
2114///
2115/// For use only for taking the address of this form when
2116/// Rust prohibits the use of the non-`_INIT` form directly,
2117/// such as in initializers of other `static`s. If in doubt,
2118/// use the corresponding non-`_INIT` reference-typed `static`.
2119///
2120/// This part of the public API will go away if Rust changes
2121/// to make the referent of `pub const FOO: &'static Encoding`
2122/// unique cross-crate or if Rust starts allowing static arrays
2123/// to be initialized with `pub static FOO: &'static Encoding`
2124/// items.
2125pub static X_MAC_CYRILLIC_INIT: Encoding = Encoding {
2126 name: "x-mac-cyrillic",
2127 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.x_mac_cyrillic, 0x0430, 96, 31),
2128};
2129
2130/// The x-mac-cyrillic encoding.
2131///
2132/// This is the MacUkrainian encoding from Mac OS Classic.
2133///
2134/// [Index visualization](https://encoding.spec.whatwg.org/x-mac-cyrillic.html),
2135/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/x-mac-cyrillic-bmp.html)
2136///
2137/// This encoding matches the Windows code page 10017.
2138///
2139/// This will change from `static` to `const` if Rust changes
2140/// to make the referent of `pub const FOO: &'static Encoding`
2141/// unique cross-crate, so don't take the address of this
2142/// `static`.
2143pub static X_MAC_CYRILLIC: &'static Encoding = &X_MAC_CYRILLIC_INIT;
2144
2145/// The initializer for the [x-user-defined](static.X_USER_DEFINED.html) encoding.
2146///
2147/// For use only for taking the address of this form when
2148/// Rust prohibits the use of the non-`_INIT` form directly,
2149/// such as in initializers of other `static`s. If in doubt,
2150/// use the corresponding non-`_INIT` reference-typed `static`.
2151///
2152/// This part of the public API will go away if Rust changes
2153/// to make the referent of `pub const FOO: &'static Encoding`
2154/// unique cross-crate or if Rust starts allowing static arrays
2155/// to be initialized with `pub static FOO: &'static Encoding`
2156/// items.
2157pub static X_USER_DEFINED_INIT: Encoding = Encoding {
2158 name: "x-user-defined",
2159 variant: VariantEncoding::UserDefined,
2160};
2161
2162/// The x-user-defined encoding.
2163///
2164/// This encoding offsets the non-ASCII bytes by `0xF700` thereby decoding
2165/// them to the Private Use Area of Unicode. It was used for loading binary
2166/// data into a JavaScript string using `XMLHttpRequest` before XHR supported
2167/// the `"arraybuffer"` response type.
2168///
2169/// This encoding does not have a Windows code page number.
2170///
2171/// This will change from `static` to `const` if Rust changes
2172/// to make the referent of `pub const FOO: &'static Encoding`
2173/// unique cross-crate, so don't take the address of this
2174/// `static`.
2175pub static X_USER_DEFINED: &'static Encoding = &X_USER_DEFINED_INIT;
2176
// Every label that `Encoding::for_label()` can match, in lowercase.
//
// The order is load-bearing: `for_label()` binary-searches this array with a
// comparator that orders labels first by length and then by their bytes
// compared from the last byte to the first, so the entries must stay sorted
// that way. The entry at index `i` is the label of the encoding stored at
// index `i` of `ENCODINGS_IN_LABEL_SORT`.
static LABELS_SORTED: [&'static str; 228] = [
    "l1",
    "l2",
    "l3",
    "l4",
    "l5",
    "l6",
    "l9",
    "866",
    "mac",
    "koi",
    "gbk",
    "big5",
    "utf8",
    "koi8",
    "sjis",
    "ucs-2",
    "ms932",
    "cp866",
    "utf-8",
    "cp819",
    "ascii",
    "x-gbk",
    "greek",
    "cp1250",
    "cp1251",
    "latin1",
    "gb2312",
    "cp1252",
    "latin2",
    "cp1253",
    "latin3",
    "cp1254",
    "latin4",
    "cp1255",
    "csbig5",
    "latin5",
    "utf-16",
    "cp1256",
    "ibm866",
    "latin6",
    "cp1257",
    "cp1258",
    "greek8",
    "ibm819",
    "arabic",
    "visual",
    "korean",
    "euc-jp",
    "koi8-r",
    "koi8_r",
    "euc-kr",
    "x-sjis",
    "koi8-u",
    "hebrew",
    "tis-620",
    "gb18030",
    "ksc5601",
    "gb_2312",
    "dos-874",
    "cn-big5",
    "unicode",
    "chinese",
    "logical",
    "cskoi8r",
    "cseuckr",
    "koi8-ru",
    "x-cp1250",
    "ksc_5601",
    "x-cp1251",
    "iso88591",
    "csgb2312",
    "x-cp1252",
    "iso88592",
    "x-cp1253",
    "iso88593",
    "ecma-114",
    "x-cp1254",
    "iso88594",
    "x-cp1255",
    "iso88595",
    "x-x-big5",
    "x-cp1256",
    "csibm866",
    "iso88596",
    "x-cp1257",
    "iso88597",
    "asmo-708",
    "ecma-118",
    "elot_928",
    "x-cp1258",
    "iso88598",
    "iso88599",
    "cyrillic",
    "utf-16be",
    "utf-16le",
    "us-ascii",
    "ms_kanji",
    "x-euc-jp",
    "iso885910",
    "iso8859-1",
    "iso885911",
    "iso8859-2",
    "iso8859-3",
    "iso885913",
    "iso8859-4",
    "iso885914",
    "iso8859-5",
    "iso885915",
    "iso8859-6",
    "iso8859-7",
    "iso8859-8",
    "iso-ir-58",
    "iso8859-9",
    "csunicode",
    "macintosh",
    "shift-jis",
    "shift_jis",
    "iso-ir-100",
    "iso8859-10",
    "iso-ir-110",
    "gb_2312-80",
    "iso-8859-1",
    "iso_8859-1",
    "iso-ir-101",
    "iso8859-11",
    "iso-8859-2",
    "iso_8859-2",
    "hz-gb-2312",
    "iso-8859-3",
    "iso_8859-3",
    "iso8859-13",
    "iso-8859-4",
    "iso_8859-4",
    "iso8859-14",
    "iso-ir-144",
    "iso-8859-5",
    "iso_8859-5",
    "iso8859-15",
    "iso-8859-6",
    "iso_8859-6",
    "iso-ir-126",
    "iso-8859-7",
    "iso_8859-7",
    "iso-ir-127",
    "iso-ir-157",
    "iso-8859-8",
    "iso_8859-8",
    "iso-ir-138",
    "iso-ir-148",
    "iso-8859-9",
    "iso_8859-9",
    "iso-ir-109",
    "iso-ir-149",
    "big5-hkscs",
    "csshiftjis",
    "iso-8859-10",
    "iso-8859-11",
    "csisolatin1",
    "csisolatin2",
    "iso-8859-13",
    "csisolatin3",
    "iso-8859-14",
    "windows-874",
    "csisolatin4",
    "iso-8859-15",
    "iso_8859-15",
    "csisolatin5",
    "iso-8859-16",
    "csisolatin6",
    "windows-949",
    "csisolatin9",
    "csiso88596e",
    "csiso88598e",
    "unicodefffe",
    "unicodefeff",
    "csmacintosh",
    "csiso88596i",
    "csiso88598i",
    "windows-31j",
    "x-mac-roman",
    "iso-2022-cn",
    "iso-2022-jp",
    "csiso2022jp",
    "iso-2022-kr",
    "csiso2022kr",
    "replacement",
    "windows-1250",
    "windows-1251",
    "windows-1252",
    "windows-1253",
    "windows-1254",
    "windows-1255",
    "windows-1256",
    "windows-1257",
    "windows-1258",
    "iso-8859-6-e",
    "iso-8859-8-e",
    "iso-8859-6-i",
    "iso-8859-8-i",
    "sun_eu_greek",
    "csksc56011987",
    "unicode20utf8",
    "unicode11utf8",
    "ks_c_5601-1987",
    "ansi_x3.4-1968",
    "ks_c_5601-1989",
    "x-mac-cyrillic",
    "x-user-defined",
    "csiso58gb231280",
    "iso-10646-ucs-2",
    "iso_8859-1:1987",
    "iso_8859-2:1987",
    "iso_8859-6:1987",
    "iso_8859-7:1987",
    "iso_8859-3:1988",
    "iso_8859-4:1988",
    "iso_8859-5:1988",
    "iso_8859-8:1988",
    "x-unicode20utf8",
    "iso_8859-9:1989",
    "csisolatingreek",
    "x-mac-ukrainian",
    "iso-2022-cn-ext",
    "csisolatinarabic",
    "csisolatinhebrew",
    "unicode-1-1-utf-8",
    "csisolatincyrillic",
    "cseucpkdfmtjapanese",
];
2407
// The encoding for each label in `LABELS_SORTED`, index for index:
// `Encoding::for_label()` finds a label's position in `LABELS_SORTED` and
// returns the entry at the same position of this array, so the two arrays
// must be kept in step.
//
// The `_INIT` items are referenced here because `static`s of the type
// `&'static Encoding` cannot be used to initialize the items of an array
// (see the documentation of `Encoding`).
static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; 228] = [
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_15_INIT,
    &IBM866_INIT,
    &MACINTOSH_INIT,
    &KOI8_R_INIT,
    &GBK_INIT,
    &BIG5_INIT,
    &UTF_8_INIT,
    &KOI8_R_INIT,
    &SHIFT_JIS_INIT,
    &UTF_16LE_INIT,
    &SHIFT_JIS_INIT,
    &IBM866_INIT,
    &UTF_8_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1250_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_1253_INIT,
    &ISO_8859_3_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1255_INIT,
    &BIG5_INIT,
    &WINDOWS_1254_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1256_INIT,
    &IBM866_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_1257_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &EUC_KR_INIT,
    &EUC_JP_INIT,
    &KOI8_R_INIT,
    &KOI8_R_INIT,
    &EUC_KR_INIT,
    &SHIFT_JIS_INIT,
    &KOI8_U_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_874_INIT,
    &GB18030_INIT,
    &EUC_KR_INIT,
    &GBK_INIT,
    &WINDOWS_874_INIT,
    &BIG5_INIT,
    &UTF_16LE_INIT,
    &GBK_INIT,
    &ISO_8859_8_I_INIT,
    &KOI8_R_INIT,
    &EUC_KR_INIT,
    &KOI8_U_INIT,
    &WINDOWS_1250_INIT,
    &EUC_KR_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_1253_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_6_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1255_INIT,
    &ISO_8859_5_INIT,
    &BIG5_INIT,
    &WINDOWS_1256_INIT,
    &IBM866_INIT,
    &ISO_8859_6_INIT,
    &WINDOWS_1257_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_5_INIT,
    &UTF_16BE_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1252_INIT,
    &SHIFT_JIS_INIT,
    &EUC_JP_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_14_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_8_INIT,
    &GBK_INIT,
    &WINDOWS_1254_INIT,
    &UTF_16LE_INIT,
    &MACINTOSH_INIT,
    &SHIFT_JIS_INIT,
    &SHIFT_JIS_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_4_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_2_INIT,
    &REPLACEMENT_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_14_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_3_INIT,
    &EUC_KR_INIT,
    &BIG5_INIT,
    &SHIFT_JIS_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_874_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_14_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_15_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_16_INIT,
    &ISO_8859_10_INIT,
    &EUC_KR_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &UTF_16BE_INIT,
    &UTF_16LE_INIT,
    &MACINTOSH_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_I_INIT,
    &SHIFT_JIS_INIT,
    &MACINTOSH_INIT,
    &REPLACEMENT_INIT,
    &ISO_2022_JP_INIT,
    &ISO_2022_JP_INIT,
    &REPLACEMENT_INIT,
    &REPLACEMENT_INIT,
    &REPLACEMENT_INIT,
    &WINDOWS_1250_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1253_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1255_INIT,
    &WINDOWS_1256_INIT,
    &WINDOWS_1257_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_I_INIT,
    &ISO_8859_7_INIT,
    &EUC_KR_INIT,
    &UTF_8_INIT,
    &UTF_8_INIT,
    &EUC_KR_INIT,
    &WINDOWS_1252_INIT,
    &EUC_KR_INIT,
    &X_MAC_CYRILLIC_INIT,
    &X_USER_DEFINED_INIT,
    &GBK_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_8_INIT,
    &UTF_8_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_7_INIT,
    &X_MAC_CYRILLIC_INIT,
    &REPLACEMENT_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &UTF_8_INIT,
    &ISO_8859_5_INIT,
    &EUC_JP_INIT,
];
2638
2639// END GENERATED CODE
2640
/// An encoding as defined in the [Encoding Standard][1].
///
/// An _encoding_ defines a mapping from a `u8` sequence to a `char` sequence
/// and, in most cases, vice versa. Each encoding has a name, an output
/// encoding, and one or more labels.
///
/// _Labels_ are ASCII-case-insensitive strings that are used to identify an
/// encoding in formats and protocols. The _name_ of the encoding is the
/// preferred label in the case appropriate for returning from the
/// [`characterSet`][2] property of the `Document` DOM interface.
///
/// The _output encoding_ is the encoding used for form submission and URL
/// parsing on Web pages in the encoding. This is UTF-8 for the replacement,
/// UTF-16LE and UTF-16BE encodings and the encoding itself for other
/// encodings.
///
/// [1]: https://encoding.spec.whatwg.org/
/// [2]: https://dom.spec.whatwg.org/#dom-document-characterset
///
/// # Streaming vs. Non-Streaming
///
/// When you have the entire input in a single buffer, you can use the
/// methods [`decode()`][3], [`decode_with_bom_removal()`][4],
/// [`decode_without_bom_handling()`][5],
/// [`decode_without_bom_handling_and_without_replacement()`][6] and
/// [`encode()`][7]. (These methods are available to Rust callers only and are
/// not available in the C API.) Unlike the rest of the API available to Rust,
/// these methods perform heap allocations. You should use the `Decoder` and
/// `Encoder` objects when your input is split into multiple buffers or when
/// you want to control the allocation of the output buffers.
///
/// [3]: #method.decode
/// [4]: #method.decode_with_bom_removal
/// [5]: #method.decode_without_bom_handling
/// [6]: #method.decode_without_bom_handling_and_without_replacement
/// [7]: #method.encode
///
/// # Instances
///
/// All instances of `Encoding` are statically allocated and have the `'static`
/// lifetime. There is precisely one unique `Encoding` instance for each
/// encoding defined in the Encoding Standard.
///
/// To obtain a reference to a particular encoding whose identity you know at
/// compile time, use a `static` that refers to the encoding. There is a
/// `static` for each encoding. The `static`s are named in all caps with hyphens
/// replaced with underscores (and in C/C++ have `_ENCODING` appended to the
/// name). For example, if you know at compile time that you will want to
/// decode using the UTF-8 encoding, use the `UTF_8` `static` (`UTF_8_ENCODING`
/// in C/C++).
///
/// Additionally, there are non-reference-typed forms ending with `_INIT` to
/// work around the problem that `static`s of the type `&'static Encoding`
/// cannot be used to initialize items of an array whose type is
/// `[&'static Encoding; N]`.
///
/// If you don't know what encoding you need at compile time and need to
/// dynamically get an encoding by label, use
/// <code>Encoding::<a href="#method.for_label">for_label</a>(<var>label</var>)</code>.
///
/// Instances of `Encoding` can be compared with `==` (in both Rust and in
/// C/C++).
pub struct Encoding {
    // The name of the encoding (e.g. "windows-1252"); returned as-is by
    // `name()`.
    name: &'static str,
    // The kind of encoding plus its per-encoding data, e.g.
    // `VariantEncoding::SingleByte(..)` or `VariantEncoding::UserDefined`.
    // NOTE(review): presumably what the decoder/encoder constructors
    // dispatch on; confirm against `VariantEncoding`.
    variant: VariantEncoding,
}
2707
2708impl Encoding {
2709 /// Implements the
2710 /// [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get)
2711 /// algorithm.
2712 ///
2713 /// If, after ASCII-lowercasing and removing leading and trailing
2714 /// whitespace, the argument matches a label defined in the Encoding
2715 /// Standard, `Some(&'static Encoding)` representing the corresponding
2716 /// encoding is returned. If there is no match, `None` is returned.
2717 ///
2718 /// This is the right method to use if the action upon the method returning
2719 /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`) instead.
2720 /// When the action upon the method returning `None` is not to proceed with
2721 /// a fallback but to refuse processing, `for_label_no_replacement()` is more
2722 /// appropriate.
2723 ///
2724 /// The argument is of type `&[u8]` instead of `&str` to save callers
2725 /// that are extracting the label from a non-UTF-8 protocol the trouble
2726 /// of conversion to UTF-8. (If you have a `&str`, just call `.as_bytes()`
2727 /// on it.)
2728 ///
2729 /// Available via the C wrapper.
2730 ///
2731 /// # Example
2732 /// ```
2733 /// use encoding_rs::Encoding;
2734 ///
2735 /// assert_eq!(Some(encoding_rs::UTF_8), Encoding::for_label(b"utf-8"));
2736 /// assert_eq!(Some(encoding_rs::UTF_8), Encoding::for_label(b"unicode11utf8"));
2737 ///
2738 /// assert_eq!(Some(encoding_rs::ISO_8859_2), Encoding::for_label(b"latin2"));
2739 ///
2740 /// assert_eq!(Some(encoding_rs::UTF_16BE), Encoding::for_label(b"utf-16be"));
2741 ///
2742 /// assert_eq!(None, Encoding::for_label(b"unrecognized label"));
2743 /// ```
2744 pub fn for_label(label: &[u8]) -> Option<&'static Encoding> {
2745 let mut trimmed = [0u8; LONGEST_LABEL_LENGTH];
2746 let mut trimmed_pos = 0usize;
2747 let mut iter = label.into_iter();
2748 // before
2749 loop {
2750 match iter.next() {
2751 None => {
2752 return None;
2753 }
2754 Some(byte) => {
2755 // The characters used in labels are:
2756 // a-z (except q, but excluding it below seems excessive)
2757 // 0-9
2758 // . _ - :
2759 match *byte {
2760 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2761 continue;
2762 }
2763 b'A'..=b'Z' => {
2764 trimmed[trimmed_pos] = *byte + 0x20u8;
2765 trimmed_pos = 1usize;
2766 break;
2767 }
2768 b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => {
2769 trimmed[trimmed_pos] = *byte;
2770 trimmed_pos = 1usize;
2771 break;
2772 }
2773 _ => {
2774 return None;
2775 }
2776 }
2777 }
2778 }
2779 }
2780 // inside
2781 loop {
2782 match iter.next() {
2783 None => {
2784 break;
2785 }
2786 Some(byte) => {
2787 match *byte {
2788 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2789 break;
2790 }
2791 b'A'..=b'Z' => {
2792 if trimmed_pos == LONGEST_LABEL_LENGTH {
2793 // There's no encoding with a label this long
2794 return None;
2795 }
2796 trimmed[trimmed_pos] = *byte + 0x20u8;
2797 trimmed_pos += 1usize;
2798 continue;
2799 }
2800 b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => {
2801 if trimmed_pos == LONGEST_LABEL_LENGTH {
2802 // There's no encoding with a label this long
2803 return None;
2804 }
2805 trimmed[trimmed_pos] = *byte;
2806 trimmed_pos += 1usize;
2807 continue;
2808 }
2809 _ => {
2810 return None;
2811 }
2812 }
2813 }
2814 }
2815 }
2816 // after
2817 loop {
2818 match iter.next() {
2819 None => {
2820 break;
2821 }
2822 Some(byte) => {
2823 match *byte {
2824 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2825 continue;
2826 }
2827 _ => {
2828 // There's no label with space in the middle
2829 return None;
2830 }
2831 }
2832 }
2833 }
2834 }
2835 let candidate = &trimmed[..trimmed_pos];
2836 match LABELS_SORTED.binary_search_by(|probe| {
2837 let bytes = probe.as_bytes();
2838 let c = bytes.len().cmp(&candidate.len());
2839 if c != Ordering::Equal {
2840 return c;
2841 }
2842 let probe_iter = bytes.iter().rev();
2843 let candidate_iter = candidate.iter().rev();
2844 probe_iter.cmp(candidate_iter)
2845 }) {
2846 Ok(i) => Some(ENCODINGS_IN_LABEL_SORT[i]),
2847 Err(_) => None,
2848 }
2849 }
2850
2851 /// This method behaves the same as `for_label()`, except when `for_label()`
2852 /// would return `Some(REPLACEMENT)`, this method returns `None` instead.
2853 ///
2854 /// This method is useful in scenarios where a fatal error is required
2855 /// upon invalid label, because in those cases the caller typically wishes
2856 /// to treat the labels that map to the replacement encoding as fatal
2857 /// errors, too.
2858 ///
2859 /// It is not OK to use this method when the action upon the method returning
2860 /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`). In such a
2861 /// case, the `for_label()` method should be used instead in order to avoid
2862 /// unsafe fallback for labels that `for_label()` maps to `Some(REPLACEMENT)`.
2863 ///
2864 /// Available via the C wrapper.
2865 #[inline]
2866 pub fn for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding> {
2867 match Encoding::for_label(label) {
2868 None => None,
2869 Some(encoding) => {
2870 if encoding == REPLACEMENT {
2871 None
2872 } else {
2873 Some(encoding)
2874 }
2875 }
2876 }
2877 }
2878
2879 /// Performs non-incremental BOM sniffing.
2880 ///
2881 /// The argument must either be a buffer representing the entire input
2882 /// stream (non-streaming case) or a buffer representing at least the first
2883 /// three bytes of the input stream (streaming case).
2884 ///
2885 /// Returns `Some((UTF_8, 3))`, `Some((UTF_16LE, 2))` or
2886 /// `Some((UTF_16BE, 2))` if the argument starts with the UTF-8, UTF-16LE
2887 /// or UTF-16BE BOM or `None` otherwise.
2888 ///
2889 /// Available via the C wrapper.
2890 #[inline]
2891 pub fn for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)> {
2892 if buffer.starts_with(b"\xEF\xBB\xBF") {
2893 Some((UTF_8, 3))
2894 } else if buffer.starts_with(b"\xFF\xFE") {
2895 Some((UTF_16LE, 2))
2896 } else if buffer.starts_with(b"\xFE\xFF") {
2897 Some((UTF_16BE, 2))
2898 } else {
2899 None
2900 }
2901 }
2902
2903 /// Returns the name of this encoding.
2904 ///
2905 /// This name is appropriate to return as-is from the DOM
2906 /// `document.characterSet` property.
2907 ///
2908 /// Available via the C wrapper.
2909 #[inline]
    pub fn name(&'static self) -> &'static str {
        // Plain accessor: the stored `name` field is handed out unchanged.
        self.name
    }
2913
2914 /// Checks whether the _output encoding_ of this encoding can encode every
2915 /// `char`. (Only true if the output encoding is UTF-8.)
2916 ///
2917 /// Available via the C wrapper.
2918 #[inline]
    pub fn can_encode_everything(&'static self) -> bool {
        // Test the *output* encoding rather than `self`: UTF-16BE, UTF-16LE
        // and replacement have UTF-8 as their output encoding (see
        // `output_encoding()`), so they qualify as well as UTF-8 itself.
        self.output_encoding() == UTF_8
    }
2922
2923 /// Checks whether the bytes 0x00...0x7F map exclusively to the characters
2924 /// U+0000...U+007F and vice versa.
2925 ///
2926 /// Available via the C wrapper.
2927 #[inline]
2928 pub fn is_ascii_compatible(&'static self) -> bool {
2929 !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE || self == ISO_2022_JP)
2930 }
2931
2932 /// Checks whether this encoding maps one byte to one Basic Multilingual
2933 /// Plane code point (i.e. byte length equals decoded UTF-16 length) and
2934 /// vice versa (for mappable characters).
2935 ///
2936 /// `true` iff this encoding is on the list of [Legacy single-byte
2937 /// encodings](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
2938 /// in the spec or x-user-defined.
2939 ///
2940 /// Available via the C wrapper.
2941 #[inline]
    pub fn is_single_byte(&'static self) -> bool {
        // The answer is a property of the encoding's variant, so ask it.
        self.variant.is_single_byte()
    }
2945
2946 /// Checks whether the bytes 0x00...0x7F map mostly to the characters
2947 /// U+0000...U+007F and vice versa.
2948 #[cfg(feature = "alloc")]
2949 #[inline]
2950 fn is_potentially_borrowable(&'static self) -> bool {
2951 !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE)
2952 }
2953
2954 /// Returns the _output encoding_ of this encoding. This is UTF-8 for
2955 /// UTF-16BE, UTF-16LE, and replacement and the encoding itself otherwise.
2956 ///
2957 /// _Note:_ The _output encoding_ concept is needed for form submission and
2958 /// error handling in the query strings of URLs in the Web Platform.
2959 ///
2960 /// Available via the C wrapper.
2961 #[inline]
2962 pub fn output_encoding(&'static self) -> &'static Encoding {
2963 if self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE {
2964 UTF_8
2965 } else {
2966 self
2967 }
2968 }
2969
2970 /// Decode complete input to `Cow<'a, str>` _with BOM sniffing_ and with
2971 /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2972 /// entire input is available as a single buffer (i.e. the end of the
2973 /// buffer marks the end of the stream).
2974 ///
2975 /// The BOM, if any, does not appear in the output.
2976 ///
2977 /// This method implements the (non-streaming version of) the
2978 /// [_decode_](https://encoding.spec.whatwg.org/#decode) spec concept.
2979 ///
2980 /// The second item in the returned tuple is the encoding that was actually
2981 /// used (which may differ from this encoding thanks to BOM sniffing).
2982 ///
2983 /// The third item in the returned tuple indicates whether there were
2984 /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2985 ///
2986 /// _Note:_ It is wrong to use this when the input buffer represents only
2987 /// a segment of the input instead of the whole input. Use `new_decoder()`
2988 /// when decoding segmented input.
2989 ///
    /// This method performs one or two heap allocations for the backing
    /// buffer of the `String` when unable to borrow. (One allocation if there
    /// are no errors and potentially another one in the presence of errors.) The
2993 /// first allocation assumes jemalloc and may not be optimal with
2994 /// allocators that do not use power-of-two buckets. A borrow is performed
2995 /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2996 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2997 /// ISO-2022-JP and the input is entirely in the ASCII state without state
2998 /// transitions.
2999 ///
3000 /// # Panics
3001 ///
3002 /// If the size calculation for a heap-allocated backing buffer overflows
3003 /// `usize`.
3004 ///
3005 /// Available to Rust only and only with the `alloc` feature enabled (enabled
3006 /// by default).
3007 #[cfg(feature = "alloc")]
3008 #[inline]
3009 pub fn decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool) {
3010 let (encoding, without_bom) = match Encoding::for_bom(bytes) {
3011 Some((encoding, bom_length)) => (encoding, &bytes[bom_length..]),
3012 None => (self, bytes),
3013 };
3014 let (cow, had_errors) = encoding.decode_without_bom_handling(without_bom);
3015 (cow, encoding, had_errors)
3016 }
3017
3018 /// Decode complete input to `Cow<'a, str>` _with BOM removal_ and with
3019 /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
3020 /// entire input is available as a single buffer (i.e. the end of the
3021 /// buffer marks the end of the stream).
3022 ///
3023 /// Only an initial byte sequence that is a BOM for this encoding is removed.
3024 ///
3025 /// When invoked on `UTF_8`, this method implements the (non-streaming
3026 /// version of) the
3027 /// [_UTF-8 decode_](https://encoding.spec.whatwg.org/#utf-8-decode) spec
3028 /// concept.
3029 ///
3030 /// The second item in the returned pair indicates whether there were
3031 /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
3032 ///
3033 /// _Note:_ It is wrong to use this when the input buffer represents only
3034 /// a segment of the input instead of the whole input. Use
3035 /// `new_decoder_with_bom_removal()` when decoding segmented input.
3036 ///
    /// This method performs one or two heap allocations for the backing
    /// buffer of the `String` when unable to borrow. (One allocation if there
    /// are no errors and potentially another one in the presence of errors.) The
3040 /// first allocation assumes jemalloc and may not be optimal with
3041 /// allocators that do not use power-of-two buckets. A borrow is performed
3042 /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
3043 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3044 /// ISO-2022-JP and the input is entirely in the ASCII state without state
3045 /// transitions.
3046 ///
3047 /// # Panics
3048 ///
3049 /// If the size calculation for a heap-allocated backing buffer overflows
3050 /// `usize`.
3051 ///
3052 /// Available to Rust only and only with the `alloc` feature enabled (enabled
3053 /// by default).
3054 #[cfg(feature = "alloc")]
3055 #[inline]
3056 pub fn decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
3057 let without_bom = if self == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
3058 &bytes[3..]
3059 } else if (self == UTF_16LE && bytes.starts_with(b"\xFF\xFE"))
3060 || (self == UTF_16BE && bytes.starts_with(b"\xFE\xFF"))
3061 {
3062 &bytes[2..]
3063 } else {
3064 bytes
3065 };
3066 self.decode_without_bom_handling(without_bom)
3067 }
3068
3069 /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
3070 /// with malformed sequences replaced with the REPLACEMENT CHARACTER when
3071 /// the entire input is available as a single buffer (i.e. the end of the
3072 /// buffer marks the end of the stream).
3073 ///
3074 /// When invoked on `UTF_8`, this method implements the (non-streaming
3075 /// version of) the
3076 /// [_UTF-8 decode without BOM_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom)
3077 /// spec concept.
3078 ///
3079 /// The second item in the returned pair indicates whether there were
3080 /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
3081 ///
3082 /// _Note:_ It is wrong to use this when the input buffer represents only
3083 /// a segment of the input instead of the whole input. Use
3084 /// `new_decoder_without_bom_handling()` when decoding segmented input.
3085 ///
    /// This method performs one or two heap allocations for the backing
    /// buffer of the `String` when unable to borrow. (One allocation if there
    /// are no errors and potentially another one in the presence of errors.) The
3089 /// first allocation assumes jemalloc and may not be optimal with
3090 /// allocators that do not use power-of-two buckets. A borrow is performed
3091 /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
3092 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3093 /// ISO-2022-JP and the input is entirely in the ASCII state without state
3094 /// transitions.
3095 ///
3096 /// # Panics
3097 ///
3098 /// If the size calculation for a heap-allocated backing buffer overflows
3099 /// `usize`.
3100 ///
3101 /// Available to Rust only and only with the `alloc` feature enabled (enabled
3102 /// by default).
3103 #[cfg(feature = "alloc")]
3104 pub fn decode_without_bom_handling<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
3105 let (mut decoder, mut string, mut total_read) = if self.is_potentially_borrowable() {
3106 let valid_up_to = if self == UTF_8 {
3107 utf8_valid_up_to(bytes)
3108 } else if self == ISO_2022_JP {
3109 iso_2022_jp_ascii_valid_up_to(bytes)
3110 } else {
3111 ascii_valid_up_to(bytes)
3112 };
3113 if valid_up_to == bytes.len() {
3114 let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
3115 return (Cow::Borrowed(str), false);
3116 }
3117 let decoder = self.new_decoder_without_bom_handling();
3118
3119 let rounded_without_replacement = checked_next_power_of_two(checked_add(
3120 valid_up_to,
3121 decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3122 ));
3123 let with_replacement = checked_add(
3124 valid_up_to,
3125 decoder.max_utf8_buffer_length(bytes.len() - valid_up_to),
3126 );
3127 let mut string = String::with_capacity(
3128 checked_min(rounded_without_replacement, with_replacement).unwrap(),
3129 );
3130 unsafe {
3131 let vec = string.as_mut_vec();
3132 vec.set_len(valid_up_to);
3133 core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3134 }
3135 (decoder, string, valid_up_to)
3136 } else {
3137 let decoder = self.new_decoder_without_bom_handling();
3138 let rounded_without_replacement = checked_next_power_of_two(
3139 decoder.max_utf8_buffer_length_without_replacement(bytes.len()),
3140 );
3141 let with_replacement = decoder.max_utf8_buffer_length(bytes.len());
3142 let string = String::with_capacity(
3143 checked_min(rounded_without_replacement, with_replacement).unwrap(),
3144 );
3145 (decoder, string, 0)
3146 };
3147
3148 let mut total_had_errors = false;
3149 loop {
3150 let (result, read, had_errors) =
3151 decoder.decode_to_string(&bytes[total_read..], &mut string, true);
3152 total_read += read;
3153 total_had_errors |= had_errors;
3154 match result {
3155 CoderResult::InputEmpty => {
3156 debug_assert_eq!(total_read, bytes.len());
3157 return (Cow::Owned(string), total_had_errors);
3158 }
3159 CoderResult::OutputFull => {
3160 // Allocate for the worst case. That is, we should come
3161 // here at most once per invocation of this method.
3162 let needed = decoder.max_utf8_buffer_length(bytes.len() - total_read);
3163 string.reserve(needed.unwrap());
3164 }
3165 }
3166 }
3167 }
3168
3169 /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
3170 /// _with malformed sequences treated as fatal_ when the entire input is
3171 /// available as a single buffer (i.e. the end of the buffer marks the end
3172 /// of the stream).
3173 ///
3174 /// When invoked on `UTF_8`, this method implements the (non-streaming
3175 /// version of) the
3176 /// [_UTF-8 decode without BOM or fail_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
3177 /// spec concept.
3178 ///
3179 /// Returns `None` if a malformed sequence was encountered and the result
3180 /// of the decode as `Some(String)` otherwise.
3181 ///
3182 /// _Note:_ It is wrong to use this when the input buffer represents only
3183 /// a segment of the input instead of the whole input. Use
3184 /// `new_decoder_without_bom_handling()` when decoding segmented input.
3185 ///
3186 /// This method performs a single heap allocation for the backing
3187 /// buffer of the `String` when unable to borrow. A borrow is performed if
3188 /// decoding UTF-8 and the input is valid UTF-8, if decoding an
3189 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3190 /// ISO-2022-JP and the input is entirely in the ASCII state without state
3191 /// transitions.
3192 ///
3193 /// # Panics
3194 ///
3195 /// If the size calculation for a heap-allocated backing buffer overflows
3196 /// `usize`.
3197 ///
3198 /// Available to Rust only and only with the `alloc` feature enabled (enabled
3199 /// by default).
3200 #[cfg(feature = "alloc")]
3201 pub fn decode_without_bom_handling_and_without_replacement<'a>(
3202 &'static self,
3203 bytes: &'a [u8],
3204 ) -> Option<Cow<'a, str>> {
3205 if self == UTF_8 {
3206 let valid_up_to = utf8_valid_up_to(bytes);
3207 if valid_up_to == bytes.len() {
3208 let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
3209 return Some(Cow::Borrowed(str));
3210 }
3211 return None;
3212 }
3213 let (mut decoder, mut string, input) = if self.is_potentially_borrowable() {
3214 let valid_up_to = if self == ISO_2022_JP {
3215 iso_2022_jp_ascii_valid_up_to(bytes)
3216 } else {
3217 ascii_valid_up_to(bytes)
3218 };
3219 if valid_up_to == bytes.len() {
3220 let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
3221 return Some(Cow::Borrowed(str));
3222 }
3223 let decoder = self.new_decoder_without_bom_handling();
3224 let mut string = String::with_capacity(
3225 checked_add(
3226 valid_up_to,
3227 decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3228 )
3229 .unwrap(),
3230 );
3231 unsafe {
3232 let vec = string.as_mut_vec();
3233 vec.set_len(valid_up_to);
3234 core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3235 }
3236 (decoder, string, &bytes[valid_up_to..])
3237 } else {
3238 let decoder = self.new_decoder_without_bom_handling();
3239 let string = String::with_capacity(
3240 decoder
3241 .max_utf8_buffer_length_without_replacement(bytes.len())
3242 .unwrap(),
3243 );
3244 (decoder, string, bytes)
3245 };
3246 let (result, read) = decoder.decode_to_string_without_replacement(input, &mut string, true);
3247 match result {
3248 DecoderResult::InputEmpty => {
3249 debug_assert_eq!(read, input.len());
3250 Some(Cow::Owned(string))
3251 }
3252 DecoderResult::Malformed(_, _) => None,
3253 DecoderResult::OutputFull => unreachable!(),
3254 }
3255 }
3256
3257 /// Encode complete input to `Cow<'a, [u8]>` using the
3258 /// [_output encoding_](Encoding::output_encoding) of this encoding with
3259 /// unmappable characters replaced with decimal numeric character references
3260 /// when the entire input is available as a single buffer (i.e. the end of
3261 /// the buffer marks the end of the stream).
3262 ///
3263 /// This method implements the (non-streaming version of) the
3264 /// [_encode_](https://encoding.spec.whatwg.org/#encode) spec concept. For
3265 /// the [_UTF-8 encode_](https://encoding.spec.whatwg.org/#utf-8-encode)
3266 /// spec concept, it is slightly more efficient to use
3267 /// <code><var>string</var>.as_bytes()</code> instead of invoking this
3268 /// method on `UTF_8`.
3269 ///
3270 /// The second item in the returned tuple is the encoding that was actually
3271 /// used (*which may differ from this encoding thanks to some encodings
3272 /// having UTF-8 as their output encoding*).
3273 ///
3274 /// The third item in the returned tuple indicates whether there were
3275 /// unmappable characters (that were replaced with HTML numeric character
3276 /// references).
3277 ///
3278 /// _Note:_ It is wrong to use this when the input buffer represents only
3279 /// a segment of the input instead of the whole input. Use `new_encoder()`
3280 /// when encoding segmented output.
3281 ///
    /// When encoding to UTF-8 or when encoding an ASCII-only input to an
    /// ASCII-compatible encoding, this method returns a borrow of the input
3284 /// without a heap allocation. Otherwise, this method performs a single
3285 /// heap allocation for the backing buffer of the `Vec<u8>` if there are no
3286 /// unmappable characters and potentially multiple heap allocations if
3287 /// there are. These allocations are tuned for jemalloc and may not be
3288 /// optimal when using a different allocator that doesn't use power-of-two
3289 /// buckets.
3290 ///
3291 /// # Panics
3292 ///
3293 /// If the size calculation for a heap-allocated backing buffer overflows
3294 /// `usize`.
3295 ///
3296 /// Available to Rust only and only with the `alloc` feature enabled (enabled
3297 /// by default).
3298 #[cfg(feature = "alloc")]
3299 pub fn encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool) {
3300 let output_encoding = self.output_encoding();
3301 if output_encoding == UTF_8 {
3302 return (Cow::Borrowed(string.as_bytes()), output_encoding, false);
3303 }
3304 debug_assert!(output_encoding.is_potentially_borrowable());
3305 let bytes = string.as_bytes();
3306 let valid_up_to = if output_encoding == ISO_2022_JP {
3307 iso_2022_jp_ascii_valid_up_to(bytes)
3308 } else {
3309 ascii_valid_up_to(bytes)
3310 };
3311 if valid_up_to == bytes.len() {
3312 return (Cow::Borrowed(bytes), output_encoding, false);
3313 }
3314 let mut encoder = output_encoding.new_encoder();
3315 let mut vec: Vec<u8> = Vec::with_capacity(
3316 (checked_add(
3317 valid_up_to,
3318 encoder.max_buffer_length_from_utf8_if_no_unmappables(string.len() - valid_up_to),
3319 ))
3320 .unwrap()
3321 .next_power_of_two(),
3322 );
3323 unsafe {
3324 vec.set_len(valid_up_to);
3325 core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3326 }
3327 let mut total_read = valid_up_to;
3328 let mut total_had_errors = false;
3329 loop {
3330 let (result, read, had_errors) =
3331 encoder.encode_from_utf8_to_vec(&string[total_read..], &mut vec, true);
3332 total_read += read;
3333 total_had_errors |= had_errors;
3334 match result {
3335 CoderResult::InputEmpty => {
3336 debug_assert_eq!(total_read, string.len());
3337 return (Cow::Owned(vec), output_encoding, total_had_errors);
3338 }
3339 CoderResult::OutputFull => {
3340 // reserve_exact wants to know how much more on top of current
3341 // length--not current capacity.
3342 let needed = encoder
3343 .max_buffer_length_from_utf8_if_no_unmappables(string.len() - total_read);
3344 let rounded = (checked_add(vec.capacity(), needed))
3345 .unwrap()
3346 .next_power_of_two();
3347 let additional = rounded - vec.len();
3348 vec.reserve_exact(additional);
3349 }
3350 }
3351 }
3352 }
3353
    /// Creates the variant-specific decoder for this encoding by delegating
    /// to `self.variant`.
    ///
    /// The public `new_decoder*()` constructors hand the result to
    /// `Decoder::new` together with the BOM handling mode they select.
    fn new_variant_decoder(&'static self) -> VariantDecoder {
        self.variant.new_variant_decoder()
    }
3357
3358 /// Instantiates a new decoder for this encoding with BOM sniffing enabled.
3359 ///
3360 /// BOM sniffing may cause the returned decoder to morph into a decoder
3361 /// for UTF-8, UTF-16LE or UTF-16BE instead of this encoding. The BOM
3362 /// does not appear in the output.
3363 ///
3364 /// Available via the C wrapper.
3365 #[inline]
    pub fn new_decoder(&'static self) -> Decoder {
        // `BomHandling::Sniff` is what distinguishes this constructor from
        // its two siblings, which pass `Remove` and `Off` respectively.
        Decoder::new(self, self.new_variant_decoder(), BomHandling::Sniff)
    }
3369
3370 /// Instantiates a new decoder for this encoding with BOM removal.
3371 ///
3372 /// If the input starts with bytes that are the BOM for this encoding,
3373 /// those bytes are removed. However, the decoder never morphs into a
3374 /// decoder for another encoding: A BOM for another encoding is treated as
3375 /// (potentially malformed) input to the decoding algorithm for this
3376 /// encoding.
3377 ///
3378 /// Available via the C wrapper.
3379 #[inline]
    pub fn new_decoder_with_bom_removal(&'static self) -> Decoder {
        // `BomHandling::Remove`: strip only the BOM of this encoding; never
        // switch to another encoding.
        Decoder::new(self, self.new_variant_decoder(), BomHandling::Remove)
    }
3383
3384 /// Instantiates a new decoder for this encoding with BOM handling disabled.
3385 ///
3386 /// If the input starts with bytes that look like a BOM, those bytes are
3387 /// not treated as a BOM. (Hence, the decoder never morphs into a decoder
3388 /// for another encoding.)
3389 ///
3390 /// _Note:_ If the caller has performed BOM sniffing on its own but has not
3391 /// removed the BOM, the caller should use `new_decoder_with_bom_removal()`
3392 /// instead of this method to cause the BOM to be removed.
3393 ///
3394 /// Available via the C wrapper.
3395 #[inline]
    pub fn new_decoder_without_bom_handling(&'static self) -> Decoder {
        // `BomHandling::Off`: BOM-looking bytes are ordinary input.
        Decoder::new(self, self.new_variant_decoder(), BomHandling::Off)
    }
3399
3400 /// Instantiates a new encoder for the [_output encoding_](Encoding::output_encoding)
3401 /// of this encoding.
3402 ///
3403 /// _Note:_ The output encoding of UTF-16BE, UTF-16LE, and replacement is UTF-8. There
3404 /// is no encoder for UTF-16BE, UTF-16LE, and replacement themselves.
3405 ///
3406 /// Available via the C wrapper.
3407 #[inline]
3408 pub fn new_encoder(&'static self) -> Encoder {
3409 let enc = self.output_encoding();
3410 enc.variant.new_encoder(enc)
3411 }
3412
3413 /// Validates UTF-8.
3414 ///
3415 /// Returns the index of the first byte that makes the input malformed as
3416 /// UTF-8 or the length of the slice if the slice is entirely valid.
3417 ///
3418 /// This is currently faster than the corresponding standard library
3419 /// functionality. If this implementation gets upstreamed to the standard
3420 /// library, this method may be removed in the future.
3421 ///
3422 /// Available via the C wrapper.
    pub fn utf8_valid_up_to(bytes: &[u8]) -> usize {
        // The unqualified call resolves to the free function of the same
        // name that is in scope in this module, not to this associated
        // function (which would have to be spelled `Self::utf8_valid_up_to`).
        utf8_valid_up_to(bytes)
    }
3426
3427 /// Validates ASCII.
3428 ///
3429 /// Returns the index of the first byte that makes the input malformed as
3430 /// ASCII or the length of the slice if the slice is entirely valid.
3431 ///
3432 /// Available via the C wrapper.
    pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
        // The unqualified call resolves to the free function of the same
        // name that is in scope in this module, not to this associated
        // function.
        ascii_valid_up_to(bytes)
    }
3436
3437 /// Validates ISO-2022-JP ASCII-state data.
3438 ///
3439 /// Returns the index of the first byte that makes the input not
3440 /// representable in the ASCII state of ISO-2022-JP or the length of the
3441 /// slice if the slice is entirely representable in the ASCII state of
3442 /// ISO-2022-JP.
3443 ///
3444 /// Available via the C wrapper.
    pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
        // The unqualified call resolves to the free function of the same
        // name that is in scope in this module, not to this associated
        // function.
        iso_2022_jp_ascii_valid_up_to(bytes)
    }
3448}
3449
3450impl PartialEq for Encoding {
3451 #[inline]
3452 fn eq(&self, other: &Encoding) -> bool {
3453 (self as *const Encoding) == (other as *const Encoding)
3454 }
3455}
3456
// Address comparison (see the `PartialEq` impl) is reflexive, symmetric and
// transitive, so the stronger `Eq` marker holds as well.
impl Eq for Encoding {}
3458
/// Test-only ordering of `Encoding` objects by address.
#[cfg(test)]
impl PartialOrd for Encoding {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        // Addresses are totally ordered, so the partial order is simply the
        // total order of the `Ord` impl and never `None`.
        Some(Ord::cmp(self, other))
    }
}
3465
/// Test-only total order of `Encoding` objects by address.
#[cfg(test)]
impl Ord for Encoding {
    fn cmp(&self, other: &Self) -> Ordering {
        // Compare the numeric addresses of the two objects.
        let lhs = self as *const Encoding as usize;
        let rhs = other as *const Encoding as usize;
        lhs.cmp(&rhs)
    }
}
3472
3473impl Hash for Encoding {
3474 #[inline]
3475 fn hash<H: Hasher>(&self, state: &mut H) {
3476 (self as *const Encoding).hash(state);
3477 }
3478}
3479
3480impl core::fmt::Debug for Encoding {
3481 #[inline]
3482 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
3483 write!(f, "Encoding {{ {} }}", self.name)
3484 }
3485}
3486
/// Serializes an `Encoding` as a string containing its `name` field.
#[cfg(feature = "serde")]
impl Serialize for Encoding {
    #[inline]
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        // Only the name is written out, not the struct's other state.
        serializer.serialize_str(self.name)
    }
}
3497
/// Stateless serde visitor that turns a string holding an encoding label
/// into a `&'static Encoding`.
#[cfg(feature = "serde")]
struct EncodingVisitor;
3500
/// Accepts a string and resolves it with `Encoding::for_label`.
#[cfg(feature = "serde")]
impl<'de> Visitor<'de> for EncodingVisitor {
    type Value = &'static Encoding;

    /// Describes the expected input for serde's error messages.
    fn expecting(&self, formatter: &mut core::fmt::Formatter) -> core::fmt::Result {
        formatter.write_str("a valid encoding label")
    }

    /// Looks `value` up as an encoding label.
    ///
    /// Returns the matching encoding, or a custom serde error naming the
    /// offending label when `Encoding::for_label` does not recognize it.
    fn visit_str<E>(self, value: &str) -> Result<&'static Encoding, E>
    where
        E: serde::de::Error,
    {
        match Encoding::for_label(value.as_bytes()) {
            Some(encoding) => Ok(encoding),
            // `E::custom` accepts any `Display`, so the message is passed as
            // `format_args!` instead of first being rendered into a heap
            // `String`; the text is unchanged.
            None => Err(E::custom(format_args!(
                "invalid encoding label: {}",
                value
            ))),
        }
    }
}
3523
/// Deserializes a `&'static Encoding` from a string by handing
/// `EncodingVisitor` to the deserializer.
#[cfg(feature = "serde")]
impl<'de> Deserialize<'de> for &'static Encoding {
    fn deserialize<D>(deserializer: D) -> Result<&'static Encoding, D::Error>
    where
        D: Deserializer<'de>,
    {
        // The visitor resolves the string through `Encoding::for_label`.
        deserializer.deserialize_str(EncodingVisitor)
    }
}
3533
/// Tracks the life cycle of a decoder from BOM sniffing to conversion to end.
///
/// The `Seen*` states record a partially matched BOM. For reference, the
/// BOMs are `EF BB BF` (UTF-8), `FE FF` (UTF-16BE) and `FF FE` (UTF-16LE).
#[derive(PartialEq, Debug, Copy, Clone)]
enum DecoderLifeCycle {
    /// The decoder has seen no input yet.
    AtStart,
    /// The decoder has seen no input yet but expects UTF-8.
    AtUtf8Start,
    /// The decoder has seen no input yet but expects UTF-16BE.
    AtUtf16BeStart,
    /// The decoder has seen no input yet but expects UTF-16LE.
    AtUtf16LeStart,
    /// The decoder has seen EF (the first byte of the UTF-8 BOM).
    SeenUtf8First,
    /// The decoder has seen EF, BB (the first two bytes of the UTF-8 BOM).
    SeenUtf8Second,
    /// The decoder has seen FE (the first byte of the UTF-16BE BOM).
    SeenUtf16BeFirst,
    /// The decoder has seen FF (the first byte of the UTF-16LE BOM).
    SeenUtf16LeFirst,
    /// Saw EF, BB but not BF, there was a buffer boundary after BB and the
    /// underlying decoder reported EF as an error, so we need to remember to
    /// push BB before the next buffer.
    ConvertingWithPendingBB,
    /// No longer looking for a BOM and EOF not yet seen.
    Converting,
    /// EOF has been seen.
    Finished,
}
3562
/// Communicates the BOM handling mode to `Decoder::new`.
///
/// Each `Encoding::new_decoder*()` constructor passes exactly one of these.
#[derive(Debug, Copy, Clone)]
enum BomHandling {
    /// Don't handle the BOM (used by `new_decoder_without_bom_handling()`)
    Off,
    /// Sniff for UTF-8, UTF-16BE or UTF-16LE BOM (used by `new_decoder()`)
    Sniff,
    /// Remove the BOM only if it's the BOM for this encoding (used by
    /// `new_decoder_with_bom_removal()`)
    Remove,
}
3573
/// Result of a (potentially partial) decode or encode operation with
/// replacement.
///
/// Because replacement is performed automatically, there is no variant for
/// malformed or unmappable input; compare `DecoderResult`.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum CoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// conversion process has completed. Otherwise, the caller should call a
    /// decode or encode method again with more input.
    InputEmpty,

    /// The converter cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the converter.
    OutputFull,
}
3593
/// Result of a (potentially partial) decode operation without replacement.
///
/// Unlike `CoderResult`, this has a `Malformed` variant because the caller,
/// not the decoder, decides what to do about malformed input.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum DecoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// decoding process has completed. Otherwise, the caller should call a
    /// decode method again with more input.
    InputEmpty,

    /// The decoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the decoder.
    OutputFull,

    /// The decoder encountered a malformed byte sequence.
    ///
    /// The caller must either treat this as a fatal error or must append one
    /// REPLACEMENT CHARACTER (U+FFFD) to the output and then re-push
    /// the remaining input to the decoder.
    ///
    /// The first wrapped integer indicates the length of the malformed byte
    /// sequence. The second wrapped integer indicates the number of bytes
    /// that were consumed after the malformed sequence. If the second
    /// integer is zero, the last byte that was consumed is the last byte of
    /// the malformed sequence. Note that the malformed bytes may have been part
    /// of an earlier input buffer.
    ///
    /// The first wrapped integer can have values 1, 2, 3 or 4. The second
    /// wrapped integer can have values 0, 1, 2 or 3. The worst-case sum
    /// of the two is 6, which happens with ISO-2022-JP.
    Malformed(u8, u8), // u8 instead of usize to avoid useless bloat
}
3630
3631/// A converter that decodes a byte stream into Unicode according to a
3632/// character encoding in a streaming (incremental) manner.
3633///
3634/// The various `decode_*` methods take an input buffer (`src`) and an output
3635/// buffer `dst` both of which are caller-allocated. There are variants for
3636/// both UTF-8 and UTF-16 output buffers.
3637///
3638/// A `decode_*` method decodes bytes from `src` into Unicode characters stored
3639/// into `dst` until one of the following three things happens:
3640///
3641/// 1. A malformed byte sequence is encountered (`*_without_replacement`
3642/// variants only).
3643///
3644/// 2. The output buffer has been filled so near capacity that the decoder
3645/// cannot be sure that processing an additional byte of input wouldn't
3646/// cause so much output that the output buffer would overflow.
3647///
3648/// 3. All the input bytes have been processed.
3649///
/// The `decode_*` method then returns a tuple of a status indicating which one
3651/// of the three reasons to return happened, how many input bytes were read,
3652/// how many output code units (`u8` when decoding into UTF-8 and `u16`
3653/// when decoding to UTF-16) were written (except when decoding into `String`,
3654/// whose length change indicates this), and in the case of the
3655/// variants performing replacement, a boolean indicating whether an error was
3656/// replaced with the REPLACEMENT CHARACTER during the call.
3657///
3658/// The number of bytes "written" is what's logically written. Garbage may be
3659/// written in the output buffer beyond the point logically written to.
3660/// Therefore, if you wish to decode into an `&mut str`, you should use the
3661/// methods that take an `&mut str` argument instead of the ones that take an
3662/// `&mut [u8]` argument. The former take care of overwriting the trailing
3663/// garbage to ensure the UTF-8 validity of the `&mut str` as a whole, but the
3664/// latter don't.
3665///
3666/// In the case of the `*_without_replacement` variants, the status is a
3667/// [`DecoderResult`][1] enumeration (possibilities `Malformed`, `OutputFull` and
3668/// `InputEmpty` corresponding to the three cases listed above).
3669///
3670/// In the case of methods whose name does not end with
3671/// `*_without_replacement`, malformed sequences are automatically replaced
3672/// with the REPLACEMENT CHARACTER and errors do not cause the methods to
3673/// return early.
3674///
3675/// When decoding to UTF-8, the output buffer must have at least 4 bytes of
3676/// space. When decoding to UTF-16, the output buffer must have at least two
3677/// UTF-16 code units (`u16`) of space.
3678///
3679/// When decoding to UTF-8 without replacement, the methods are guaranteed
3680/// not to return indicating that more output space is needed if the length
3681/// of the output buffer is at least the length returned by
3682/// [`max_utf8_buffer_length_without_replacement()`][2]. When decoding to UTF-8
3683/// with replacement, the length of the output buffer that guarantees the
3684/// methods not to return indicating that more output space is needed is given
3685/// by [`max_utf8_buffer_length()`][3]. When decoding to UTF-16 with
3686/// or without replacement, the length of the output buffer that guarantees
3687/// the methods not to return indicating that more output space is needed is
3688/// given by [`max_utf16_buffer_length()`][4].
3689///
3690/// The output written into `dst` is guaranteed to be valid UTF-8 or UTF-16,
3691/// and the output after each `decode_*` call is guaranteed to consist of
3692/// complete characters. (I.e. the code unit sequence for the last character is
3693/// guaranteed not to be split across output buffers.)
3694///
3695/// The boolean argument `last` indicates that the end of the stream is reached
3696/// when all the bytes in `src` have been consumed.
3697///
3698/// A `Decoder` object can be used to incrementally decode a byte stream.
3699///
3700/// During the processing of a single stream, the caller must call `decode_*`
3701/// zero or more times with `last` set to `false` and then call `decode_*` at
3702/// least once with `last` set to `true`. If `decode_*` returns `InputEmpty`,
3703/// the processing of the stream has ended. Otherwise, the caller must call
3704/// `decode_*` again with `last` set to `true` (or treat a `Malformed` result as
3705/// a fatal error).
3706///
3707/// Once the stream has ended, the `Decoder` object must not be used anymore.
3708/// That is, you need to create another one to process another stream.
3709///
3710/// When the decoder returns `OutputFull` or the decoder returns `Malformed` and
3711/// the caller does not wish to treat it as a fatal error, the input buffer
3712/// `src` may not have been completely consumed. In that case, the caller must
3713/// pass the unconsumed contents of `src` to `decode_*` again upon the next
3714/// call.
3715///
3716/// [1]: enum.DecoderResult.html
3717/// [2]: #method.max_utf8_buffer_length_without_replacement
3718/// [3]: #method.max_utf8_buffer_length
3719/// [4]: #method.max_utf16_buffer_length
3720///
3721/// # Infinite loops
3722///
3723/// When converting with a fixed-size output buffer whose size is too small to
3724/// accommodate one character or (when applicable) one numeric character
3725/// reference of output, an infinite loop ensues. When converting with a
3726/// fixed-size output buffer, it generally makes sense to make the buffer
3727/// fairly large (e.g. couple of kilobytes).
pub struct Decoder {
    /// The encoding this decoder is for; this is what `encoding()` returns.
    encoding: &'static Encoding,
    /// The encoding-specific decoder that the buffer length queries delegate
    /// to.
    variant: VariantDecoder,
    /// Life-cycle state of the decoder. The initial value is chosen in
    /// `new()` from the requested `BomHandling`, and the `Finished` state
    /// makes the query methods panic.
    life_cycle: DecoderLifeCycle,
}
3733
3734impl Decoder {
3735 fn new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder {
3736 Decoder {
3737 encoding: enc,
3738 variant: decoder,
3739 life_cycle: match sniffing {
3740 BomHandling::Off => DecoderLifeCycle::Converting,
3741 BomHandling::Sniff => DecoderLifeCycle::AtStart,
3742 BomHandling::Remove => {
3743 if enc == UTF_8 {
3744 DecoderLifeCycle::AtUtf8Start
3745 } else if enc == UTF_16BE {
3746 DecoderLifeCycle::AtUtf16BeStart
3747 } else if enc == UTF_16LE {
3748 DecoderLifeCycle::AtUtf16LeStart
3749 } else {
3750 DecoderLifeCycle::Converting
3751 }
3752 }
3753 },
3754 }
3755 }
3756
3757 /// The `Encoding` this `Decoder` is for.
3758 ///
3759 /// BOM sniffing can change the return value of this method during the life
3760 /// of the decoder.
3761 ///
3762 /// Available via the C wrapper.
3763 #[inline]
3764 pub fn encoding(&self) -> &'static Encoding {
3765 self.encoding
3766 }
3767
3768 /// Query the worst-case UTF-8 output size _with replacement_.
3769 ///
3770 /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3771 /// that will not overflow given the current state of the decoder and
3772 /// `byte_length` number of additional input bytes when decoding with
3773 /// errors handled by outputting a REPLACEMENT CHARACTER for each malformed
3774 /// sequence or `None` if `usize` would overflow.
3775 ///
3776 /// Available via the C wrapper.
3777 pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
3778 // Need to consider a) the decoder morphing due to the BOM and b) a partial
3779 // BOM getting pushed to the underlying decoder.
3780 match self.life_cycle {
3781 DecoderLifeCycle::Converting
3782 | DecoderLifeCycle::AtUtf8Start
3783 | DecoderLifeCycle::AtUtf16LeStart
3784 | DecoderLifeCycle::AtUtf16BeStart => {
3785 return self.variant.max_utf8_buffer_length(byte_length);
3786 }
3787 DecoderLifeCycle::AtStart => {
3788 if let Some(utf8_bom) = checked_add(3, byte_length.checked_mul(3)) {
3789 if let Some(utf16_bom) = checked_add(
3790 1,
3791 checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3792 ) {
3793 let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
3794 let encoding = self.encoding();
3795 if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3796 // No need to consider the internal state of the underlying decoder,
3797 // because it is at start, because no data has reached it yet.
3798 return Some(utf_bom);
3799 } else if let Some(non_bom) =
3800 self.variant.max_utf8_buffer_length(byte_length)
3801 {
3802 return Some(core::cmp::max(utf_bom, non_bom));
3803 }
3804 }
3805 }
3806 }
3807 DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3808 // Add two bytes even when only one byte has been seen,
3809 // because the one byte can become a lead byte in multibyte
3810 // decoders, but only after the decoder has been queried
3811 // for max length, so the decoder's own logic for adding
3812 // one for a pending lead cannot work.
3813 if let Some(sum) = byte_length.checked_add(2) {
3814 if let Some(utf8_bom) = checked_add(3, sum.checked_mul(3)) {
3815 if self.encoding() == UTF_8 {
3816 // No need to consider the internal state of the underlying decoder,
3817 // because it is at start, because no data has reached it yet.
3818 return Some(utf8_bom);
3819 } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3820 return Some(core::cmp::max(utf8_bom, non_bom));
3821 }
3822 }
3823 }
3824 }
3825 DecoderLifeCycle::ConvertingWithPendingBB => {
3826 if let Some(sum) = byte_length.checked_add(2) {
3827 return self.variant.max_utf8_buffer_length(sum);
3828 }
3829 }
3830 DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3831 // Add two bytes even when only one byte has been seen,
3832 // because the one byte can become a lead byte in multibyte
3833 // decoders, but only after the decoder has been queried
3834 // for max length, so the decoder's own logic for adding
3835 // one for a pending lead cannot work.
3836 if let Some(sum) = byte_length.checked_add(2) {
3837 if let Some(utf16_bom) =
3838 checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3839 {
3840 let encoding = self.encoding();
3841 if encoding == UTF_16LE || encoding == UTF_16BE {
3842 // No need to consider the internal state of the underlying decoder,
3843 // because it is at start, because no data has reached it yet.
3844 return Some(utf16_bom);
3845 } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3846 return Some(core::cmp::max(utf16_bom, non_bom));
3847 }
3848 }
3849 }
3850 }
3851 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3852 }
3853 None
3854 }
3855
3856 /// Query the worst-case UTF-8 output size _without replacement_.
3857 ///
3858 /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3859 /// that will not overflow given the current state of the decoder and
3860 /// `byte_length` number of additional input bytes when decoding without
3861 /// replacement error handling or `None` if `usize` would overflow.
3862 ///
3863 /// Note that this value may be too small for the `_with_replacement` case.
3864 /// Use `max_utf8_buffer_length()` for that case.
3865 ///
3866 /// Available via the C wrapper.
3867 pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
3868 // Need to consider a) the decoder morphing due to the BOM and b) a partial
3869 // BOM getting pushed to the underlying decoder.
3870 match self.life_cycle {
3871 DecoderLifeCycle::Converting
3872 | DecoderLifeCycle::AtUtf8Start
3873 | DecoderLifeCycle::AtUtf16LeStart
3874 | DecoderLifeCycle::AtUtf16BeStart => {
3875 return self
3876 .variant
3877 .max_utf8_buffer_length_without_replacement(byte_length);
3878 }
3879 DecoderLifeCycle::AtStart => {
3880 if let Some(utf8_bom) = byte_length.checked_add(3) {
3881 if let Some(utf16_bom) = checked_add(
3882 1,
3883 checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3884 ) {
3885 let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
3886 let encoding = self.encoding();
3887 if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3888 // No need to consider the internal state of the underlying decoder,
3889 // because it is at start, because no data has reached it yet.
3890 return Some(utf_bom);
3891 } else if let Some(non_bom) = self
3892 .variant
3893 .max_utf8_buffer_length_without_replacement(byte_length)
3894 {
3895 return Some(core::cmp::max(utf_bom, non_bom));
3896 }
3897 }
3898 }
3899 }
3900 DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3901 // Add two bytes even when only one byte has been seen,
3902 // because the one byte can become a lead byte in multibyte
3903 // decoders, but only after the decoder has been queried
3904 // for max length, so the decoder's own logic for adding
3905 // one for a pending lead cannot work.
3906 if let Some(sum) = byte_length.checked_add(2) {
3907 if let Some(utf8_bom) = sum.checked_add(3) {
3908 if self.encoding() == UTF_8 {
3909 // No need to consider the internal state of the underlying decoder,
3910 // because it is at start, because no data has reached it yet.
3911 return Some(utf8_bom);
3912 } else if let Some(non_bom) =
3913 self.variant.max_utf8_buffer_length_without_replacement(sum)
3914 {
3915 return Some(core::cmp::max(utf8_bom, non_bom));
3916 }
3917 }
3918 }
3919 }
3920 DecoderLifeCycle::ConvertingWithPendingBB => {
3921 if let Some(sum) = byte_length.checked_add(2) {
3922 return self.variant.max_utf8_buffer_length_without_replacement(sum);
3923 }
3924 }
3925 DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3926 // Add two bytes even when only one byte has been seen,
3927 // because the one byte can become a lead byte in multibyte
3928 // decoders, but only after the decoder has been queried
3929 // for max length, so the decoder's own logic for adding
3930 // one for a pending lead cannot work.
3931 if let Some(sum) = byte_length.checked_add(2) {
3932 if let Some(utf16_bom) =
3933 checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3934 {
3935 let encoding = self.encoding();
3936 if encoding == UTF_16LE || encoding == UTF_16BE {
3937 // No need to consider the internal state of the underlying decoder,
3938 // because it is at start, because no data has reached it yet.
3939 return Some(utf16_bom);
3940 } else if let Some(non_bom) =
3941 self.variant.max_utf8_buffer_length_without_replacement(sum)
3942 {
3943 return Some(core::cmp::max(utf16_bom, non_bom));
3944 }
3945 }
3946 }
3947 }
3948 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3949 }
3950 None
3951 }
3952
3953 /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3954 /// replaced with the REPLACEMENT CHARACTER.
3955 ///
3956 /// See the documentation of the struct for documentation for `decode_*`
3957 /// methods collectively.
3958 ///
3959 /// Available via the C wrapper.
3960 pub fn decode_to_utf8(
3961 &mut self,
3962 src: &[u8],
3963 dst: &mut [u8],
3964 last: bool,
3965 ) -> (CoderResult, usize, usize, bool) {
3966 let mut had_errors = false;
3967 let mut total_read = 0usize;
3968 let mut total_written = 0usize;
3969 loop {
3970 let (result, read, written) = self.decode_to_utf8_without_replacement(
3971 &src[total_read..],
3972 &mut dst[total_written..],
3973 last,
3974 );
3975 total_read += read;
3976 total_written += written;
3977 match result {
3978 DecoderResult::InputEmpty => {
3979 return (
3980 CoderResult::InputEmpty,
3981 total_read,
3982 total_written,
3983 had_errors,
3984 );
3985 }
3986 DecoderResult::OutputFull => {
3987 return (
3988 CoderResult::OutputFull,
3989 total_read,
3990 total_written,
3991 had_errors,
3992 );
3993 }
3994 DecoderResult::Malformed(_, _) => {
3995 had_errors = true;
3996 // There should always be space for the U+FFFD, because
3997 // otherwise we'd have gotten OutputFull already.
3998 // XXX: is the above comment actually true for UTF-8 itself?
3999 // TODO: Consider having fewer bound checks here.
4000 dst[total_written] = 0xEFu8;
4001 total_written += 1;
4002 dst[total_written] = 0xBFu8;
4003 total_written += 1;
4004 dst[total_written] = 0xBDu8;
4005 total_written += 1;
4006 }
4007 }
4008 }
4009 }
4010
4011 /// Incrementally decode a byte stream into UTF-8 with malformed sequences
4012 /// replaced with the REPLACEMENT CHARACTER with type system signaling
4013 /// of UTF-8 validity.
4014 ///
4015 /// This methods calls `decode_to_utf8` and then zeroes
4016 /// out up to three bytes that aren't logically part of the write in order
4017 /// to retain the UTF-8 validity even for the unwritten part of the buffer.
4018 ///
4019 /// See the documentation of the struct for documentation for `decode_*`
4020 /// methods collectively.
4021 ///
4022 /// Available to Rust only.
4023 pub fn decode_to_str(
4024 &mut self,
4025 src: &[u8],
4026 dst: &mut str,
4027 last: bool,
4028 ) -> (CoderResult, usize, usize, bool) {
4029 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
4030 let (result, read, written, replaced) = self.decode_to_utf8(src, bytes, last);
4031 let len = bytes.len();
4032 let mut trail = written;
4033 // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
4034 // bytes of trailing garbage. No need to optimize non-ASCII-compatible
4035 // encodings to avoid overwriting here.
4036 if self.encoding != UTF_8 {
4037 let max = core::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
4038 while trail < max {
4039 bytes[trail] = 0;
4040 trail += 1;
4041 }
4042 }
4043 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
4044 bytes[trail] = 0;
4045 trail += 1;
4046 }
4047 (result, read, written, replaced)
4048 }
4049
4050 /// Incrementally decode a byte stream into UTF-8 with malformed sequences
4051 /// replaced with the REPLACEMENT CHARACTER using a `String` receiver.
4052 ///
4053 /// Like the others, this method follows the logic that the output buffer is
4054 /// caller-allocated. This method treats the capacity of the `String` as
4055 /// the output limit. That is, this method guarantees not to cause a
4056 /// reallocation of the backing buffer of `String`.
4057 ///
4058 /// The return value is a tuple that contains the `DecoderResult`, the
4059 /// number of bytes read and a boolean indicating whether replacements
4060 /// were done. The number of bytes written is signaled via the length of
4061 /// the `String` changing.
4062 ///
4063 /// See the documentation of the struct for documentation for `decode_*`
4064 /// methods collectively.
4065 ///
4066 /// Available to Rust only and only with the `alloc` feature enabled (enabled
4067 /// by default).
4068 #[cfg(feature = "alloc")]
4069 pub fn decode_to_string(
4070 &mut self,
4071 src: &[u8],
4072 dst: &mut String,
4073 last: bool,
4074 ) -> (CoderResult, usize, bool) {
4075 unsafe {
4076 let vec = dst.as_mut_vec();
4077 let old_len = vec.len();
4078 let capacity = vec.capacity();
4079 vec.set_len(capacity);
4080 let (result, read, written, replaced) =
4081 self.decode_to_utf8(src, &mut vec[old_len..], last);
4082 vec.set_len(old_len + written);
4083 (result, read, replaced)
4084 }
4085 }
4086
    // The UTF-8 `decode_*` methods in this `impl` call
    // `decode_to_utf8_without_replacement` and destructure its result as
    // `(DecoderResult, bytes read, bytes written)`.
    // NOTE(review): presumably `public_decode_function!` (defined elsewhere)
    // generates that public method from the doc comment below, with the
    // remaining identifiers naming the helper methods it generates or calls
    // and `u8` being the output code unit type -- confirm against the macro
    // definition.
    public_decode_function!(/// Incrementally decode a byte stream into UTF-8
                            /// _without replacement_.
                            ///
                            /// See the documentation of the struct for
                            /// documentation for `decode_*` methods
                            /// collectively.
                            ///
                            /// Available via the C wrapper.
                            ,
                            decode_to_utf8_without_replacement,
                            decode_to_utf8_raw,
                            decode_to_utf8_checking_end,
                            decode_to_utf8_after_one_potential_bom_byte,
                            decode_to_utf8_after_two_potential_bom_bytes,
                            decode_to_utf8_checking_end_with_offset,
                            u8);
4103
4104 /// Incrementally decode a byte stream into UTF-8 with type system signaling
4105 /// of UTF-8 validity.
4106 ///
4107 /// This methods calls `decode_to_utf8` and then zeroes out up to three
4108 /// bytes that aren't logically part of the write in order to retain the
4109 /// UTF-8 validity even for the unwritten part of the buffer.
4110 ///
4111 /// See the documentation of the struct for documentation for `decode_*`
4112 /// methods collectively.
4113 ///
4114 /// Available to Rust only.
4115 pub fn decode_to_str_without_replacement(
4116 &mut self,
4117 src: &[u8],
4118 dst: &mut str,
4119 last: bool,
4120 ) -> (DecoderResult, usize, usize) {
4121 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
4122 let (result, read, written) = self.decode_to_utf8_without_replacement(src, bytes, last);
4123 let len = bytes.len();
4124 let mut trail = written;
4125 // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
4126 // bytes of trailing garbage. No need to optimize non-ASCII-compatible
4127 // encodings to avoid overwriting here.
4128 if self.encoding != UTF_8 {
4129 let max = core::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
4130 while trail < max {
4131 bytes[trail] = 0;
4132 trail += 1;
4133 }
4134 }
4135 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
4136 bytes[trail] = 0;
4137 trail += 1;
4138 }
4139 (result, read, written)
4140 }
4141
4142 /// Incrementally decode a byte stream into UTF-8 using a `String` receiver.
4143 ///
4144 /// Like the others, this method follows the logic that the output buffer is
4145 /// caller-allocated. This method treats the capacity of the `String` as
4146 /// the output limit. That is, this method guarantees not to cause a
4147 /// reallocation of the backing buffer of `String`.
4148 ///
4149 /// The return value is a pair that contains the `DecoderResult` and the
4150 /// number of bytes read. The number of bytes written is signaled via
4151 /// the length of the `String` changing.
4152 ///
4153 /// See the documentation of the struct for documentation for `decode_*`
4154 /// methods collectively.
4155 ///
4156 /// Available to Rust only and only with the `alloc` feature enabled (enabled
4157 /// by default).
4158 #[cfg(feature = "alloc")]
4159 pub fn decode_to_string_without_replacement(
4160 &mut self,
4161 src: &[u8],
4162 dst: &mut String,
4163 last: bool,
4164 ) -> (DecoderResult, usize) {
4165 unsafe {
4166 let vec = dst.as_mut_vec();
4167 let old_len = vec.len();
4168 let capacity = vec.capacity();
4169 vec.set_len(capacity);
4170 let (result, read, written) =
4171 self.decode_to_utf8_without_replacement(src, &mut vec[old_len..], last);
4172 vec.set_len(old_len + written);
4173 (result, read)
4174 }
4175 }
4176
4177 /// Query the worst-case UTF-16 output size (with or without replacement).
4178 ///
4179 /// Returns the size of the output buffer in UTF-16 code units (`u16`)
4180 /// that will not overflow given the current state of the decoder and
4181 /// `byte_length` number of additional input bytes or `None` if `usize`
4182 /// would overflow.
4183 ///
4184 /// Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
4185 /// return value of this method applies also in the
4186 /// `_without_replacement` case.
4187 ///
4188 /// Available via the C wrapper.
4189 pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
4190 // Need to consider a) the decoder morphing due to the BOM and b) a partial
4191 // BOM getting pushed to the underlying decoder.
4192 match self.life_cycle {
4193 DecoderLifeCycle::Converting
4194 | DecoderLifeCycle::AtUtf8Start
4195 | DecoderLifeCycle::AtUtf16LeStart
4196 | DecoderLifeCycle::AtUtf16BeStart => {
4197 return self.variant.max_utf16_buffer_length(byte_length);
4198 }
4199 DecoderLifeCycle::AtStart => {
4200 if let Some(utf8_bom) = byte_length.checked_add(1) {
4201 if let Some(utf16_bom) =
4202 checked_add(1, checked_div(byte_length.checked_add(1), 2))
4203 {
4204 let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
4205 let encoding = self.encoding();
4206 if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
4207 // No need to consider the internal state of the underlying decoder,
4208 // because it is at start, because no data has reached it yet.
4209 return Some(utf_bom);
4210 } else if let Some(non_bom) =
4211 self.variant.max_utf16_buffer_length(byte_length)
4212 {
4213 return Some(core::cmp::max(utf_bom, non_bom));
4214 }
4215 }
4216 }
4217 }
4218 DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
4219 // Add two bytes even when only one byte has been seen,
4220 // because the one byte can become a lead byte in multibyte
4221 // decoders, but only after the decoder has been queried
4222 // for max length, so the decoder's own logic for adding
4223 // one for a pending lead cannot work.
4224 if let Some(sum) = byte_length.checked_add(2) {
4225 if let Some(utf8_bom) = sum.checked_add(1) {
4226 if self.encoding() == UTF_8 {
4227 // No need to consider the internal state of the underlying decoder,
4228 // because it is at start, because no data has reached it yet.
4229 return Some(utf8_bom);
4230 } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4231 return Some(core::cmp::max(utf8_bom, non_bom));
4232 }
4233 }
4234 }
4235 }
4236 DecoderLifeCycle::ConvertingWithPendingBB => {
4237 if let Some(sum) = byte_length.checked_add(2) {
4238 return self.variant.max_utf16_buffer_length(sum);
4239 }
4240 }
4241 DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
4242 // Add two bytes even when only one byte has been seen,
4243 // because the one byte can become a lead byte in multibyte
4244 // decoders, but only after the decoder has been queried
4245 // for max length, so the decoder's own logic for adding
4246 // one for a pending lead cannot work.
4247 if let Some(sum) = byte_length.checked_add(2) {
4248 if let Some(utf16_bom) = checked_add(1, checked_div(sum.checked_add(1), 2)) {
4249 let encoding = self.encoding();
4250 if encoding == UTF_16LE || encoding == UTF_16BE {
4251 // No need to consider the internal state of the underlying decoder,
4252 // because it is at start, because no data has reached it yet.
4253 return Some(utf16_bom);
4254 } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4255 return Some(core::cmp::max(utf16_bom, non_bom));
4256 }
4257 }
4258 }
4259 }
4260 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4261 }
4262 None
4263 }
4264
4265 /// Incrementally decode a byte stream into UTF-16 with malformed sequences
4266 /// replaced with the REPLACEMENT CHARACTER.
4267 ///
4268 /// See the documentation of the struct for documentation for `decode_*`
4269 /// methods collectively.
4270 ///
4271 /// Available via the C wrapper.
4272 pub fn decode_to_utf16(
4273 &mut self,
4274 src: &[u8],
4275 dst: &mut [u16],
4276 last: bool,
4277 ) -> (CoderResult, usize, usize, bool) {
4278 let mut had_errors = false;
4279 let mut total_read = 0usize;
4280 let mut total_written = 0usize;
4281 loop {
4282 let (result, read, written) = self.decode_to_utf16_without_replacement(
4283 &src[total_read..],
4284 &mut dst[total_written..],
4285 last,
4286 );
4287 total_read += read;
4288 total_written += written;
4289 match result {
4290 DecoderResult::InputEmpty => {
4291 return (
4292 CoderResult::InputEmpty,
4293 total_read,
4294 total_written,
4295 had_errors,
4296 );
4297 }
4298 DecoderResult::OutputFull => {
4299 return (
4300 CoderResult::OutputFull,
4301 total_read,
4302 total_written,
4303 had_errors,
4304 );
4305 }
4306 DecoderResult::Malformed(_, _) => {
4307 had_errors = true;
4308 // There should always be space for the U+FFFD, because
4309 // otherwise we'd have gotten OutputFull already.
4310 dst[total_written] = 0xFFFD;
4311 total_written += 1;
4312 }
4313 }
4314 }
4315 }
4316
    // `decode_to_utf16` calls `decode_to_utf16_without_replacement` and
    // destructures its result as
    // `(DecoderResult, bytes read, code units written)`.
    // NOTE(review): presumably `public_decode_function!` (defined elsewhere)
    // generates that public method from the doc comment below, with the
    // remaining identifiers naming the helper methods it generates or calls
    // and `u16` being the output code unit type -- confirm against the macro
    // definition.
    public_decode_function!(/// Incrementally decode a byte stream into UTF-16
                            /// _without replacement_.
                            ///
                            /// See the documentation of the struct for
                            /// documentation for `decode_*` methods
                            /// collectively.
                            ///
                            /// Available via the C wrapper.
                            ,
                            decode_to_utf16_without_replacement,
                            decode_to_utf16_raw,
                            decode_to_utf16_checking_end,
                            decode_to_utf16_after_one_potential_bom_byte,
                            decode_to_utf16_after_two_potential_bom_bytes,
                            decode_to_utf16_checking_end_with_offset,
                            u16);
4333
4334 /// Checks for compatibility with storing Unicode scalar values as unsigned
4335 /// bytes taking into account the state of the decoder.
4336 ///
4337 /// Returns `None` if the decoder is not in a neutral state, including waiting
4338 /// for the BOM, or if the encoding is never Latin1-byte-compatible.
4339 ///
4340 /// Otherwise returns the index of the first byte whose unsigned value doesn't
4341 /// directly correspond to the decoded Unicode scalar value, or the length
4342 /// of the input if all bytes in the input decode directly to scalar values
4343 /// corresponding to the unsigned byte values.
4344 ///
4345 /// Does not change the state of the decoder.
4346 ///
4347 /// Do not use this unless you are supporting SpiderMonkey/V8-style string
4348 /// storage optimizations.
4349 ///
4350 /// Available via the C wrapper.
4351 pub fn latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize> {
4352 match self.life_cycle {
4353 DecoderLifeCycle::Converting => {
4354 return self.variant.latin1_byte_compatible_up_to(bytes);
4355 }
4356 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4357 _ => None,
4358 }
4359 }
4360}
4361
/// Result of a (potentially partial) encode operation without replacement.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum EncoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// encoding process has completed. Otherwise, the caller should call an
    /// encode method again with more input.
    InputEmpty,

    /// The encoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the encoder.
    OutputFull,

    /// The encoder encountered an unmappable character.
    ///
    /// The caller must either treat this as a fatal error or must append
    /// a placeholder to the output and then re-push the remaining input to the
    /// encoder.
    Unmappable(char),
}

impl EncoderResult {
    /// Wraps the BMP code unit `bmp` in `EncoderResult::Unmappable`.
    ///
    /// # Panics
    ///
    /// Panics if `bmp` is not a Unicode scalar value (i.e. it is a surrogate
    /// code unit), because the conversion to `char` is unwrapped.
    fn unmappable_from_bmp(bmp: u16) -> EncoderResult {
        let code_point = u32::from(bmp);
        let scalar = ::core::char::from_u32(code_point).unwrap();
        EncoderResult::Unmappable(scalar)
    }
}
4393
4394/// A converter that encodes a Unicode stream into bytes according to a
4395/// character encoding in a streaming (incremental) manner.
4396///
4397/// The various `encode_*` methods take an input buffer (`src`) and an output
4398/// buffer `dst` both of which are caller-allocated. There are variants for
4399/// both UTF-8 and UTF-16 input buffers.
4400///
/// An `encode_*` method encodes characters from `src` into bytes stored into
/// `dst` until one of the following three things happens:
4403///
4404/// 1. An unmappable character is encountered (`*_without_replacement` variants
4405/// only).
4406///
/// 2. The output buffer has been filled so near capacity that the encoder
4408/// cannot be sure that processing an additional character of input wouldn't
4409/// cause so much output that the output buffer would overflow.
4410///
4411/// 3. All the input characters have been processed.
4412///
/// The `encode_*` method then returns a tuple of a status indicating which one
4414/// of the three reasons to return happened, how many input code units (`u8`
4415/// when encoding from UTF-8 and `u16` when encoding from UTF-16) were read,
4416/// how many output bytes were written (except when encoding into `Vec<u8>`,
4417/// whose length change indicates this), and in the case of the variants that
4418/// perform replacement, a boolean indicating whether an unmappable
4419/// character was replaced with a numeric character reference during the call.
4420///
4421/// The number of bytes "written" is what's logically written. Garbage may be
4422/// written in the output buffer beyond the point logically written to.
4423///
4424/// In the case of the methods whose name ends with
4425/// `*_without_replacement`, the status is an [`EncoderResult`][1] enumeration
4426/// (possibilities `Unmappable`, `OutputFull` and `InputEmpty` corresponding to
4427/// the three cases listed above).
4428///
4429/// In the case of methods whose name does not end with
4430/// `*_without_replacement`, unmappable characters are automatically replaced
4431/// with the corresponding numeric character references and unmappable
4432/// characters do not cause the methods to return early.
4433///
4434/// When encoding from UTF-8 without replacement, the methods are guaranteed
4435/// not to return indicating that more output space is needed if the length
4436/// of the output buffer is at least the length returned by
4437/// [`max_buffer_length_from_utf8_without_replacement()`][2]. When encoding from
4438/// UTF-8 with replacement, the length of the output buffer that guarantees the
4439/// methods not to return indicating that more output space is needed in the
4440/// absence of unmappable characters is given by
4441/// [`max_buffer_length_from_utf8_if_no_unmappables()`][3]. When encoding from
4442/// UTF-16 without replacement, the methods are guaranteed not to return
4443/// indicating that more output space is needed if the length of the output
4444/// buffer is at least the length returned by
4445/// [`max_buffer_length_from_utf16_without_replacement()`][4]. When encoding
/// from UTF-16 with replacement, the length of the output buffer that
4447/// guarantees the methods not to return indicating that more output space is
4448/// needed in the absence of unmappable characters is given by
4449/// [`max_buffer_length_from_utf16_if_no_unmappables()`][5].
4450/// When encoding with replacement, applications are not expected to size the
4451/// buffer for the worst case ahead of time but to resize the buffer if there
4452/// are unmappable characters. This is why max length queries are only available
4453/// for the case where there are no unmappable characters.
4454///
4455/// When encoding from UTF-8, each `src` buffer _must_ be valid UTF-8. (When
4456/// calling from Rust, the type system takes care of this.) When encoding from
4457/// UTF-16, unpaired surrogates in the input are treated as U+FFFD REPLACEMENT
4458/// CHARACTERS. Therefore, in order for astral characters not to turn into a
4459/// pair of REPLACEMENT CHARACTERS, the caller must ensure that surrogate pairs
4460/// are not split across input buffer boundaries.
4461///
4462/// After an `encode_*` call returns, the output produced so far, taken as a
4463/// whole from the start of the stream, is guaranteed to consist of a valid
4464/// byte sequence in the target encoding. (I.e. the code unit sequence for a
4465/// character is guaranteed not to be split across output buffers. However, due
4466/// to the stateful nature of ISO-2022-JP, the stream needs to be considered
4467/// from the start for it to be valid. For other encodings, the validity holds
4468/// on a per-output buffer basis.)
4469///
4470/// The boolean argument `last` indicates that the end of the stream is reached
4471/// when all the characters in `src` have been consumed. This argument is needed
4472/// for ISO-2022-JP and is ignored for other encodings.
4473///
4474/// An `Encoder` object can be used to incrementally encode a byte stream.
4475///
4476/// During the processing of a single stream, the caller must call `encode_*`
4477/// zero or more times with `last` set to `false` and then call `encode_*` at
4478/// least once with `last` set to `true`. If `encode_*` returns `InputEmpty`,
4479/// the processing of the stream has ended. Otherwise, the caller must call
4480/// `encode_*` again with `last` set to `true` (or treat an `Unmappable` result
4481/// as a fatal error).
4482///
4483/// Once the stream has ended, the `Encoder` object must not be used anymore.
4484/// That is, you need to create another one to process another stream.
4485///
4486/// When the encoder returns `OutputFull` or the encoder returns `Unmappable`
4487/// and the caller does not wish to treat it as a fatal error, the input buffer
4488/// `src` may not have been completely consumed. In that case, the caller must
4489/// pass the unconsumed contents of `src` to `encode_*` again upon the next
4490/// call.
4491///
4492/// [1]: enum.EncoderResult.html
4493/// [2]: #method.max_buffer_length_from_utf8_without_replacement
4494/// [3]: #method.max_buffer_length_from_utf8_if_no_unmappables
4495/// [4]: #method.max_buffer_length_from_utf16_without_replacement
4496/// [5]: #method.max_buffer_length_from_utf16_if_no_unmappables
4497///
4498/// # Infinite loops
4499///
4500/// When converting with a fixed-size output buffer whose size is too small to
4501/// accommodate one character of output, an infinite loop ensues. When
4502/// converting with a fixed-size output buffer, it generally makes sense to
/// make the buffer fairly large (e.g. a couple of kilobytes).
pub struct Encoder {
    /// The `Encoding` this `Encoder` is for; handed back by `encoding()`.
    encoding: &'static Encoding,
    /// The `VariantEncoder` that the `encode_*`, `max_buffer_length_*` and
    /// `has_pending_state` methods delegate to.
    variant: VariantEncoder,
}
4508
4509impl Encoder {
4510 fn new(enc: &'static Encoding, encoder: VariantEncoder) -> Encoder {
4511 Encoder {
4512 encoding: enc,
4513 variant: encoder,
4514 }
4515 }
4516
4517 /// The `Encoding` this `Encoder` is for.
4518 #[inline]
4519 pub fn encoding(&self) -> &'static Encoding {
4520 self.encoding
4521 }
4522
4523 /// Returns `true` if this is an ISO-2022-JP encoder that's not in the
4524 /// ASCII state and `false` otherwise.
4525 #[inline]
4526 pub fn has_pending_state(&self) -> bool {
4527 self.variant.has_pending_state()
4528 }
4529
4530 /// Query the worst-case output size when encoding from UTF-8 with
4531 /// replacement.
4532 ///
4533 /// Returns the size of the output buffer in bytes that will not overflow
4534 /// given the current state of the encoder and `byte_length` number of
4535 /// additional input code units if there are no unmappable characters in
4536 /// the input or `None` if `usize` would overflow.
4537 ///
4538 /// Available via the C wrapper.
4539 pub fn max_buffer_length_from_utf8_if_no_unmappables(
4540 &self,
4541 byte_length: usize,
4542 ) -> Option<usize> {
4543 checked_add(
4544 if self.encoding().can_encode_everything() {
4545 0
4546 } else {
4547 NCR_EXTRA
4548 },
4549 self.max_buffer_length_from_utf8_without_replacement(byte_length),
4550 )
4551 }
4552
4553 /// Query the worst-case output size when encoding from UTF-8 without
4554 /// replacement.
4555 ///
4556 /// Returns the size of the output buffer in bytes that will not overflow
4557 /// given the current state of the encoder and `byte_length` number of
4558 /// additional input code units or `None` if `usize` would overflow.
4559 ///
4560 /// Available via the C wrapper.
4561 pub fn max_buffer_length_from_utf8_without_replacement(
4562 &self,
4563 byte_length: usize,
4564 ) -> Option<usize> {
4565 self.variant
4566 .max_buffer_length_from_utf8_without_replacement(byte_length)
4567 }
4568
4569 /// Incrementally encode into byte stream from UTF-8 with unmappable
4570 /// characters replaced with HTML (decimal) numeric character references.
4571 ///
4572 /// See the documentation of the struct for documentation for `encode_*`
4573 /// methods collectively.
4574 ///
4575 /// Available via the C wrapper.
4576 pub fn encode_from_utf8(
4577 &mut self,
4578 src: &str,
4579 dst: &mut [u8],
4580 last: bool,
4581 ) -> (CoderResult, usize, usize, bool) {
4582 let dst_len = dst.len();
4583 let effective_dst_len = if self.encoding().can_encode_everything() {
4584 dst_len
4585 } else {
4586 if dst_len < NCR_EXTRA {
4587 if src.is_empty() && !(last && self.has_pending_state()) {
4588 return (CoderResult::InputEmpty, 0, 0, false);
4589 }
4590 return (CoderResult::OutputFull, 0, 0, false);
4591 }
4592 dst_len - NCR_EXTRA
4593 };
4594 let mut had_unmappables = false;
4595 let mut total_read = 0usize;
4596 let mut total_written = 0usize;
4597 loop {
4598 let (result, read, written) = self.encode_from_utf8_without_replacement(
4599 &src[total_read..],
4600 &mut dst[total_written..effective_dst_len],
4601 last,
4602 );
4603 total_read += read;
4604 total_written += written;
4605 match result {
4606 EncoderResult::InputEmpty => {
4607 return (
4608 CoderResult::InputEmpty,
4609 total_read,
4610 total_written,
4611 had_unmappables,
4612 );
4613 }
4614 EncoderResult::OutputFull => {
4615 return (
4616 CoderResult::OutputFull,
4617 total_read,
4618 total_written,
4619 had_unmappables,
4620 );
4621 }
4622 EncoderResult::Unmappable(unmappable) => {
4623 had_unmappables = true;
4624 debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4625 debug_assert_ne!(self.encoding(), UTF_16BE);
4626 debug_assert_ne!(self.encoding(), UTF_16LE);
4627 // Additionally, Iso2022JpEncoder is responsible for
4628 // transitioning to ASCII when returning with Unmappable.
4629 total_written += write_ncr(unmappable, &mut dst[total_written..]);
4630 if total_written >= effective_dst_len {
4631 if total_read == src.len() && !(last && self.has_pending_state()) {
4632 return (
4633 CoderResult::InputEmpty,
4634 total_read,
4635 total_written,
4636 had_unmappables,
4637 );
4638 }
4639 return (
4640 CoderResult::OutputFull,
4641 total_read,
4642 total_written,
4643 had_unmappables,
4644 );
4645 }
4646 }
4647 }
4648 }
4649 }
4650
4651 /// Incrementally encode into byte stream from UTF-8 with unmappable
4652 /// characters replaced with HTML (decimal) numeric character references.
4653 ///
4654 /// See the documentation of the struct for documentation for `encode_*`
4655 /// methods collectively.
4656 ///
4657 /// Available to Rust only and only with the `alloc` feature enabled (enabled
4658 /// by default).
4659 #[cfg(feature = "alloc")]
4660 pub fn encode_from_utf8_to_vec(
4661 &mut self,
4662 src: &str,
4663 dst: &mut Vec<u8>,
4664 last: bool,
4665 ) -> (CoderResult, usize, bool) {
4666 unsafe {
4667 let old_len = dst.len();
4668 let capacity = dst.capacity();
4669 dst.set_len(capacity);
4670 let (result, read, written, replaced) =
4671 self.encode_from_utf8(src, &mut dst[old_len..], last);
4672 dst.set_len(old_len + written);
4673 (result, read, replaced)
4674 }
4675 }
4676
4677 /// Incrementally encode into byte stream from UTF-8 _without replacement_.
4678 ///
4679 /// See the documentation of the struct for documentation for `encode_*`
4680 /// methods collectively.
4681 ///
4682 /// Available via the C wrapper.
4683 pub fn encode_from_utf8_without_replacement(
4684 &mut self,
4685 src: &str,
4686 dst: &mut [u8],
4687 last: bool,
4688 ) -> (EncoderResult, usize, usize) {
4689 self.variant.encode_from_utf8_raw(src, dst, last)
4690 }
4691
4692 /// Incrementally encode into byte stream from UTF-8 _without replacement_.
4693 ///
4694 /// See the documentation of the struct for documentation for `encode_*`
4695 /// methods collectively.
4696 ///
4697 /// Available to Rust only and only with the `alloc` feature enabled (enabled
4698 /// by default).
4699 #[cfg(feature = "alloc")]
4700 pub fn encode_from_utf8_to_vec_without_replacement(
4701 &mut self,
4702 src: &str,
4703 dst: &mut Vec<u8>,
4704 last: bool,
4705 ) -> (EncoderResult, usize) {
4706 unsafe {
4707 let old_len = dst.len();
4708 let capacity = dst.capacity();
4709 dst.set_len(capacity);
4710 let (result, read, written) =
4711 self.encode_from_utf8_without_replacement(src, &mut dst[old_len..], last);
4712 dst.set_len(old_len + written);
4713 (result, read)
4714 }
4715 }
4716
4717 /// Query the worst-case output size when encoding from UTF-16 with
4718 /// replacement.
4719 ///
4720 /// Returns the size of the output buffer in bytes that will not overflow
4721 /// given the current state of the encoder and `u16_length` number of
4722 /// additional input code units if there are no unmappable characters in
4723 /// the input or `None` if `usize` would overflow.
4724 ///
4725 /// Available via the C wrapper.
4726 pub fn max_buffer_length_from_utf16_if_no_unmappables(
4727 &self,
4728 u16_length: usize,
4729 ) -> Option<usize> {
4730 checked_add(
4731 if self.encoding().can_encode_everything() {
4732 0
4733 } else {
4734 NCR_EXTRA
4735 },
4736 self.max_buffer_length_from_utf16_without_replacement(u16_length),
4737 )
4738 }
4739
4740 /// Query the worst-case output size when encoding from UTF-16 without
4741 /// replacement.
4742 ///
4743 /// Returns the size of the output buffer in bytes that will not overflow
4744 /// given the current state of the encoder and `u16_length` number of
4745 /// additional input code units or `None` if `usize` would overflow.
4746 ///
4747 /// Available via the C wrapper.
4748 pub fn max_buffer_length_from_utf16_without_replacement(
4749 &self,
4750 u16_length: usize,
4751 ) -> Option<usize> {
4752 self.variant
4753 .max_buffer_length_from_utf16_without_replacement(u16_length)
4754 }
4755
4756 /// Incrementally encode into byte stream from UTF-16 with unmappable
4757 /// characters replaced with HTML (decimal) numeric character references.
4758 ///
4759 /// See the documentation of the struct for documentation for `encode_*`
4760 /// methods collectively.
4761 ///
4762 /// Available via the C wrapper.
4763 pub fn encode_from_utf16(
4764 &mut self,
4765 src: &[u16],
4766 dst: &mut [u8],
4767 last: bool,
4768 ) -> (CoderResult, usize, usize, bool) {
4769 let dst_len = dst.len();
4770 let effective_dst_len = if self.encoding().can_encode_everything() {
4771 dst_len
4772 } else {
4773 if dst_len < NCR_EXTRA {
4774 if src.is_empty() && !(last && self.has_pending_state()) {
4775 return (CoderResult::InputEmpty, 0, 0, false);
4776 }
4777 return (CoderResult::OutputFull, 0, 0, false);
4778 }
4779 dst_len - NCR_EXTRA
4780 };
4781 let mut had_unmappables = false;
4782 let mut total_read = 0usize;
4783 let mut total_written = 0usize;
4784 loop {
4785 let (result, read, written) = self.encode_from_utf16_without_replacement(
4786 &src[total_read..],
4787 &mut dst[total_written..effective_dst_len],
4788 last,
4789 );
4790 total_read += read;
4791 total_written += written;
4792 match result {
4793 EncoderResult::InputEmpty => {
4794 return (
4795 CoderResult::InputEmpty,
4796 total_read,
4797 total_written,
4798 had_unmappables,
4799 );
4800 }
4801 EncoderResult::OutputFull => {
4802 return (
4803 CoderResult::OutputFull,
4804 total_read,
4805 total_written,
4806 had_unmappables,
4807 );
4808 }
4809 EncoderResult::Unmappable(unmappable) => {
4810 had_unmappables = true;
4811 debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4812 // There are no UTF-16 encoders and even if there were,
4813 // they'd never have unmappables.
4814 debug_assert_ne!(self.encoding(), UTF_16BE);
4815 debug_assert_ne!(self.encoding(), UTF_16LE);
4816 // Additionally, Iso2022JpEncoder is responsible for
4817 // transitioning to ASCII when returning with Unmappable
4818 // from the jis0208 state. That is, when we encode
4819 // ISO-2022-JP and come here, the encoder is in either the
4820 // ASCII or the Roman state. We are allowed to generate any
4821 // printable ASCII excluding \ and ~.
4822 total_written += write_ncr(unmappable, &mut dst[total_written..]);
4823 if total_written >= effective_dst_len {
4824 if total_read == src.len() && !(last && self.has_pending_state()) {
4825 return (
4826 CoderResult::InputEmpty,
4827 total_read,
4828 total_written,
4829 had_unmappables,
4830 );
4831 }
4832 return (
4833 CoderResult::OutputFull,
4834 total_read,
4835 total_written,
4836 had_unmappables,
4837 );
4838 }
4839 }
4840 }
4841 }
4842 }
4843
4844 /// Incrementally encode into byte stream from UTF-16 _without replacement_.
4845 ///
4846 /// See the documentation of the struct for documentation for `encode_*`
4847 /// methods collectively.
4848 ///
4849 /// Available via the C wrapper.
4850 pub fn encode_from_utf16_without_replacement(
4851 &mut self,
4852 src: &[u16],
4853 dst: &mut [u8],
4854 last: bool,
4855 ) -> (EncoderResult, usize, usize) {
4856 self.variant.encode_from_utf16_raw(src, dst, last)
4857 }
4858}
4859
/// Format an unmappable as NCR without heap allocation.
///
/// Writes `&#`, the decimal value of `unmappable` and `;` to the start of
/// `dst` and returns the number of bytes written (between 4 and 10).
///
/// # Panics
///
/// Panics, before writing anything, if `dst` is shorter than the reference.
fn write_ncr(unmappable: char, dst: &mut [u8]) -> usize {
    // len is the number of decimal digits needed to represent unmappable plus
    // 3 (the length of "&#" and ";").
    let mut number = unmappable as u32;
    let len = if number >= 1_000_000u32 {
        10usize
    } else if number >= 100_000u32 {
        9usize
    } else if number >= 10_000u32 {
        8usize
    } else if number >= 1_000u32 {
        7usize
    } else if number >= 100u32 {
        6usize
    } else if number >= 10u32 {
        // Review the outcome of https://github.com/whatwg/encoding/issues/15
        // to see if this case is possible
        5usize
    } else {
        // A single digit is not expected from the encoders, but sizing it
        // correctly keeps the output well-formed instead of leaving one
        // byte of the reference unwritten.
        4usize
    };
    debug_assert!(len <= dst.len());
    // Slicing up front performs the bounds check once, before any write.
    let out = &mut dst[..len];
    out[0] = b'&';
    out[1] = b'#';
    out[len - 1] = b';';
    // Fill the digit positions from the least significant digit leftwards.
    for slot in out[2..len - 1].iter_mut().rev() {
        *slot = (number % 10) as u8 + b'0';
        number /= 10;
    }
    len
}
4898
/// Tests whether `i` lies in the half-open range `start..end` with a single
/// comparison; values below `start` wrap around to large offsets.
#[inline(always)]
fn in_range16(i: u16, start: u16, end: u16) -> bool {
    let width = end - start;
    let offset = i.wrapping_sub(start);
    offset < width
}
4903
/// Tests whether `i` lies in the half-open range `start..end` with a single
/// comparison; values below `start` wrap around to large offsets.
#[inline(always)]
fn in_range32(i: u32, start: u32, end: u32) -> bool {
    let width = end - start;
    let offset = i.wrapping_sub(start);
    offset < width
}
4908
/// Tests whether `i` lies in the closed range `start..=end` with a single
/// comparison; values below `start` wrap around to large offsets.
#[inline(always)]
fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool {
    let width = end - start;
    let offset = i.wrapping_sub(start);
    offset <= width
}
4913
/// Tests whether `i` lies in the closed range `start..=end` with a single
/// comparison; values below `start` wrap around to large offsets.
#[inline(always)]
fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
    let width = end - start;
    let offset = i.wrapping_sub(start);
    offset <= width
}
4918
/// Tests whether `i` lies in the closed range `start..=end` with a single
/// comparison; values below `start` wrap around to large offsets.
#[inline(always)]
fn in_inclusive_range32(i: u32, start: u32, end: u32) -> bool {
    let width = end - start;
    let offset = i.wrapping_sub(start);
    offset <= width
}
4923
/// Tests whether `i` lies in the closed range `start..=end` with a single
/// comparison; values below `start` wrap around to large offsets.
#[inline(always)]
fn in_inclusive_range(i: usize, start: usize, end: usize) -> bool {
    let width = end - start;
    let offset = i.wrapping_sub(start);
    offset <= width
}
4928
/// Adds `num` to the value in `opt`, yielding `None` if `opt` is `None` or
/// the sum overflows `usize`.
#[inline(always)]
fn checked_add(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_add(num))
}
4937
4938#[inline(always)]
4939fn checked_add_opt(one: Option<usize>, other: Option<usize>) -> Option<usize> {
4940 if let Some(n) = one {
4941 checked_add(n, other)
4942 } else {
4943 None
4944 }
4945}
4946
/// Multiplies the value in `opt` by `num`, yielding `None` if `opt` is `None`
/// or the product overflows `usize`.
#[inline(always)]
fn checked_mul(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_mul(num))
}
4955
/// Divides the value in `opt` by `num`, yielding `None` if `opt` is `None` or
/// `num` is zero.
#[inline(always)]
fn checked_div(opt: Option<usize>, num: usize) -> Option<usize> {
    opt.and_then(|n| n.checked_div(num))
}
4964
/// Rounds the value in `opt` up to the next power of two.
///
/// Yields `None` if `opt` is `None` or if the next power of two does not fit
/// in `usize`. `usize::next_power_of_two` is avoided on purpose: on overflow
/// it panics in debug builds and returns 0 in release builds.
#[cfg(feature = "alloc")]
#[inline(always)]
fn checked_next_power_of_two(opt: Option<usize>) -> Option<usize> {
    opt.and_then(usize::checked_next_power_of_two)
}
4970
/// Returns the smaller of two optional values. A `None` operand is ignored,
/// so the result is `None` only when both operands are `None`.
#[cfg(feature = "alloc")]
#[inline(always)]
fn checked_min(one: Option<usize>, other: Option<usize>) -> Option<usize> {
    match (one, other) {
        (Some(a), Some(b)) => Some(::core::cmp::min(a, b)),
        (Some(a), None) => Some(a),
        (None, rest) => rest,
    }
}
4984
4985// ############## TESTS ###############
4986
/// Test-only struct, compiled only when testing with the `serde` feature.
/// It embeds an `&'static Encoding` next to ordinary fields and derives
/// `Serialize`/`Deserialize`.
// NOTE(review): presumably used by serde round-trip tests further down in
// this file; the usage is not visible here, confirm.
#[cfg(all(test, feature = "serde"))]
#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct Demo {
    num: u32,
    name: String,
    enc: &'static Encoding,
}
4994
4995#[cfg(test)]
4996mod test_labels_names;
4997
4998#[cfg(all(test, feature = "alloc"))]
4999mod tests {
5000 use super::*;
5001 use alloc::borrow::Cow;
5002
5003 fn sniff_to_utf16(
5004 initial_encoding: &'static Encoding,
5005 expected_encoding: &'static Encoding,
5006 bytes: &[u8],
5007 expect: &[u16],
5008 breaks: &[usize],
5009 ) {
5010 let mut decoder = initial_encoding.new_decoder();
5011
5012 let mut dest: Vec<u16> =
5013 Vec::with_capacity(decoder.max_utf16_buffer_length(bytes.len()).unwrap());
5014 let capacity = dest.capacity();
5015 dest.resize(capacity, 0u16);
5016
5017 let mut total_written = 0usize;
5018 let mut start = 0usize;
5019 for br in breaks {
5020 let (result, read, written, _) =
5021 decoder.decode_to_utf16(&bytes[start..*br], &mut dest[total_written..], false);
5022 total_written += written;
5023 assert_eq!(read, *br - start);
5024 match result {
5025 CoderResult::InputEmpty => {}
5026 CoderResult::OutputFull => {
5027 unreachable!();
5028 }
5029 }
5030 start = *br;
5031 }
5032 let (result, read, written, _) =
5033 decoder.decode_to_utf16(&bytes[start..], &mut dest[total_written..], true);
5034 total_written += written;
5035 match result {
5036 CoderResult::InputEmpty => {}
5037 CoderResult::OutputFull => {
5038 unreachable!();
5039 }
5040 }
5041 assert_eq!(read, bytes.len() - start);
5042 assert_eq!(total_written, expect.len());
5043 assert_eq!(&dest[..total_written], expect);
5044 assert_eq!(decoder.encoding(), expected_encoding);
5045 }
5046
5047 // Any copyright to the test code below this comment is dedicated to the
5048 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
5049
5050 #[test]
5051 fn test_bom_sniffing() {
5052 // ASCII
5053 sniff_to_utf16(
5054 WINDOWS_1252,
5055 WINDOWS_1252,
5056 b"\x61\x62",
5057 &[0x0061u16, 0x0062u16],
5058 &[],
5059 );
5060 // UTF-8
5061 sniff_to_utf16(
5062 WINDOWS_1252,
5063 UTF_8,
5064 b"\xEF\xBB\xBF\x61\x62",
5065 &[0x0061u16, 0x0062u16],
5066 &[],
5067 );
5068 sniff_to_utf16(
5069 WINDOWS_1252,
5070 UTF_8,
5071 b"\xEF\xBB\xBF\x61\x62",
5072 &[0x0061u16, 0x0062u16],
5073 &[1],
5074 );
5075 sniff_to_utf16(
5076 WINDOWS_1252,
5077 UTF_8,
5078 b"\xEF\xBB\xBF\x61\x62",
5079 &[0x0061u16, 0x0062u16],
5080 &[2],
5081 );
5082 sniff_to_utf16(
5083 WINDOWS_1252,
5084 UTF_8,
5085 b"\xEF\xBB\xBF\x61\x62",
5086 &[0x0061u16, 0x0062u16],
5087 &[3],
5088 );
5089 sniff_to_utf16(
5090 WINDOWS_1252,
5091 UTF_8,
5092 b"\xEF\xBB\xBF\x61\x62",
5093 &[0x0061u16, 0x0062u16],
5094 &[4],
5095 );
5096 sniff_to_utf16(
5097 WINDOWS_1252,
5098 UTF_8,
5099 b"\xEF\xBB\xBF\x61\x62",
5100 &[0x0061u16, 0x0062u16],
5101 &[2, 3],
5102 );
5103 sniff_to_utf16(
5104 WINDOWS_1252,
5105 UTF_8,
5106 b"\xEF\xBB\xBF\x61\x62",
5107 &[0x0061u16, 0x0062u16],
5108 &[1, 2],
5109 );
5110 sniff_to_utf16(
5111 WINDOWS_1252,
5112 UTF_8,
5113 b"\xEF\xBB\xBF\x61\x62",
5114 &[0x0061u16, 0x0062u16],
5115 &[1, 3],
5116 );
5117 sniff_to_utf16(
5118 WINDOWS_1252,
5119 UTF_8,
5120 b"\xEF\xBB\xBF\x61\x62",
5121 &[0x0061u16, 0x0062u16],
5122 &[1, 2, 3, 4],
5123 );
5124 sniff_to_utf16(WINDOWS_1252, UTF_8, b"\xEF\xBB\xBF", &[], &[]);
5125 // Not UTF-8
5126 sniff_to_utf16(
5127 WINDOWS_1252,
5128 WINDOWS_1252,
5129 b"\xEF\xBB\x61\x62",
5130 &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
5131 &[],
5132 );
5133 sniff_to_utf16(
5134 WINDOWS_1252,
5135 WINDOWS_1252,
5136 b"\xEF\xBB\x61\x62",
5137 &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
5138 &[1],
5139 );
5140 sniff_to_utf16(
5141 WINDOWS_1252,
5142 WINDOWS_1252,
5143 b"\xEF\x61\x62",
5144 &[0x00EFu16, 0x0061u16, 0x0062u16],
5145 &[],
5146 );
5147 sniff_to_utf16(
5148 WINDOWS_1252,
5149 WINDOWS_1252,
5150 b"\xEF\x61\x62",
5151 &[0x00EFu16, 0x0061u16, 0x0062u16],
5152 &[1],
5153 );
5154 sniff_to_utf16(
5155 WINDOWS_1252,
5156 WINDOWS_1252,
5157 b"\xEF\xBB",
5158 &[0x00EFu16, 0x00BBu16],
5159 &[],
5160 );
5161 sniff_to_utf16(
5162 WINDOWS_1252,
5163 WINDOWS_1252,
5164 b"\xEF\xBB",
5165 &[0x00EFu16, 0x00BBu16],
5166 &[1],
5167 );
5168 sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xEF", &[0x00EFu16], &[]);
5169 // Not UTF-16
5170 sniff_to_utf16(
5171 WINDOWS_1252,
5172 WINDOWS_1252,
5173 b"\xFE\x61\x62",
5174 &[0x00FEu16, 0x0061u16, 0x0062u16],
5175 &[],
5176 );
5177 sniff_to_utf16(
5178 WINDOWS_1252,
5179 WINDOWS_1252,
5180 b"\xFE\x61\x62",
5181 &[0x00FEu16, 0x0061u16, 0x0062u16],
5182 &[1],
5183 );
5184 sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xFE", &[0x00FEu16], &[]);
5185 sniff_to_utf16(
5186 WINDOWS_1252,
5187 WINDOWS_1252,
5188 b"\xFF\x61\x62",
5189 &[0x00FFu16, 0x0061u16, 0x0062u16],
5190 &[],
5191 );
5192 sniff_to_utf16(
5193 WINDOWS_1252,
5194 WINDOWS_1252,
5195 b"\xFF\x61\x62",
5196 &[0x00FFu16, 0x0061u16, 0x0062u16],
5197 &[1],
5198 );
5199 sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xFF", &[0x00FFu16], &[]);
5200 // UTF-16
5201 sniff_to_utf16(WINDOWS_1252, UTF_16BE, b"\xFE\xFF", &[], &[]);
5202 sniff_to_utf16(WINDOWS_1252, UTF_16BE, b"\xFE\xFF", &[], &[1]);
5203 sniff_to_utf16(WINDOWS_1252, UTF_16LE, b"\xFF\xFE", &[], &[]);
5204 sniff_to_utf16(WINDOWS_1252, UTF_16LE, b"\xFF\xFE", &[], &[1]);
5205 }
5206
5207 #[test]
5208 fn test_output_encoding() {
5209 assert_eq!(REPLACEMENT.output_encoding(), UTF_8);
5210 assert_eq!(UTF_16BE.output_encoding(), UTF_8);
5211 assert_eq!(UTF_16LE.output_encoding(), UTF_8);
5212 assert_eq!(UTF_8.output_encoding(), UTF_8);
5213 assert_eq!(WINDOWS_1252.output_encoding(), WINDOWS_1252);
5214 assert_eq!(REPLACEMENT.new_encoder().encoding(), UTF_8);
5215 assert_eq!(UTF_16BE.new_encoder().encoding(), UTF_8);
5216 assert_eq!(UTF_16LE.new_encoder().encoding(), UTF_8);
5217 assert_eq!(UTF_8.new_encoder().encoding(), UTF_8);
5218 assert_eq!(WINDOWS_1252.new_encoder().encoding(), WINDOWS_1252);
5219 }
5220
5221 #[test]
5222 fn test_label_resolution() {
5223 assert_eq!(Encoding::for_label(b"utf-8"), Some(UTF_8));
5224 assert_eq!(Encoding::for_label(b"UTF-8"), Some(UTF_8));
5225 assert_eq!(
5226 Encoding::for_label(b" \t \n \x0C \n utf-8 \r \n \t \x0C "),
5227 Some(UTF_8)
5228 );
5229 assert_eq!(Encoding::for_label(b"utf-8 _"), None);
5230 assert_eq!(Encoding::for_label(b"bogus"), None);
5231 assert_eq!(Encoding::for_label(b"bogusbogusbogusbogus"), None);
5232 }
5233
5234 #[test]
5235 fn test_decode_valid_windows_1257_to_cow() {
5236 let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xE4");
5237 match cow {
5238 Cow::Borrowed(_) => unreachable!(),
5239 Cow::Owned(s) => {
5240 assert_eq!(s, "abc\u{20AC}\u{00E4}");
5241 }
5242 }
5243 assert_eq!(encoding, WINDOWS_1257);
5244 assert!(!had_errors);
5245 }
5246
5247 #[test]
5248 fn test_decode_invalid_windows_1257_to_cow() {
5249 let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xA1\xE4");
5250 match cow {
5251 Cow::Borrowed(_) => unreachable!(),
5252 Cow::Owned(s) => {
5253 assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5254 }
5255 }
5256 assert_eq!(encoding, WINDOWS_1257);
5257 assert!(had_errors);
5258 }
5259
5260 #[test]
5261 fn test_decode_ascii_only_windows_1257_to_cow() {
5262 let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc");
5263 match cow {
5264 Cow::Borrowed(s) => {
5265 assert_eq!(s, "abc");
5266 }
5267 Cow::Owned(_) => unreachable!(),
5268 }
5269 assert_eq!(encoding, WINDOWS_1257);
5270 assert!(!had_errors);
5271 }
5272
5273 #[test]
5274 fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow() {
5275 let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5276 match cow {
5277 Cow::Borrowed(s) => {
5278 assert_eq!(s, "\u{20AC}\u{00E4}");
5279 }
5280 Cow::Owned(_) => unreachable!(),
5281 }
5282 assert_eq!(encoding, UTF_8);
5283 assert!(!had_errors);
5284 }
5285
5286 #[test]
5287 fn test_decode_bomful_invalid_utf8_as_windows_1257_to_cow() {
5288 let (cow, encoding, had_errors) =
5289 WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5290 match cow {
5291 Cow::Borrowed(_) => unreachable!(),
5292 Cow::Owned(s) => {
5293 assert_eq!(s, "\u{20AC}\u{FFFD}\u{00E4}");
5294 }
5295 }
5296 assert_eq!(encoding, UTF_8);
5297 assert!(had_errors);
5298 }
5299
5300 #[test]
5301 fn test_decode_bomful_valid_utf8_as_utf_8_to_cow() {
5302 let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5303 match cow {
5304 Cow::Borrowed(s) => {
5305 assert_eq!(s, "\u{20AC}\u{00E4}");
5306 }
5307 Cow::Owned(_) => unreachable!(),
5308 }
5309 assert_eq!(encoding, UTF_8);
5310 assert!(!had_errors);
5311 }
5312
5313 #[test]
5314 fn test_decode_bomful_invalid_utf8_as_utf_8_to_cow() {
5315 let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5316 match cow {
5317 Cow::Borrowed(_) => unreachable!(),
5318 Cow::Owned(s) => {
5319 assert_eq!(s, "\u{20AC}\u{FFFD}\u{00E4}");
5320 }
5321 }
5322 assert_eq!(encoding, UTF_8);
5323 assert!(had_errors);
5324 }
5325
5326 #[test]
5327 fn test_decode_bomful_valid_utf8_as_utf_8_to_cow_with_bom_removal() {
5328 let (cow, had_errors) = UTF_8.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5329 match cow {
5330 Cow::Borrowed(s) => {
5331 assert_eq!(s, "\u{20AC}\u{00E4}");
5332 }
5333 Cow::Owned(_) => unreachable!(),
5334 }
5335 assert!(!had_errors);
5336 }
5337
5338 #[test]
5339 fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow_with_bom_removal() {
5340 let (cow, had_errors) =
5341 WINDOWS_1257.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5342 match cow {
5343 Cow::Borrowed(_) => unreachable!(),
5344 Cow::Owned(s) => {
5345 assert_eq!(
5346 s,
5347 "\u{013C}\u{00BB}\u{00E6}\u{0101}\u{201A}\u{00AC}\u{0106}\u{00A4}"
5348 );
5349 }
5350 }
5351 assert!(!had_errors);
5352 }
5353
5354 #[test]
5355 fn test_decode_valid_windows_1257_to_cow_with_bom_removal() {
5356 let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xE4");
5357 match cow {
5358 Cow::Borrowed(_) => unreachable!(),
5359 Cow::Owned(s) => {
5360 assert_eq!(s, "abc\u{20AC}\u{00E4}");
5361 }
5362 }
5363 assert!(!had_errors);
5364 }
5365
5366 #[test]
5367 fn test_decode_invalid_windows_1257_to_cow_with_bom_removal() {
5368 let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xA1\xE4");
5369 match cow {
5370 Cow::Borrowed(_) => unreachable!(),
5371 Cow::Owned(s) => {
5372 assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5373 }
5374 }
5375 assert!(had_errors);
5376 }
5377
5378 #[test]
5379 fn test_decode_ascii_only_windows_1257_to_cow_with_bom_removal() {
5380 let (cow, had_errors) = WINDOWS_1257.decode_with_bom_removal(b"abc");
5381 match cow {
5382 Cow::Borrowed(s) => {
5383 assert_eq!(s, "abc");
5384 }
5385 Cow::Owned(_) => unreachable!(),
5386 }
5387 assert!(!had_errors);
5388 }
5389
5390 #[test]
5391 fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling() {
5392 let (cow, had_errors) =
5393 UTF_8.decode_without_bom_handling(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5394 match cow {
5395 Cow::Borrowed(s) => {
5396 assert_eq!(s, "\u{FEFF}\u{20AC}\u{00E4}");
5397 }
5398 Cow::Owned(_) => unreachable!(),
5399 }
5400 assert!(!had_errors);
5401 }
5402
5403 #[test]
5404 fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling() {
5405 let (cow, had_errors) =
5406 UTF_8.decode_without_bom_handling(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5407 match cow {
5408 Cow::Borrowed(_) => unreachable!(),
5409 Cow::Owned(s) => {
5410 assert_eq!(s, "\u{FEFF}\u{20AC}\u{FFFD}\u{00E4}");
5411 }
5412 }
5413 assert!(had_errors);
5414 }
5415
5416 #[test]
5417 fn test_decode_valid_windows_1257_to_cow_without_bom_handling() {
5418 let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xE4");
5419 match cow {
5420 Cow::Borrowed(_) => unreachable!(),
5421 Cow::Owned(s) => {
5422 assert_eq!(s, "abc\u{20AC}\u{00E4}");
5423 }
5424 }
5425 assert!(!had_errors);
5426 }
5427
5428 #[test]
5429 fn test_decode_invalid_windows_1257_to_cow_without_bom_handling() {
5430 let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xA1\xE4");
5431 match cow {
5432 Cow::Borrowed(_) => unreachable!(),
5433 Cow::Owned(s) => {
5434 assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5435 }
5436 }
5437 assert!(had_errors);
5438 }
5439
5440 #[test]
5441 fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling() {
5442 let (cow, had_errors) = WINDOWS_1257.decode_without_bom_handling(b"abc");
5443 match cow {
5444 Cow::Borrowed(s) => {
5445 assert_eq!(s, "abc");
5446 }
5447 Cow::Owned(_) => unreachable!(),
5448 }
5449 assert!(!had_errors);
5450 }
5451
5452 #[test]
5453 fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling_and_without_replacement() {
5454 match UTF_8.decode_without_bom_handling_and_without_replacement(
5455 b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4",
5456 ) {
5457 Some(cow) => match cow {
5458 Cow::Borrowed(s) => {
5459 assert_eq!(s, "\u{FEFF}\u{20AC}\u{00E4}");
5460 }
5461 Cow::Owned(_) => unreachable!(),
5462 },
5463 None => unreachable!(),
5464 }
5465 }
5466
5467 #[test]
5468 fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling_and_without_replacement() {
5469 assert!(UTF_8
5470 .decode_without_bom_handling_and_without_replacement(
5471 b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4"
5472 )
5473 .is_none());
5474 }
5475
5476 #[test]
5477 fn test_decode_valid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5478 match WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc\x80\xE4") {
5479 Some(cow) => match cow {
5480 Cow::Borrowed(_) => unreachable!(),
5481 Cow::Owned(s) => {
5482 assert_eq!(s, "abc\u{20AC}\u{00E4}");
5483 }
5484 },
5485 None => unreachable!(),
5486 }
5487 }
5488
5489 #[test]
5490 fn test_decode_invalid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5491 assert!(WINDOWS_1257
5492 .decode_without_bom_handling_and_without_replacement(b"abc\x80\xA1\xE4")
5493 .is_none());
5494 }
5495
5496 #[test]
5497 fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
5498 match WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc") {
5499 Some(cow) => match cow {
5500 Cow::Borrowed(s) => {
5501 assert_eq!(s, "abc");
5502 }
5503 Cow::Owned(_) => unreachable!(),
5504 },
5505 None => unreachable!(),
5506 }
5507 }
5508
5509 #[test]
5510 fn test_encode_ascii_only_windows_1257_to_cow() {
5511 let (cow, encoding, had_errors) = WINDOWS_1257.encode("abc");
5512 match cow {
5513 Cow::Borrowed(s) => {
5514 assert_eq!(s, b"abc");
5515 }
5516 Cow::Owned(_) => unreachable!(),
5517 }
5518 assert_eq!(encoding, WINDOWS_1257);
5519 assert!(!had_errors);
5520 }
5521
/// Encoding text with non-ASCII but encodable characters to windows-1257 is expected
/// to yield owned bytes, keep the requested encoding, and report no errors.
#[test]
fn test_encode_valid_windows_1257_to_cow() {
    let (bytes, used_encoding, had_errors) = WINDOWS_1257.encode("abc\u{20AC}\u{00E4}");
    if let Cow::Owned(raw) = bytes {
        assert_eq!(raw, b"abc\x80\xE4");
    } else {
        unreachable!();
    }
    assert_eq!(used_encoding, WINDOWS_1257);
    assert!(!had_errors);
}
5534
/// Feeds a BOM-sniffing UTF-16LE decoder the single byte `0xFF` twice (the second time
/// as the last chunk), each time into a destination sized exactly as
/// `max_utf16_buffer_length` reports, and expects `InputEmpty` both times.
#[test]
fn test_utf16_space_with_one_bom_byte() {
    let mut decoder = UTF_16LE.new_decoder();
    let mut dst = [0u16; 12];
    // Decode one chunk into a buffer of exactly the advertised size.
    let mut feed = |chunk: &[u8], last: bool| {
        let needed = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
        decoder.decode_to_utf16(chunk, &mut dst[..needed], last).0
    };
    assert_eq!(feed(b"\xFF", false), CoderResult::InputEmpty);
    assert_eq!(feed(b"\xFF", true), CoderResult::InputEmpty);
}
5550
/// Feeds a BOM-sniffing UTF-8 decoder the single byte `0xFF` twice (the second time as
/// the last chunk), each time into a destination sized exactly as
/// `max_utf16_buffer_length` reports, and expects `InputEmpty` both times.
#[test]
fn test_utf8_space_with_one_bom_byte() {
    let mut decoder = UTF_8.new_decoder();
    let mut dst = [0u16; 12];
    // Decode one chunk into a buffer of exactly the advertised size.
    let mut feed = |chunk: &[u8], last: bool| {
        let needed = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
        decoder.decode_to_utf16(chunk, &mut dst[..needed], last).0
    };
    assert_eq!(feed(b"\xFF", false), CoderResult::InputEmpty);
    assert_eq!(feed(b"\xFF", true), CoderResult::InputEmpty);
}
5566
/// Feeds a BOM-sniffing UTF-16LE decoder the bytes `0xEF`, `0xBB`, `0xFF` one per call
/// (the last one as the final chunk), each time into a destination sized exactly as
/// `max_utf16_buffer_length` reports, and expects `InputEmpty` every time.
#[test]
fn test_utf16_space_with_two_bom_bytes() {
    let mut decoder = UTF_16LE.new_decoder();
    let mut dst = [0u16; 12];
    // Decode one chunk into a buffer of exactly the advertised size.
    let mut feed = |chunk: &[u8], last: bool| {
        let needed = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
        decoder.decode_to_utf16(chunk, &mut dst[..needed], last).0
    };
    assert_eq!(feed(b"\xEF", false), CoderResult::InputEmpty);
    assert_eq!(feed(b"\xBB", false), CoderResult::InputEmpty);
    assert_eq!(feed(b"\xFF", true), CoderResult::InputEmpty);
}
5587
/// Feeds a BOM-sniffing UTF-8 decoder the bytes `0xEF`, `0xBB`, `0xFF` one per call
/// (the last one as the final chunk), each time into a destination sized exactly as
/// `max_utf16_buffer_length` reports, and expects `InputEmpty` every time.
#[test]
fn test_utf8_space_with_two_bom_bytes() {
    let mut decoder = UTF_8.new_decoder();
    let mut dst = [0u16; 12];
    // Decode one chunk into a buffer of exactly the advertised size.
    let mut feed = |chunk: &[u8], last: bool| {
        let needed = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
        decoder.decode_to_utf16(chunk, &mut dst[..needed], last).0
    };
    assert_eq!(feed(b"\xEF", false), CoderResult::InputEmpty);
    assert_eq!(feed(b"\xBB", false), CoderResult::InputEmpty);
    assert_eq!(feed(b"\xFF", true), CoderResult::InputEmpty);
}
5608
/// Feeds a BOM-sniffing UTF-16LE decoder the two bytes `0xFF 0xFF` in a single, final
/// call with a destination sized as `max_utf16_buffer_length(2)` reports, and expects
/// `InputEmpty`.
#[test]
fn test_utf16_space_with_one_bom_byte_and_a_second_byte_in_same_call() {
    let mut decoder = UTF_16LE.new_decoder();
    let mut dst = [0u16; 12];
    let capacity = decoder.max_utf16_buffer_length(2).unwrap();
    let outcome = decoder
        .decode_to_utf16(b"\xFF\xFF", &mut dst[..capacity], true)
        .0;
    assert_eq!(outcome, CoderResult::InputEmpty);
}
5619
/// Encoding empty UTF-8 input to ISO-2022-JP with an 8-byte destination is expected to
/// report `InputEmpty` both mid-stream and when finishing the stream.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf8() {
    let mut encoder = ISO_2022_JP.new_encoder();
    let mut dst = [0u8; 8];

    let mid_stream = encoder.encode_from_utf8("", &mut dst[..], false).0;
    assert_eq!(mid_stream, CoderResult::InputEmpty);

    let at_end = encoder.encode_from_utf8("", &mut dst[..], true).0;
    assert_eq!(at_end, CoderResult::InputEmpty);
}
5633
/// After encoding U+00A5 to ISO-2022-JP, an empty non-final call with 8 bytes of
/// output space is expected to report `InputEmpty`, while an empty final call with the
/// same 8 bytes is expected to report `OutputFull`.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf8() {
    let mut encoder = ISO_2022_JP.new_encoder();
    let mut dst = [0u8; 16];

    let after_yen = encoder.encode_from_utf8("\u{A5}", &mut dst[..], false).0;
    assert_eq!(after_yen, CoderResult::InputEmpty);

    let mid_stream = encoder.encode_from_utf8("", &mut dst[..8], false).0;
    assert_eq!(mid_stream, CoderResult::InputEmpty);

    let at_end = encoder.encode_from_utf8("", &mut dst[..8], true).0;
    assert_eq!(at_end, CoderResult::OutputFull);
}
5651
/// Exercises ISO-2022-JP encoding from UTF-8 close to the end of the output buffer.
///
/// Each case uses a fresh encoder. With 18 bytes of space, U+00A5 followed by U+1F4A9
/// is expected to give `InputEmpty` when not final and `OutputFull` when final; with
/// 13 bytes of space, U+1F4A9 alone is expected to give `InputEmpty` either way.
#[test]
fn test_buffer_end_iso_2022_jp_from_utf8() {
    let mut dst = [0u8; 18];
    // Encode `text` with a brand-new encoder into the first `room` bytes of `dst`.
    let mut encode_fresh = |text: &str, room: usize, last: bool| {
        let mut encoder = ISO_2022_JP.new_encoder();
        encoder.encode_from_utf8(text, &mut dst[..room], last).0
    };

    assert_eq!(
        encode_fresh("\u{A5}\u{1F4A9}", 18, false),
        CoderResult::InputEmpty
    );
    assert_eq!(
        encode_fresh("\u{A5}\u{1F4A9}", 18, true),
        CoderResult::OutputFull
    );
    assert_eq!(encode_fresh("\u{1F4A9}", 13, false), CoderResult::InputEmpty);
    assert_eq!(encode_fresh("\u{1F4A9}", 13, true), CoderResult::InputEmpty);
}
5677
/// Encoding empty UTF-16 input to ISO-2022-JP with an 8-byte destination is expected
/// to report `InputEmpty` both mid-stream and when finishing the stream.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf16() {
    let mut encoder = ISO_2022_JP.new_encoder();
    let mut dst = [0u8; 8];
    let nothing = [0u16; 0];

    let mid_stream = encoder.encode_from_utf16(&nothing, &mut dst[..], false).0;
    assert_eq!(mid_stream, CoderResult::InputEmpty);

    let at_end = encoder.encode_from_utf16(&nothing, &mut dst[..], true).0;
    assert_eq!(at_end, CoderResult::InputEmpty);
}
5691
/// After encoding the UTF-16 unit 0x00A5 to ISO-2022-JP, an empty non-final call with
/// 8 bytes of output space is expected to report `InputEmpty`, while an empty final
/// call with the same 8 bytes is expected to report `OutputFull`.
#[test]
fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf16() {
    let mut encoder = ISO_2022_JP.new_encoder();
    let mut dst = [0u8; 16];
    let nothing = [0u16; 0];

    let after_yen = encoder.encode_from_utf16(&[0xA5u16], &mut dst[..], false).0;
    assert_eq!(after_yen, CoderResult::InputEmpty);

    let mid_stream = encoder.encode_from_utf16(&nothing, &mut dst[..8], false).0;
    assert_eq!(mid_stream, CoderResult::InputEmpty);

    let at_end = encoder.encode_from_utf16(&nothing, &mut dst[..8], true).0;
    assert_eq!(at_end, CoderResult::OutputFull);
}
5709
/// Exercises ISO-2022-JP encoding from UTF-16 close to the end of the output buffer.
///
/// Each case uses a fresh encoder. With 18 bytes of space, 0x00A5 followed by the
/// surrogate pair 0xD83D 0xDCA9 is expected to give `InputEmpty` when not final and
/// `OutputFull` when final; with 13 bytes of space, the surrogate pair alone is
/// expected to give `InputEmpty` either way.
#[test]
fn test_buffer_end_iso_2022_jp_from_utf16() {
    let mut dst = [0u8; 18];
    let yen_then_pair = [0xA5u16, 0xD83Du16, 0xDCA9u16];
    let pair_only = [0xD83Du16, 0xDCA9u16];
    // Encode `units` with a brand-new encoder into the first `room` bytes of `dst`.
    let mut encode_fresh = |units: &[u16], room: usize, last: bool| {
        let mut encoder = ISO_2022_JP.new_encoder();
        encoder.encode_from_utf16(units, &mut dst[..room], last).0
    };

    assert_eq!(
        encode_fresh(&yen_then_pair, 18, false),
        CoderResult::InputEmpty
    );
    assert_eq!(
        encode_fresh(&yen_then_pair, 18, true),
        CoderResult::OutputFull
    );
    assert_eq!(encode_fresh(&pair_only, 13, false), CoderResult::InputEmpty);
    assert_eq!(encode_fresh(&pair_only, 13, true), CoderResult::InputEmpty);
}
5738
/// Feeds the bytes `0xD8 0x00` to a UTF-16BE decoder twice with a 4-byte destination.
///
/// The first (non-final) call is expected to consume both bytes and write nothing.
/// The result of the second (final) call is deliberately ignored; the call only has
/// to complete.
#[test]
fn test_buffer_end_utf16be() {
    let mut decoder = UTF_16BE.new_decoder_without_bom_handling();
    let mut dest = [0u8; 4];
    let unit_bytes = [0xD8u8, 0x00u8];

    let first = decoder.decode_to_utf8(&unit_bytes, &mut dest, false);
    assert_eq!(first, (CoderResult::InputEmpty, 2, 0, false));

    let _ = decoder.decode_to_utf8(&unit_bytes, &mut dest, true);
}
5751
/// Checks that encodings can be stored in, looked up in, and removed from a set.
#[test]
fn test_hash() {
    let mut seen = ::alloc::collections::btree_set::BTreeSet::new();
    seen.insert(UTF_8);
    seen.insert(ISO_2022_JP);

    // Both inserted encodings are present; one that was never inserted is not.
    assert!(seen.contains(UTF_8));
    assert!(seen.contains(ISO_2022_JP));
    assert!(!seen.contains(WINDOWS_1252));

    // After removal the encoding must no longer be found.
    seen.remove(ISO_2022_JP);
    assert!(!seen.contains(ISO_2022_JP));
}
5763
/// Encoding the UTF-16 units 0x3041 0xFFFF to ISO-2022-JP as the final chunk into a
/// 17-byte destination is expected to report `OutputFull`.
#[test]
fn test_iso_2022_jp_ncr_extra_from_utf16() {
    let mut dst = [0u8; 17];
    let mut encoder = ISO_2022_JP.new_encoder();
    let outcome = encoder
        .encode_from_utf16(&[0x3041u16, 0xFFFFu16], &mut dst[..], true)
        .0;
    assert_eq!(outcome, CoderResult::OutputFull);
}
5774
/// Encoding U+3041 U+FFFF from UTF-8 to ISO-2022-JP as the final chunk into a 17-byte
/// destination is expected to report `OutputFull`.
#[test]
fn test_iso_2022_jp_ncr_extra_from_utf8() {
    let mut dst = [0u8; 17];
    let mut encoder = ISO_2022_JP.new_encoder();
    let outcome = encoder
        .encode_from_utf8("\u{3041}\u{FFFF}", &mut dst[..], true)
        .0;
    assert_eq!(outcome, CoderResult::OutputFull);
}
5785
/// Decodes a UTF-8 BOM followed by `A` with a BOM-sniffing decoder obtained from
/// `REPLACEMENT`, using a destination sized exactly as
/// `max_utf8_buffer_length_without_replacement` reports.
///
/// Expects all input to be consumed and exactly one byte, `0x41`, to be written.
#[test]
fn test_max_length_with_bom_to_utf8() {
    let input = b"\xEF\xBB\xBFA";
    let mut decoder = REPLACEMENT.new_decoder();
    let mut output = [0u8; 20];

    let capacity = decoder
        .max_utf8_buffer_length_without_replacement(input.len())
        .unwrap();
    let (outcome, consumed, produced) =
        decoder.decode_to_utf8_without_replacement(input, &mut output[..capacity], true);

    assert_eq!(outcome, DecoderResult::InputEmpty);
    assert_eq!(consumed, input.len());
    assert_eq!(produced, 1);
    assert_eq!(output[0], 0x41);
}
5803
/// Round-trips a `Demo` value holding an encoding through `serde_json` and through
/// `bincode`, expecting the deserialized value to equal the original each time.
#[cfg(feature = "serde")]
#[test]
fn test_serde() {
    let original = Demo {
        num: 42,
        name: "foo".into(),
        enc: UTF_8,
    };

    // JSON round trip.
    let json = serde_json::to_string(&original).unwrap();
    let from_json: Demo = serde_json::from_str(&json).unwrap();
    assert_eq!(from_json, original);

    // bincode round trip.
    let encoded = bincode::serialize(&original).unwrap();
    let from_bincode: Demo = bincode::deserialize(&encoded[..]).unwrap();
    assert_eq!(from_bincode, original);
}
5822
/// Checks `is_single_byte()` for every encoding: the first group is expected to
/// answer `false`, the second group `true`.
#[test]
fn test_is_single_byte() {
    let not_single_byte = [
        BIG5,
        EUC_JP,
        EUC_KR,
        GB18030,
        GBK,
        REPLACEMENT,
        SHIFT_JIS,
        UTF_8,
        UTF_16BE,
        UTF_16LE,
        ISO_2022_JP,
    ];
    for encoding in not_single_byte.iter() {
        assert!(!encoding.is_single_byte(), "{:?}", encoding);
    }

    let single_byte = [
        IBM866,
        ISO_8859_2,
        ISO_8859_3,
        ISO_8859_4,
        ISO_8859_5,
        ISO_8859_6,
        ISO_8859_7,
        ISO_8859_8,
        ISO_8859_10,
        ISO_8859_13,
        ISO_8859_14,
        ISO_8859_15,
        ISO_8859_16,
        ISO_8859_8_I,
        KOI8_R,
        KOI8_U,
        MACINTOSH,
        WINDOWS_874,
        WINDOWS_1250,
        WINDOWS_1251,
        WINDOWS_1252,
        WINDOWS_1253,
        WINDOWS_1254,
        WINDOWS_1255,
        WINDOWS_1256,
        WINDOWS_1257,
        WINDOWS_1258,
        X_MAC_CYRILLIC,
        X_USER_DEFINED,
    ];
    for encoding in single_byte.iter() {
        assert!(encoding.is_single_byte(), "{:?}", encoding);
    }
}
5867
/// Checks `Decoder::latin1_byte_compatible_up_to` against one fixed buffer.
///
/// The first part asks a fresh decoder without BOM handling, for every encoding, how
/// long a prefix of the buffer it reports (or `None`). The second part checks a
/// BOM-sniffing UTF-8 decoder before, during and after it has been fed a BOM.
#[test]
fn test_latin1_byte_compatible_up_to() {
    let buffer = b"a\x81\xB6\xF6\xF0\x82\xB4";

    // (encoding, expected answer from a fresh decoder without BOM handling)
    let expectations = [
        (BIG5, Some(1)),
        (EUC_JP, Some(1)),
        (EUC_KR, Some(1)),
        (GB18030, Some(1)),
        (GBK, Some(1)),
        (REPLACEMENT, None),
        (SHIFT_JIS, Some(1)),
        (UTF_8, Some(1)),
        (UTF_16BE, None),
        (UTF_16LE, None),
        (ISO_2022_JP, Some(1)),
        (IBM866, Some(1)),
        (ISO_8859_2, Some(2)),
        (ISO_8859_3, Some(2)),
        (ISO_8859_4, Some(2)),
        (ISO_8859_5, Some(2)),
        (ISO_8859_6, Some(2)),
        (ISO_8859_7, Some(2)),
        (ISO_8859_8, Some(3)),
        (ISO_8859_10, Some(2)),
        (ISO_8859_13, Some(4)),
        (ISO_8859_14, Some(4)),
        (ISO_8859_15, Some(6)),
        (ISO_8859_16, Some(4)),
        (ISO_8859_8_I, Some(3)),
        (KOI8_R, Some(1)),
        (KOI8_U, Some(1)),
        (MACINTOSH, Some(1)),
        (WINDOWS_874, Some(2)),
        (WINDOWS_1250, Some(4)),
        (WINDOWS_1251, Some(1)),
        (WINDOWS_1252, Some(5)),
        (WINDOWS_1253, Some(3)),
        (WINDOWS_1254, Some(4)),
        (WINDOWS_1255, Some(3)),
        (WINDOWS_1256, Some(1)),
        (WINDOWS_1257, Some(4)),
        (WINDOWS_1258, Some(4)),
        (X_MAC_CYRILLIC, Some(1)),
        (X_USER_DEFINED, Some(1)),
    ];
    for &(encoding, expected) in expectations.iter() {
        assert_eq!(
            encoding
                .new_decoder_without_bom_handling()
                .latin1_byte_compatible_up_to(buffer),
            expected,
            "{:?}",
            encoding
        );
    }

    // A BOM-sniffing decoder that has not seen any input yet gives no answer.
    assert_eq!(
        UTF_8.new_decoder().latin1_byte_compatible_up_to(buffer),
        None
    );

    let mut sniffing = UTF_8.new_decoder();
    let mut sink = [0u16; 4];

    // Only the first BOM byte has been fed: still no answer.
    let _ = sniffing.decode_to_utf16(b"\xEF", &mut sink, false);
    assert_eq!(sniffing.latin1_byte_compatible_up_to(buffer), None);

    // The remaining two BOM bytes have been fed: an answer is available.
    let _ = sniffing.decode_to_utf16(b"\xBB\xBF", &mut sink, false);
    assert_eq!(sniffing.latin1_byte_compatible_up_to(buffer), Some(1));

    // A further lone 0xEF has been fed: no answer again.
    let _ = sniffing.decode_to_utf16(b"\xEF", &mut sink, false);
    assert_eq!(sniffing.latin1_byte_compatible_up_to(buffer), None);
}
6156}