encoding_rs/
euc_jp.rs

// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
9
10use super::*;
11use crate::data::*;
12use crate::handles::*;
13use crate::variant::*;
14// Rust 1.14.0 requires the following despite the asterisk above.
15use super::in_inclusive_range16;
16
/// Decoder state describing which part of a multi-byte EUC-JP sequence, if
/// any, has been read but not yet turned into output.
///
/// NOTE(review): the variants are assigned inside `euc_jp_decoder_functions!`,
/// which is not visible in this file; the per-variant notes below are inferred
/// from the variant names and from `count` — confirm against the macro.
enum EucJpPending {
    /// Nothing is pending.
    None,
    /// One byte of a JIS X 0208 sequence is pending; the payload carries it.
    Jis0208Lead(u8),
    /// The byte that switches to JIS X 0212 is pending (no payload).
    Jis0212Shift,
    /// The JIS X 0212 switch byte plus one further byte are pending; the
    /// payload carries the latter.
    Jis0212Lead(u8),
    /// The byte that introduces half-width katakana is pending (no payload).
    HalfWidthKatakana,
}

impl EucJpPending {
    /// Returns `true` only for `EucJpPending::None`.
    fn is_none(&self) -> bool {
        // Every variant is listed so that adding one forces a decision here.
        match *self {
            EucJpPending::None => true,
            EucJpPending::Jis0208Lead(_)
            | EucJpPending::Jis0212Shift
            | EucJpPending::Jis0212Lead(_)
            | EucJpPending::HalfWidthKatakana => false,
        }
    }

    /// Returns 0 for `None`, 2 for `Jis0212Lead` and 1 for every other
    /// variant.
    ///
    /// NOTE(review): presumably the number of input bytes already consumed
    /// for the unfinished sequence — confirm against the macro that calls it.
    fn count(&self) -> usize {
        match *self {
            EucJpPending::Jis0212Lead(_) => 2,
            EucJpPending::Jis0208Lead(_)
            | EucJpPending::Jis0212Shift
            | EucJpPending::HalfWidthKatakana => 1,
            EucJpPending::None => 0,
        }
    }
}
43
44pub struct EucJpDecoder {
45    pending: EucJpPending,
46}
47
48impl EucJpDecoder {
49    pub fn new() -> VariantDecoder {
50        VariantDecoder::EucJp(EucJpDecoder {
51            pending: EucJpPending::None,
52        })
53    }
54
55    pub fn in_neutral_state(&self) -> bool {
56        self.pending.is_none()
57    }
58
59    fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
60        byte_length.checked_add(if self.pending.is_none() { 0 } else { 1 })
61    }
62
63    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
64        self.plus_one_if_lead(byte_length)
65    }
66
67    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
68        // worst case: 2 to 3
69        let len = self.plus_one_if_lead(byte_length);
70        checked_add(2, checked_add_opt(len, checked_div(checked_add(1, len), 2)))
71    }
72
73    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
74        checked_mul(3, self.plus_one_if_lead(byte_length))
75    }
76
77    euc_jp_decoder_functions!(
78        {
79            let trail_minus_offset = byte.wrapping_sub(0xA1);
80            // Fast-track Hiragana (60% according to Lunde)
81            // and Katakana (10% acconding to Lunde).
82            if jis0208_lead_minus_offset == 0x03 && trail_minus_offset < 0x53 {
83                // Hiragana
84                handle.write_upper_bmp(0x3041 + u16::from(trail_minus_offset))
85            } else if jis0208_lead_minus_offset == 0x04 && trail_minus_offset < 0x56 {
86                // Katakana
87                handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset))
88            } else if trail_minus_offset > (0xFE - 0xA1) {
89                if byte < 0x80 {
90                    return (
91                        DecoderResult::Malformed(1, 0),
92                        unread_handle_trail.unread(),
93                        handle.written(),
94                    );
95                }
96                return (
97                    DecoderResult::Malformed(2, 0),
98                    unread_handle_trail.consumed(),
99                    handle.written(),
100                );
101            } else {
102                let pointer = mul_94(jis0208_lead_minus_offset) + usize::from(trail_minus_offset);
103                let level1_pointer = pointer.wrapping_sub(1410);
104                if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
105                    handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer])
106                } else {
107                    let level2_pointer = pointer.wrapping_sub(4418);
108                    if level2_pointer < JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
109                        handle.write_upper_bmp(JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer])
110                    } else {
111                        let ibm_pointer = pointer.wrapping_sub(8272);
112                        if ibm_pointer < IBM_KANJI.len() {
113                            handle.write_upper_bmp(IBM_KANJI[ibm_pointer])
114                        } else if let Some(bmp) = jis0208_symbol_decode(pointer) {
115                            handle.write_bmp_excl_ascii(bmp)
116                        } else if let Some(bmp) = jis0208_range_decode(pointer) {
117                            handle.write_bmp_excl_ascii(bmp)
118                        } else {
119                            return (
120                                DecoderResult::Malformed(2, 0),
121                                unread_handle_trail.consumed(),
122                                handle.written(),
123                            );
124                        }
125                    }
126                }
127            }
128        },
129        {
130            // If lead is between 0xA1 and 0xFE, inclusive,
131            // subtract 0xA1.
132            let jis0212_lead_minus_offset = lead.wrapping_sub(0xA1);
133            if jis0212_lead_minus_offset > (0xFE - 0xA1) {
134                if lead < 0x80 {
135                    return (
136                        DecoderResult::Malformed(1, 0),
137                        unread_handle_jis0212.unread(),
138                        handle.written(),
139                    );
140                }
141                return (
142                    DecoderResult::Malformed(2, 0),
143                    unread_handle_jis0212.consumed(),
144                    handle.written(),
145                );
146            }
147            jis0212_lead_minus_offset
148        },
149        {
150            // If trail is between 0xA1 and 0xFE, inclusive,
151            // subtract 0xA1.
152            let trail_minus_offset = byte.wrapping_sub(0xA1);
153            if trail_minus_offset > (0xFE - 0xA1) {
154                if byte < 0x80 {
155                    return (
156                        DecoderResult::Malformed(2, 0),
157                        unread_handle_trail.unread(),
158                        handle.written(),
159                    );
160                }
161                return (
162                    DecoderResult::Malformed(3, 0),
163                    unread_handle_trail.consumed(),
164                    handle.written(),
165                );
166            }
167            let pointer = mul_94(jis0212_lead_minus_offset) + usize::from(trail_minus_offset);
168            let pointer_minus_kanji = pointer.wrapping_sub(1410);
169            if pointer_minus_kanji < JIS0212_KANJI.len() {
170                handle.write_upper_bmp(JIS0212_KANJI[pointer_minus_kanji])
171            } else if let Some(bmp) = jis0212_accented_decode(pointer) {
172                handle.write_bmp_excl_ascii(bmp)
173            } else {
174                let pointer_minus_upper_cyrillic = pointer.wrapping_sub(597);
175                if pointer_minus_upper_cyrillic <= (607 - 597) {
176                    handle.write_mid_bmp(0x0402 + pointer_minus_upper_cyrillic as u16)
177                } else {
178                    let pointer_minus_lower_cyrillic = pointer.wrapping_sub(645);
179                    if pointer_minus_lower_cyrillic <= (655 - 645) {
180                        handle.write_mid_bmp(0x0452 + pointer_minus_lower_cyrillic as u16)
181                    } else {
182                        return (
183                            DecoderResult::Malformed(3, 0),
184                            unread_handle_trail.consumed(),
185                            handle.written(),
186                        );
187                    }
188                }
189            }
190        },
191        {
192            // If trail is between 0xA1 and 0xDF, inclusive,
193            // subtract 0xA1 and map to half-width Katakana.
194            let trail_minus_offset = byte.wrapping_sub(0xA1);
195            if trail_minus_offset > (0xDF - 0xA1) {
196                if byte < 0x80 {
197                    return (
198                        DecoderResult::Malformed(1, 0),
199                        unread_handle_trail.unread(),
200                        handle.written(),
201                    );
202                }
203                return (
204                    DecoderResult::Malformed(2, 0),
205                    unread_handle_trail.consumed(),
206                    handle.written(),
207                );
208            }
209            handle.write_upper_bmp(0xFF61 + u16::from(trail_minus_offset))
210        },
211        self,
212        non_ascii,
213        jis0208_lead_minus_offset,
214        byte,
215        unread_handle_trail,
216        jis0212_lead_minus_offset,
217        lead,
218        unread_handle_jis0212,
219        source,
220        handle
221    );
222}
223
/// Looks up the two EUC-JP bytes for the BMP code unit `bmp`.
///
/// Variant built when the `fast-kanji-encode` feature is enabled: the whole
/// lookup is delegated to `jis0208_kanji_euc_jp_encode`. Returns `None` when
/// that helper finds no mapping.
#[cfg(feature = "fast-kanji-encode")]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
    jis0208_kanji_euc_jp_encode(bmp)
}
229
230#[cfg(not(feature = "fast-kanji-encode"))]
231#[inline(always)]
232fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
233    if 0x4EDD == bmp {
234        // Ideograph on the symbol row!
235        Some((0xA1, 0xB8))
236    } else if let Some((lead, trail)) = jis0208_level1_kanji_euc_jp_encode(bmp) {
237        Some((lead, trail))
238    } else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
239        let lead = (pos / 94) + 0xD0;
240        let trail = (pos % 94) + 0xA1;
241        Some((lead as u8, trail as u8))
242    } else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
243        let lead = (pos / 94) + 0xF9;
244        let trail = (pos % 94) + 0xA1;
245        Some((lead as u8, trail as u8))
246    } else {
247        None
248    }
249}
250
/// EUC-JP encoder. A unit struct: it carries no state of its own.
pub struct EucJpEncoder;
252
253impl EucJpEncoder {
254    pub fn new(encoding: &'static Encoding) -> Encoder {
255        Encoder::new(encoding, VariantEncoder::EucJp(EucJpEncoder))
256    }
257
258    pub fn max_buffer_length_from_utf16_without_replacement(
259        &self,
260        u16_length: usize,
261    ) -> Option<usize> {
262        u16_length.checked_mul(2)
263    }
264
265    pub fn max_buffer_length_from_utf8_without_replacement(
266        &self,
267        byte_length: usize,
268    ) -> Option<usize> {
269        byte_length.checked_add(1)
270    }
271
272    ascii_compatible_bmp_encoder_functions!(
273        {
274            // Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
275            let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
276            if bmp_minus_hiragana < 0x53 {
277                handle.write_two(0xA4, 0xA1 + bmp_minus_hiragana as u8)
278            } else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
279                if let Some((lead, trail)) = encode_kanji(bmp) {
280                    handle.write_two(lead, trail)
281                } else {
282                    return (
283                        EncoderResult::unmappable_from_bmp(bmp),
284                        source.consumed(),
285                        handle.written(),
286                    );
287                }
288            } else {
289                let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
290                if bmp_minus_katakana < 0x56 {
291                    handle.write_two(0xA5, 0xA1 + bmp_minus_katakana as u8)
292                } else {
293                    let bmp_minus_space = bmp.wrapping_sub(0x3000);
294                    if bmp_minus_space < 3 {
295                        // fast-track common punctuation
296                        handle.write_two(0xA1, 0xA1 + bmp_minus_space as u8)
297                    } else if bmp == 0xA5 {
298                        handle.write_one(0x5Cu8)
299                    } else if bmp == 0x203E {
300                        handle.write_one(0x7Eu8)
301                    } else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) {
302                        handle.write_two(0x8Eu8, (bmp - (0xFF61 - 0xA1)) as u8)
303                    } else if bmp == 0x2212 {
304                        handle.write_two(0xA1u8, 0xDDu8)
305                    } else if let Some(pointer) = jis0208_range_encode(bmp) {
306                        let lead = (pointer / 94) + 0xA1;
307                        let trail = (pointer % 94) + 0xA1;
308                        handle.write_two(lead as u8, trail as u8)
309                    } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D)
310                        || bmp == 0xF929
311                        || bmp == 0xF9DC
312                    {
313                        // Guaranteed to be found in IBM_KANJI
314                        let pos = position(&IBM_KANJI[..], bmp).unwrap();
315                        let lead = (pos / 94) + 0xF9;
316                        let trail = (pos % 94) + 0xA1;
317                        handle.write_two(lead as u8, trail as u8)
318                    } else if let Some(pointer) = ibm_symbol_encode(bmp) {
319                        let lead = (pointer / 94) + 0xA1;
320                        let trail = (pointer % 94) + 0xA1;
321                        handle.write_two(lead as u8, trail as u8)
322                    } else if let Some(pointer) = jis0208_symbol_encode(bmp) {
323                        let lead = (pointer / 94) + 0xA1;
324                        let trail = (pointer % 94) + 0xA1;
325                        handle.write_two(lead as u8, trail as u8)
326                    } else {
327                        return (
328                            EncoderResult::unmappable_from_bmp(bmp),
329                            source.consumed(),
330                            handle.written(),
331                        );
332                    }
333                }
334            }
335        },
336        bmp,
337        self,
338        source,
339        handle,
340        copy_ascii_to_check_space_two,
341        check_space_two,
342        false
343    );
344}
345
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
348
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Checks that decoding `bytes` as EUC-JP yields `expect`.
    fn decode_euc_jp(bytes: &[u8], expect: &str) {
        decode(EUC_JP, bytes, expect);
    }

    /// Checks that encoding `string` as EUC-JP yields `expect`.
    fn encode_euc_jp(string: &str, expect: &[u8]) {
        encode(EUC_JP, string, expect);
    }

    #[test]
    fn test_euc_jp_decode() {
        // Empty
        decode_euc_jp(b"", "");

        // ASCII
        decode_euc_jp(b"\x61\x62", "\u{0061}\u{0062}");

        // Half-width: both ends of the valid trail range, ...
        decode_euc_jp(b"\x8E\xA1", "\u{FF61}");
        decode_euc_jp(b"\x8E\xDF", "\u{FF9F}");
        // ... trails just outside it, ...
        for &trail in &[0xA0u8, 0xE0, 0xFF] {
            decode_euc_jp(&[0x8E, trail], "\u{FFFD}");
        }
        // ... and input that ends after the lead.
        decode_euc_jp(b"\x8E", "\u{FFFD}");

        // JIS 0212
        decode_euc_jp(b"\x8F\xA1\xA1", "\u{FFFD}");
        decode_euc_jp(b"\x8F\xA2\xAF", "\u{02D8}");
        decode_euc_jp(b"\x8F\xA2\xFF", "\u{FFFD}");
        decode_euc_jp(b"\x8F\xA1", "\u{FFFD}");
        decode_euc_jp(b"\x8F", "\u{FFFD}");

        // JIS 0208
        decode_euc_jp(b"\xA1\xA1", "\u{3000}");
        decode_euc_jp(b"\xA1\xA0", "\u{FFFD}");
        decode_euc_jp(b"\xFC\xFE", "\u{FF02}");
        decode_euc_jp(b"\xFE\xFE", "\u{FFFD}");
        decode_euc_jp(b"\xA1", "\u{FFFD}");

        // Bad leads: each yields one replacement character, after which the
        // following valid pair decodes normally.
        decode_euc_jp(b"\xFF\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\xA0\xA1\xA1", "\u{FFFD}\u{3000}");
        for lead in 0x80u8..=0x8D {
            decode_euc_jp(&[lead, 0xA1, 0xA1], "\u{FFFD}\u{3000}");
        }

        // Bad ASCII trail
        decode_euc_jp(b"\xA1\x40", "\u{FFFD}\u{0040}");
    }

    #[test]
    fn test_euc_jp_encode() {
        // Empty
        encode_euc_jp("", b"");

        // ASCII
        encode_euc_jp("\u{0061}\u{0062}", b"\x61\x62");

        // Exceptional code points
        encode_euc_jp("\u{00A5}", b"\x5C");
        encode_euc_jp("\u{203E}", b"\x7E");
        encode_euc_jp("\u{2212}", b"\xA1\xDD");

        // Half-width
        encode_euc_jp("\u{FF61}", b"\x8E\xA1");
        encode_euc_jp("\u{FF9F}", b"\x8E\xDF");

        // JIS 0212: expected output is a numeric character reference.
        encode_euc_jp("\u{02D8}", b"&#728;");

        // JIS 0208
        encode_euc_jp("\u{3000}", b"\xA1\xA1");
        encode_euc_jp("\u{FF02}", b"\xFC\xFE");
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_jis0208_decode_all() {
        let source_bytes = include_bytes!("test_data/jis0208_in.txt");
        let reference = include_str!("test_data/jis0208_in_ref.txt");
        let (decoded, saw_errors) = EUC_JP.decode_without_bom_handling(source_bytes);
        assert!(saw_errors, "Should have had errors.");
        assert_eq!(&decoded[..], reference);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_jis0208_encode_all() {
        let source_text = include_str!("test_data/jis0208_out.txt");
        let reference = include_bytes!("test_data/jis0208_out_ref.txt");
        let (encoded, used_encoding, saw_errors) = EUC_JP.encode(source_text);
        assert!(!saw_errors, "Should not have had errors.");
        assert_eq!(used_encoding, EUC_JP);
        assert_eq!(&encoded[..], &reference[..]);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_jis0212_decode_all() {
        let source_bytes = include_bytes!("test_data/jis0212_in.txt");
        let reference = include_str!("test_data/jis0212_in_ref.txt");
        let (decoded, saw_errors) = EUC_JP.decode_without_bom_handling(source_bytes);
        assert!(saw_errors, "Should have had errors.");
        assert_eq!(&decoded[..], reference);
    }
}
469}