symphonia_bundle_mp3/
synthesis.rs

1// Symphonia
2// Copyright (c) 2019-2022 The Project Symphonia Developers.
3//
4// This Source Code Form is subject to the terms of the Mozilla Public
5// License, v. 2.0. If a copy of the MPL was not distributed with this
6// file, You can obtain one at https://mozilla.org/MPL/2.0/.
7
8//! The `synthesis` module implements the polyphase synthesis filterbank of the MPEG audio standard.
9
10/// Synthesis window D[i], defined in Table B.3 of ISO/IEC 11172-3.
11#[allow(clippy::unreadable_literal)]
12#[rustfmt::skip]
13const SYNTHESIS_D: [f32; 512] = [
14     0.000000000, -0.000015259, -0.000015259, -0.000015259,
15    -0.000015259, -0.000015259, -0.000015259, -0.000030518,
16    -0.000030518, -0.000030518, -0.000030518, -0.000045776,
17    -0.000045776, -0.000061035, -0.000061035, -0.000076294,
18    -0.000076294, -0.000091553, -0.000106812, -0.000106812,
19    -0.000122070, -0.000137329, -0.000152588, -0.000167847,
20    -0.000198364, -0.000213623, -0.000244141, -0.000259399,
21    -0.000289917, -0.000320435, -0.000366211, -0.000396729,
22    -0.000442505, -0.000473022, -0.000534058, -0.000579834,
23    -0.000625610, -0.000686646, -0.000747681, -0.000808716,
24    -0.000885010, -0.000961304, -0.001037598, -0.001113892,
25    -0.001205444, -0.001296997, -0.001388550, -0.001480103,
26    -0.001586914, -0.001693726, -0.001785278, -0.001907349,
27    -0.002014160, -0.002120972, -0.002243042, -0.002349854,
28    -0.002456665, -0.002578735, -0.002685547, -0.002792358,
29    -0.002899170, -0.002990723, -0.003082275, -0.003173828,
30     0.003250122,  0.003326416,  0.003387451,  0.003433228,
31     0.003463745,  0.003479004,  0.003479004,  0.003463745,
32     0.003417969,  0.003372192,  0.003280640,  0.003173828,
33     0.003051758,  0.002883911,  0.002700806,  0.002487183,
34     0.002227783,  0.001937866,  0.001617432,  0.001266479,
35     0.000869751,  0.000442505, -0.000030518, -0.000549316,
36    -0.001098633, -0.001693726, -0.002334595, -0.003005981,
37    -0.003723145, -0.004486084, -0.005294800, -0.006118774,
38    -0.007003784, -0.007919312, -0.008865356, -0.009841919,
39    -0.010848999, -0.011886597, -0.012939453, -0.014022827,
40    -0.015121460, -0.016235352, -0.017349243, -0.018463135,
41    -0.019577026, -0.020690918, -0.021789551, -0.022857666,
42    -0.023910522, -0.024932861, -0.025909424, -0.026840210,
43    -0.027725220, -0.028533936, -0.029281616, -0.029937744,
44    -0.030532837, -0.031005859, -0.031387329, -0.031661987,
45    -0.031814575, -0.031845093, -0.031738281, -0.031478882,
46     0.031082153,  0.030517578,  0.029785156,  0.028884888,
47     0.027801514,  0.026535034,  0.025085449,  0.023422241,
48     0.021575928,  0.019531250,  0.017257690,  0.014801025,
49     0.012115479,  0.009231567,  0.006134033,  0.002822876,
50    -0.000686646, -0.004394531, -0.008316040, -0.012420654,
51    -0.016708374, -0.021179199, -0.025817871, -0.030609131,
52    -0.035552979, -0.040634155, -0.045837402, -0.051132202,
53    -0.056533813, -0.061996460, -0.067520142, -0.073059082,
54    -0.078628540, -0.084182739, -0.089706421, -0.095169067,
55    -0.100540161, -0.105819702, -0.110946655, -0.115921021,
56    -0.120697021, -0.125259399, -0.129562378, -0.133590698,
57    -0.137298584, -0.140670776, -0.143676758, -0.146255493,
58    -0.148422241, -0.150115967, -0.151306152, -0.151962280,
59    -0.152069092, -0.151596069, -0.150497437, -0.148773193,
60    -0.146362305, -0.143264771, -0.139450073, -0.134887695,
61    -0.129577637, -0.123474121, -0.116577148, -0.108856201,
62     0.100311279,  0.090927124,  0.080688477,  0.069595337,
63     0.057617187,  0.044784546,  0.031082153,  0.016510010,
64     0.001068115, -0.015228271, -0.032379150, -0.050354004,
65    -0.069168091, -0.088775635, -0.109161377, -0.130310059,
66    -0.152206421, -0.174789429, -0.198059082, -0.221984863,
67    -0.246505737, -0.271591187, -0.297210693, -0.323318481,
68    -0.349868774, -0.376800537, -0.404083252, -0.431655884,
69    -0.459472656, -0.487472534, -0.515609741, -0.543823242,
70    -0.572036743, -0.600219727, -0.628295898, -0.656219482,
71    -0.683914185, -0.711318970, -0.738372803, -0.765029907,
72    -0.791213989, -0.816864014, -0.841949463, -0.866363525,
73    -0.890090942, -0.913055420, -0.935195923, -0.956481934,
74    -0.976852417, -0.996246338, -1.014617920, -1.031936646,
75    -1.048156738, -1.063217163, -1.077117920, -1.089782715,
76    -1.101211548, -1.111373901, -1.120223999, -1.127746582,
77    -1.133926392, -1.138763428, -1.142211914, -1.144287109,
78     1.144989014,  1.144287109,  1.142211914,  1.138763428,
79     1.133926392,  1.127746582,  1.120223999,  1.111373901,
80     1.101211548,  1.089782715,  1.077117920,  1.063217163,
81     1.048156738,  1.031936646,  1.014617920,  0.996246338,
82     0.976852417,  0.956481934,  0.935195923,  0.913055420,
83     0.890090942,  0.866363525,  0.841949463,  0.816864014,
84     0.791213989,  0.765029907,  0.738372803,  0.711318970,
85     0.683914185,  0.656219482,  0.628295898,  0.600219727,
86     0.572036743,  0.543823242,  0.515609741,  0.487472534,
87     0.459472656,  0.431655884,  0.404083252,  0.376800537,
88     0.349868774,  0.323318481,  0.297210693,  0.271591187,
89     0.246505737,  0.221984863,  0.198059082,  0.174789429,
90     0.152206421,  0.130310059,  0.109161377,  0.088775635,
91     0.069168091,  0.050354004,  0.032379150,  0.015228271,
92    -0.001068115, -0.016510010, -0.031082153, -0.044784546,
93    -0.057617187, -0.069595337, -0.080688477, -0.090927124,
94     0.100311279,  0.108856201,  0.116577148,  0.123474121,
95     0.129577637,  0.134887695,  0.139450073,  0.143264771,
96     0.146362305,  0.148773193,  0.150497437,  0.151596069,
97     0.152069092,  0.151962280,  0.151306152,  0.150115967,
98     0.148422241,  0.146255493,  0.143676758,  0.140670776,
99     0.137298584,  0.133590698,  0.129562378,  0.125259399,
100     0.120697021,  0.115921021,  0.110946655,  0.105819702,
101     0.100540161,  0.095169067,  0.089706421,  0.084182739,
102     0.078628540,  0.073059082,  0.067520142,  0.061996460,
103     0.056533813,  0.051132202,  0.045837402,  0.040634155,
104     0.035552979,  0.030609131,  0.025817871,  0.021179199,
105     0.016708374,  0.012420654,  0.008316040,  0.004394531,
106     0.000686646, -0.002822876, -0.006134033, -0.009231567,
107    -0.012115479, -0.014801025, -0.017257690, -0.019531250,
108    -0.021575928, -0.023422241, -0.025085449, -0.026535034,
109    -0.027801514, -0.028884888, -0.029785156, -0.030517578,
110     0.031082153,  0.031478882,  0.031738281,  0.031845093,
111     0.031814575,  0.031661987,  0.031387329,  0.031005859,
112     0.030532837,  0.029937744,  0.029281616,  0.028533936,
113     0.027725220,  0.026840210,  0.025909424,  0.024932861,
114     0.023910522,  0.022857666,  0.021789551,  0.020690918,
115     0.019577026,  0.018463135,  0.017349243,  0.016235352,
116     0.015121460,  0.014022827,  0.012939453,  0.011886597,
117     0.010848999,  0.009841919,  0.008865356,  0.007919312,
118     0.007003784,  0.006118774,  0.005294800,  0.004486084,
119     0.003723145,  0.003005981,  0.002334595,  0.001693726,
120     0.001098633,  0.000549316,  0.000030518, -0.000442505,
121    -0.000869751, -0.001266479, -0.001617432, -0.001937866,
122    -0.002227783, -0.002487183, -0.002700806, -0.002883911,
123    -0.003051758, -0.003173828, -0.003280640, -0.003372192,
124    -0.003417969, -0.003463745, -0.003479004, -0.003479004,
125    -0.003463745, -0.003433228, -0.003387451, -0.003326416,
126     0.003250122,  0.003173828,  0.003082275,  0.002990723,
127     0.002899170,  0.002792358,  0.002685547,  0.002578735,
128     0.002456665,  0.002349854,  0.002243042,  0.002120972,
129     0.002014160,  0.001907349,  0.001785278,  0.001693726,
130     0.001586914,  0.001480103,  0.001388550,  0.001296997,
131     0.001205444,  0.001113892,  0.001037598,  0.000961304,
132     0.000885010,  0.000808716,  0.000747681,  0.000686646,
133     0.000625610,  0.000579834,  0.000534058,  0.000473022,
134     0.000442505,  0.000396729,  0.000366211,  0.000320435,
135     0.000289917,  0.000259399,  0.000244141,  0.000213623,
136     0.000198364,  0.000167847,  0.000152588,  0.000137329,
137     0.000122070,  0.000106812,  0.000106812,  0.000091553,
138     0.000076294,  0.000076294,  0.000061035,  0.000061035,
139     0.000045776,  0.000045776,  0.000030518,  0.000030518,
140     0.000030518,  0.000030518,  0.000015259,  0.000015259,
141     0.000015259,  0.000015259,  0.000015259,  0.000015259,
142];
143
144/// `SynthesisState` maintains the persistant state of sub-band synthesis.
145pub struct SynthesisState {
146    v_vec: [[f32; 64]; 16],
147    v_front: usize,
148}
149
150impl Default for SynthesisState {
151    fn default() -> Self {
152        SynthesisState { v_vec: [[0f32; 64]; 16], v_front: 0 }
153    }
154}
155
156/// Sub-band synthesis transforms 32 sub-band blocks containing 18 time-domain samples each into
157/// 18 blocks of 32 PCM audio samples.
158pub fn synthesis(state: &mut SynthesisState, n_frames: usize, in_samples: &[f32], out: &mut [f32]) {
159    let mut s_vec = [0f32; 32];
160    let mut d_vec = [0f32; 32];
161
162    assert!(in_samples.len() == 32 * n_frames);
163
164    // There are 18 synthesized PCM sample blocks.
165    for b in 0..n_frames {
166        // First, select the b-th sample from each of the 32 sub-bands, and place them in the s
167        // vector, s_vec.
168        for i in 0..32 {
169            s_vec[i] = in_samples[n_frames * i + b];
170        }
171
172        // Get the front slot of the v_vec FIFO.
173        let v_vec = &mut state.v_vec[state.v_front];
174
175        // Matrixing is performed next. As per the standard, matrixing would require 2048
176        // multiplications per sub-band! However, following the method by Konstantinides
177        // published in [1], it is possible to achieve the same result through the use of a 32-point
178        // DCT followed by some reconstruction.
179        //
180        // It should be noted that this is a deceptively simple solution. It is instructive to
181        // derive the algorithm before getting to the implementation to better understand what is
182        // happening, and where the edge-cases are.
183        //
184        // First, there are a few key observations to this approach:
185        //
186        //     1) The "matrixing" operation as per the standard is simply a 32-point MDCT. Note that
187        //        an N-point MDCT produces a 2N-point output.
188        //
189        //     2) The output of the MDCT contains repeated blocks of samples. If the result of a
190        //        MDCT defined as is X[0..64), then:
191        //
192        //          1) X(16.. 0] =  X(48..32]
193        //          2) X[48..64) = -X[16..32)
194        //
195        //        Corollary: Only points [16..48) of the MDCT are actually required! All other
196        //                   points are redundant.
197        //
198        //      3) Points [16..48) of the MDCT can be mapped from a 32-point DCT of the input
199        //         vector thus allowing the use of an efficient DCT algorithm.
200        //
201        // The mappings above can be found graphically by plotting each row of the cosine
202        // coefficient matricies of both the DCT and MDCT side-by side. The mapping becomes readily
203        // apparent, and so too do the exceptions.
204        //
205        // Using the observations above, if we apply a 32-point DCT transform to the input vector,
206        // s_vec, and place the output in the DCT output vector, d_vec, we obtain the plot labelled
207        // d_vec below.
208        //
209        // Next, assuming the 32-point MDCT output vector is denoted v_vec. Map the samples from the
210        // 32-point DCT, d_vec[0..32], to points v_vec[0..16], v_vec[16..32], v_vec[32..48], and
211        // v_vec[48..64] of the 32-point MDCT. The result is depicted graphically in the plot
212        // labelled v_vec below.
213        //
214        // d_vec        0              16             32
215        //              .               .              .
216        //              .     +---------+   +----------+
217        //              +-----+    A    | /     B      |
218        //              +---------------+--------------+
219        //
220        // v_vec        0              16             32             48              64
221        //              .               .              .              .               .
222        //              .   +-----------+              .              .               .
223        //              . /      B      |              .              .               .
224        //              +---------------+--------------+--------------+---------------+
225        //              .               |     -B     / |   -A   +-----+-----+   -A    |
226        //              .               +----------+   +--------+     .     +---------+
227        //
228        // Note however that the mappings in the previous step have exceptions for boundary samples.
229        // These exceptions can be seen when plotting the coefficient matricies as mentioned above.
230        // The mapping for boundary samples are as follows:
231        //
232        //     1) v_vec[ 0] =  d_vec[16]
233        //     2) v_vec[32] = -d_vec[16]
234        //     3) v_vec[48] = -d_vec[ 0]
235        //     4) v_vec[16] =  0.0
236        //
237        // The final algorithm written below performs the copy and flip operations of each 16 sample
238        // quadrant in seperate loops to assist auto-vectorization. The boundary samples are
239        // excluded from these loops and handled manually afterwards.
240        //
241        // [1] K. Konstantinides, "Fast subband filtering in MPEG audio coding", Signal Processing
242        // Letters IEEE, vol. 1, no. 2, pp. 26-28, 1994.
243        //
244        // https://ieeexplore.ieee.org/abstract/document/300309
245        dct32(&s_vec, &mut d_vec);
246
247        for (d, s) in v_vec[48 - 15..48 + 0].iter_mut().rev().zip(&d_vec[1..16]) {
248            *d = -s;
249        }
250        for (d, s) in v_vec[48 + 1..48 + 16].iter_mut().zip(&d_vec[1..16]) {
251            *d = -s;
252        }
253        for (d, s) in v_vec[16 + 1..16 + 16].iter_mut().rev().zip(&d_vec[17..32]) {
254            *d = -s;
255        }
256        for (d, s) in v_vec[1..16].iter_mut().zip(&d_vec[17..32]) {
257            *d = *s;
258        }
259
260        v_vec[0] = d_vec[16];
261        v_vec[32] = -d_vec[16];
262        v_vec[48] = -d_vec[0];
263        v_vec[16] = 0.0;
264
265        // Next, as per the specification, build a vector, u_vec, by iterating over the 16 slots in
266        // v_vec, and copying the first 32 samples of EVEN numbered v_vec slots, and the last 32
267        // samples of ODD numbered v_vec slots sequentially into u_vec.
268        //
269        // For example, given:
270        //
271        //        0   32   64   96  128  160  192  224  256           896  928   960  992 1024
272        //        +----+----+----+----+----+----+----+----+ . . . . . . +----+-----+----+----+
273        // v_vec  | a  : b  | c  : d  | e  : f  | g  | h  | . . . . . . | w  : x   | y  : z  |
274        //        +----+----+----+----+----+----+----+----+ . . . . . . +----+-----+----+----+
275        //        [ Slot 0 ][ Slot 1 ][ Slot 2 ][ Slot 3 ]  . . . . . . [ Slot 14 ][ Slot 15 ]
276        //
277        // Assuming v_front, the front of the FIFO, is slot 0, then u_vec is filled as follows:
278        //
279        //        0   32   64   96  128       448  480  512
280        //        +----+----+----+----+ . . . . +----+----+
281        // u_vec  | a  | d  | e  | h  | . . . . | w  | z  |
282        //        +----+----+----+----+ . . . . +----+----+
283        //
284        // Finally, generate the 32 sample PCM blocks. Assuming s[i] is sample i of a PCM sample
285        // block, the following equation governs sample generation:
286        //
287        //         16
288        // s[i] = SUM { u_vec[32*j + i] * D[32*j + i] }    for i=0..32
289        //        j=0
290        //
291        // where:
292        //     D[0..512] is the synthesis window provided in table B.3 of ISO/IEC 11172-3.
293        //
294        // In words, u_vec is logically partitioned into 16 slots of 32 samples each (i.e.,
295        // slot 0 spans u_vec[0..32], slot 1 spans u_vec[32..64], and so on). Then, the i-th
296        // sample in the PCM block is the summation of the i-th sample in each of the 16 u_vec
297        // slots after being multiplied by the synthesis window.
298        //
299        // But wait! This is VERY inefficient!
300        //
301        // If PCM sample generation is reframed such that instead of iterating j for every i, i is
302        // iterated through for every j, then it is possible to iterate straight-through
303        // v_vec[j][0..32] and D[32*j..(32*j) + 32] while multiplying and accumulating the
304        // intermediary calculations in a zeroed output vector, o_vec. After iterating over every j,
305        // o_vec can be copied to the output sample buffer, out, in one block.
306        //
307        // Using this method, there is no reason to build u_vec and cache locality is greatly
308        // improved.
309        let mut o_vec = [0f32; 32];
310
311        for j in 0..8 {
312            let v_start = state.v_front + (j << 1);
313
314            let v0 = &state.v_vec[(v_start + 0) & 0xf][0..32];
315            let v1 = &state.v_vec[(v_start + 1) & 0xf][32..64];
316
317            let k = j << 6;
318
319            for i in 0..32 {
320                o_vec[i] += v0[i] * SYNTHESIS_D[k + i + 0];
321                o_vec[i] += v1[i] * SYNTHESIS_D[k + i + 32];
322            }
323        }
324
325        // Clamp and copy the PCM samples from o_vec to the output buffer.
326        let offset = b << 5;
327
328        for (o, s) in out[offset..offset + 32].iter_mut().zip(&o_vec) {
329            *o = s.clamp(-1.0, 1.0);
330        }
331
332        // Shift the v_vec FIFO. The value v_front is the index of the 64 sample slot in v_vec
333        // that will be overwritten next iteration. Conversely, that makes it the front of the
334        // FIFO for the purpose of building u_vec. We would like to overwrite the oldest slot,
335        // so we subtract 1 via a wrapping addition to move the front backwards by 1 slot,
336        // effectively overwriting the oldest slot with the soon-to-be newest.
337        state.v_front = (state.v_front + 15) & 0xf;
338    }
339}
340
341/// Performs a 32-point Discrete Cosine Transform (DCT) using Byeong Gi Lee's fast algorithm
342/// published in article [1] without inverse square-root 2 scaling.
343///
344/// This is a straight-forward implemention of the recursive algorithm, flattened into a single
345/// function body to avoid the overhead of function calls and the stack.
346///
347/// [1] B.G. Lee, "A new algorithm to compute the discrete cosine transform", IEEE Transactions
348/// on Acoustics, Speech, and Signal Processing, vol. 32, no. 6, pp. 1243-1245, 1984.
349///
350/// https://ieeexplore.ieee.org/document/1164443
351fn dct32(x: &[f32; 32], y: &mut [f32; 32]) {
352    // The following tables are pre-computed values of the the following equation:
353    //
354    // c[i] = 1.0 / [2.0 * cos((PI / N) * (2*i + 1))]    for i = 0..N/2
355    //
356    // where N = [32, 16, 8, 4, 2], for COS_16, COS8, COS_4, and COS_2, respectively.
357    const COS_16: [f32; 16] = [
358        0.500_602_998_235_196_3,  // i= 0
359        0.505_470_959_897_543_6,  // i= 1
360        0.515_447_309_922_624_6,  // i= 2
361        0.531_042_591_089_784_1,  // i= 3
362        0.553_103_896_034_444_5,  // i= 4
363        0.582_934_968_206_133_9,  // i= 5
364        0.622_504_123_035_664_8,  // i= 6
365        0.674_808_341_455_005_7,  // i= 7
366        0.744_536_271_002_298_6,  // i= 8
367        0.839_349_645_415_526_8,  // i= 9
368        0.972_568_237_861_960_8,  // i=10
369        1.169_439_933_432_884_7,  // i=11
370        1.484_164_616_314_166_2,  // i=12
371        2.057_781_009_953_410_8,  // i=13
372        3.407_608_418_468_719_0,  // i=14
373        10.190_008_123_548_032_9, // i=15
374    ];
375
376    const COS_8: [f32; 8] = [
377        0.502_419_286_188_155_7, // i=0
378        0.522_498_614_939_688_9, // i=1
379        0.566_944_034_816_357_7, // i=2
380        0.646_821_783_359_990_1, // i=3
381        0.788_154_623_451_250_2, // i=4
382        1.060_677_685_990_347_1, // i=5
383        1.722_447_098_238_334_2, // i=6
384        5.101_148_618_689_155_3, // i=7
385    ];
386
387    const COS_4: [f32; 4] = [
388        0.509_795_579_104_159_2, // i=0
389        0.601_344_886_935_045_3, // i=1
390        0.899_976_223_136_415_6, // i=2
391        2.562_915_447_741_505_5, // i=3
392    ];
393
394    const COS_2: [f32; 2] = [
395        0.541_196_100_146_197_0, // i=0
396        1.306_562_964_876_376_4, // i=1
397    ];
398
399    const COS_1: f32 = 0.707_106_781_186_547_5;
400
401    // 16-point DCT decomposition
402    let mut t0 = [
403        (x[0] + x[32 - 1]),
404        (x[1] + x[32 - 2]),
405        (x[2] + x[32 - 3]),
406        (x[3] + x[32 - 4]),
407        (x[4] + x[32 - 5]),
408        (x[5] + x[32 - 6]),
409        (x[6] + x[32 - 7]),
410        (x[7] + x[32 - 8]),
411        (x[8] + x[32 - 9]),
412        (x[9] + x[32 - 10]),
413        (x[10] + x[32 - 11]),
414        (x[11] + x[32 - 12]),
415        (x[12] + x[32 - 13]),
416        (x[13] + x[32 - 14]),
417        (x[14] + x[32 - 15]),
418        (x[15] + x[32 - 16]),
419        (x[0] - x[32 - 1]) * COS_16[0],
420        (x[1] - x[32 - 2]) * COS_16[1],
421        (x[2] - x[32 - 3]) * COS_16[2],
422        (x[3] - x[32 - 4]) * COS_16[3],
423        (x[4] - x[32 - 5]) * COS_16[4],
424        (x[5] - x[32 - 6]) * COS_16[5],
425        (x[6] - x[32 - 7]) * COS_16[6],
426        (x[7] - x[32 - 8]) * COS_16[7],
427        (x[8] - x[32 - 9]) * COS_16[8],
428        (x[9] - x[32 - 10]) * COS_16[9],
429        (x[10] - x[32 - 11]) * COS_16[10],
430        (x[11] - x[32 - 12]) * COS_16[11],
431        (x[12] - x[32 - 13]) * COS_16[12],
432        (x[13] - x[32 - 14]) * COS_16[13],
433        (x[14] - x[32 - 15]) * COS_16[14],
434        (x[15] - x[32 - 16]) * COS_16[15],
435    ];
436
437    // 16-point DCT decomposition of t0[0..16]
438    {
439        let mut t1 = [
440            (t0[0] + t0[16 - 1]),
441            (t0[1] + t0[16 - 2]),
442            (t0[2] + t0[16 - 3]),
443            (t0[3] + t0[16 - 4]),
444            (t0[4] + t0[16 - 5]),
445            (t0[5] + t0[16 - 6]),
446            (t0[6] + t0[16 - 7]),
447            (t0[7] + t0[16 - 8]),
448            (t0[0] - t0[16 - 1]) * COS_8[0],
449            (t0[1] - t0[16 - 2]) * COS_8[1],
450            (t0[2] - t0[16 - 3]) * COS_8[2],
451            (t0[3] - t0[16 - 4]) * COS_8[3],
452            (t0[4] - t0[16 - 5]) * COS_8[4],
453            (t0[5] - t0[16 - 6]) * COS_8[5],
454            (t0[6] - t0[16 - 7]) * COS_8[6],
455            (t0[7] - t0[16 - 8]) * COS_8[7],
456        ];
457
458        // 8-point DCT decomposition of t1[0..8]
459        {
460            let mut t2 = [
461                (t1[0] + t1[8 - 1]),
462                (t1[1] + t1[8 - 2]),
463                (t1[2] + t1[8 - 3]),
464                (t1[3] + t1[8 - 4]),
465                (t1[0] - t1[8 - 1]) * COS_4[0],
466                (t1[1] - t1[8 - 2]) * COS_4[1],
467                (t1[2] - t1[8 - 3]) * COS_4[2],
468                (t1[3] - t1[8 - 4]) * COS_4[3],
469            ];
470
471            // 4-point DCT decomposition of t2[0..4]
472            {
473                let mut t3 = [
474                    (t2[0] + t2[4 - 1]),
475                    (t2[1] + t2[4 - 2]),
476                    (t2[0] - t2[4 - 1]) * COS_2[0],
477                    (t2[1] - t2[4 - 2]) * COS_2[1],
478                ];
479
480                // 2-point DCT decomposition of t3[0..2]
481                {
482                    let t4 = [(t3[0] + t3[2 - 1]), (t3[0] - t3[2 - 1]) * COS_1];
483
484                    t3[0] = t4[0];
485                    t3[1] = t4[1];
486                }
487
488                // 2-point DCT decomposition of t3[2..4]
489                {
490                    let t4 = [(t3[2] + t3[4 - 1]), (t3[2] - t3[4 - 1]) * COS_1];
491
492                    t3[2 + 0] = t4[0];
493                    t3[2 + 1] = t4[1];
494                }
495
496                t2[0 + 0] = t3[0];
497                t2[0 + 1] = t3[2] + t3[3];
498                t2[0 + 2] = t3[1];
499                t2[0 + 3] = t3[3];
500            }
501
502            // 4-point DCT decomposition of t2[4..8]
503            {
504                let mut t3 = [
505                    (t2[4] + t2[8 - 1]),
506                    (t2[5] + t2[8 - 2]),
507                    (t2[4] - t2[8 - 1]) * COS_2[0],
508                    (t2[5] - t2[8 - 2]) * COS_2[1],
509                ];
510
511                // 2-point DCT decomposition of t3[0..2]
512                {
513                    let t4 = [(t3[0] + t3[2 - 1]), (t3[0] - t3[2 - 1]) * COS_1];
514
515                    t3[0] = t4[0];
516                    t3[1] = t4[1];
517                }
518
519                // 2-point DCT decomposition of t3[2..4]
520                {
521                    let t4 = [(t3[2] + t3[4 - 1]), (t3[2] - t3[4 - 1]) * COS_1];
522
523                    t3[2 + 0] = t4[0];
524                    t3[2 + 1] = t4[1];
525                }
526
527                t2[4 + 0] = t3[0];
528                t2[4 + 1] = t3[2] + t3[3];
529                t2[4 + 2] = t3[1];
530                t2[4 + 3] = t3[3];
531            }
532
533            // Recombine t2[0..4] and t2[4..8], overwriting t1[0..8].
534            for i in 0..3 {
535                t1[(i << 1) + 0] = t2[i];
536                t1[(i << 1) + 1] = t2[4 + i] + t2[4 + i + 1];
537            }
538
539            t1[8 - 2] = t2[4 - 1];
540            t1[8 - 1] = t2[8 - 1];
541        }
542
543        // 8-point DCT decomposition of t1[8..16]
544        {
545            let mut t2 = [
546                (t1[8] + t1[16 - 1]),
547                (t1[9] + t1[16 - 2]),
548                (t1[10] + t1[16 - 3]),
549                (t1[11] + t1[16 - 4]),
550                (t1[8] - t1[16 - 1]) * COS_4[0],
551                (t1[9] - t1[16 - 2]) * COS_4[1],
552                (t1[10] - t1[16 - 3]) * COS_4[2],
553                (t1[11] - t1[16 - 4]) * COS_4[3],
554            ];
555
556            // 4-point DCT decomposition of t2[0..4]
557            {
558                let mut t3 = [
559                    (t2[0] + t2[4 - 1]),
560                    (t2[1] + t2[4 - 2]),
561                    (t2[0] - t2[4 - 1]) * COS_2[0],
562                    (t2[1] - t2[4 - 2]) * COS_2[1],
563                ];
564
565                // 2-point DCT decomposition of t3[0..2]
566                {
567                    let t4 = [(t3[0] + t3[2 - 1]), (t3[0] - t3[2 - 1]) * COS_1];
568
569                    t3[0] = t4[0];
570                    t3[1] = t4[1];
571                }
572
573                // 2-point DCT decomposition of t3[2..4]
574                {
575                    let t4 = [(t3[2] + t3[4 - 1]), (t3[2] - t3[4 - 1]) * COS_1];
576
577                    t3[2 + 0] = t4[0];
578                    t3[2 + 1] = t4[1];
579                }
580
581                t2[0 + 0] = t3[0];
582                t2[0 + 1] = t3[2] + t3[3];
583                t2[0 + 2] = t3[1];
584                t2[0 + 3] = t3[3];
585            }
586
587            // 4-point DCT decomposition of t2[4..8]
588            {
589                let mut t3 = [
590                    (t2[4] + t2[8 - 1]),
591                    (t2[5] + t2[8 - 2]),
592                    (t2[4] - t2[8 - 1]) * COS_2[0],
593                    (t2[5] - t2[8 - 2]) * COS_2[1],
594                ];
595
596                // 2-point DCT decomposition of t3[0..2]
597                {
598                    let t4 = [(t3[0] + t3[2 - 1]), (t3[0] - t3[2 - 1]) * COS_1];
599
600                    t3[0] = t4[0];
601                    t3[1] = t4[1];
602                }
603
604                // 2-point DCT decomposition of t3[2..4]
605                {
606                    let t4 = [(t3[2] + t3[4 - 1]), (t3[2] - t3[4 - 1]) * COS_1];
607
608                    t3[2 + 0] = t4[0];
609                    t3[2 + 1] = t4[1];
610                }
611
612                t2[4 + 0] = t3[0];
613                t2[4 + 1] = t3[2] + t3[3];
614                t2[4 + 2] = t3[1];
615                t2[4 + 3] = t3[3];
616            }
617
618            // Recombine t2[0..4] and t2[4..8], overwriting t1[8..16].
619            for i in 0..3 {
620                t1[8 + (i << 1) + 0] = t2[i];
621                t1[8 + (i << 1) + 1] = t2[4 + i] + t2[4 + i + 1];
622            }
623
624            t1[16 - 2] = t2[4 - 1];
625            t1[16 - 1] = t2[8 - 1];
626        }
627
628        // Recombine t1[0..8] and t1[8..16], overwriting t0[0..16].
629        for i in 0..7 {
630            t0[(i << 1) + 0] = t1[i];
631            t0[(i << 1) + 1] = t1[8 + i] + t1[8 + i + 1];
632        }
633
634        t0[16 - 2] = t1[8 - 1];
635        t0[16 - 1] = t1[16 - 1];
636    }
637
638    // 16-point DCT decomposition of t0[16..32]
639    {
640        let mut t1 = [
641            (t0[16] + t0[32 - 1]),
642            (t0[17] + t0[32 - 2]),
643            (t0[18] + t0[32 - 3]),
644            (t0[19] + t0[32 - 4]),
645            (t0[20] + t0[32 - 5]),
646            (t0[21] + t0[32 - 6]),
647            (t0[22] + t0[32 - 7]),
648            (t0[23] + t0[32 - 8]),
649            (t0[16] - t0[32 - 1]) * COS_8[0],
650            (t0[17] - t0[32 - 2]) * COS_8[1],
651            (t0[18] - t0[32 - 3]) * COS_8[2],
652            (t0[19] - t0[32 - 4]) * COS_8[3],
653            (t0[20] - t0[32 - 5]) * COS_8[4],
654            (t0[21] - t0[32 - 6]) * COS_8[5],
655            (t0[22] - t0[32 - 7]) * COS_8[6],
656            (t0[23] - t0[32 - 8]) * COS_8[7],
657        ];
658
659        // 8-point DCT decomposition of t1[0..8]
660        {
661            let mut t2 = [
662                (t1[0] + t1[8 - 1]),
663                (t1[1] + t1[8 - 2]),
664                (t1[2] + t1[8 - 3]),
665                (t1[3] + t1[8 - 4]),
666                (t1[0] - t1[8 - 1]) * COS_4[0],
667                (t1[1] - t1[8 - 2]) * COS_4[1],
668                (t1[2] - t1[8 - 3]) * COS_4[2],
669                (t1[3] - t1[8 - 4]) * COS_4[3],
670            ];
671
672            // 4-point DCT decomposition of t2[0..4]
673            {
674                let mut t3 = [
675                    (t2[0] + t2[4 - 1]),
676                    (t2[1] + t2[4 - 2]),
677                    (t2[0] - t2[4 - 1]) * COS_2[0],
678                    (t2[1] - t2[4 - 2]) * COS_2[1],
679                ];
680
681                // 2-point DCT decomposition of t3[0..2]
682                {
683                    let t4 = [(t3[0] + t3[2 - 1]), (t3[0] - t3[2 - 1]) * COS_1];
684
685                    t3[0] = t4[0];
686                    t3[1] = t4[1];
687                }
688
689                // 2-point DCT decomposition of t3[2..4]
690                {
691                    let t4 = [(t3[2] + t3[4 - 1]), (t3[2] - t3[4 - 1]) * COS_1];
692
693                    t3[2 + 0] = t4[0];
694                    t3[2 + 1] = t4[1];
695                }
696
697                t2[0 + 0] = t3[0];
698                t2[0 + 1] = t3[2] + t3[3];
699                t2[0 + 2] = t3[1];
700                t2[0 + 3] = t3[3];
701            }
702
703            // 4-point DCT decomposition of t2[4..8]
704            {
705                let mut t3 = [
706                    (t2[4] + t2[8 - 1]),
707                    (t2[5] + t2[8 - 2]),
708                    (t2[4] - t2[8 - 1]) * COS_2[0],
709                    (t2[5] - t2[8 - 2]) * COS_2[1],
710                ];
711
712                // 2-point DCT decomposition of t3[0..2]
713                {
714                    let t4 = [(t3[0] + t3[2 - 1]), (t3[0] - t3[2 - 1]) * COS_1];
715
716                    t3[0] = t4[0];
717                    t3[1] = t4[1];
718                }
719
720                // 2-point DCT decomposition of t3[2..4]
721                {
722                    let t4 = [(t3[2] + t3[4 - 1]), (t3[2] - t3[4 - 1]) * COS_1];
723
724                    t3[2 + 0] = t4[0];
725                    t3[2 + 1] = t4[1];
726                }
727
728                t2[4 + 0] = t3[0];
729                t2[4 + 1] = t3[2] + t3[3];
730                t2[4 + 2] = t3[1];
731                t2[4 + 3] = t3[3];
732            }
733
734            // Recombine t2[0..4] and t2[4..8], overwriting t1[0..8].
735            for i in 0..3 {
736                t1[(i << 1) + 0] = t2[i];
737                t1[(i << 1) + 1] = t2[4 + i] + t2[4 + i + 1];
738            }
739
740            t1[8 - 2] = t2[4 - 1];
741            t1[8 - 1] = t2[8 - 1];
742        }
743
744        // 8-point DCT decomposition of t1[8..16]
745        {
746            let mut t2 = [
747                (t1[8] + t1[16 - 1]),
748                (t1[9] + t1[16 - 2]),
749                (t1[10] + t1[16 - 3]),
750                (t1[11] + t1[16 - 4]),
751                (t1[8] - t1[16 - 1]) * COS_4[0],
752                (t1[9] - t1[16 - 2]) * COS_4[1],
753                (t1[10] - t1[16 - 3]) * COS_4[2],
754                (t1[11] - t1[16 - 4]) * COS_4[3],
755            ];
756
757            // 4-point DCT decomposition of t2[0..4]
758            {
759                let mut t3 = [
760                    (t2[0] + t2[4 - 1]),
761                    (t2[1] + t2[4 - 2]),
762                    (t2[0] - t2[4 - 1]) * COS_2[0],
763                    (t2[1] - t2[4 - 2]) * COS_2[1],
764                ];
765
766                // 2-point DCT decomposition of t3[0..2]
767                {
768                    let t4 = [(t3[0] + t3[2 - 1]), (t3[0] - t3[2 - 1]) * COS_1];
769
770                    t3[0] = t4[0];
771                    t3[1] = t4[1];
772                }
773
774                // 2-point DCT decomposition of t3[2..4]
775                {
776                    let t4 = [(t3[2] + t3[4 - 1]), (t3[2] - t3[4 - 1]) * COS_1];
777
778                    t3[2 + 0] = t4[0];
779                    t3[2 + 1] = t4[1];
780                }
781
782                t2[0 + 0] = t3[0];
783                t2[0 + 1] = t3[2] + t3[3];
784                t2[0 + 2] = t3[1];
785                t2[0 + 3] = t3[3];
786            }
787
788            // 4-point DCT decomposition of t2[4..8]
789            {
790                let mut t3 = [
791                    (t2[4] + t2[8 - 1]),
792                    (t2[5] + t2[8 - 2]),
793                    (t2[4] - t2[8 - 1]) * COS_2[0],
794                    (t2[5] - t2[8 - 2]) * COS_2[1],
795                ];
796
797                // 2-point DCT decomposition of t3[0..2]
798                {
799                    let t4 = [(t3[0] + t3[2 - 1]), (t3[0] - t3[2 - 1]) * COS_1];
800
801                    t3[0] = t4[0];
802                    t3[1] = t4[1];
803                }
804
805                // 2-point DCT decomposition of t3[2..4]
806                {
807                    let t4 = [(t3[2] + t3[4 - 1]), (t3[2] - t3[4 - 1]) * COS_1];
808
809                    t3[2 + 0] = t4[0];
810                    t3[2 + 1] = t4[1];
811                }
812
813                t2[4 + 0] = t3[0];
814                t2[4 + 1] = t3[2] + t3[3];
815                t2[4 + 2] = t3[1];
816                t2[4 + 3] = t3[3];
817            }
818
819            // Recombine t2[0..4] and t2[4..8], overwriting t1[8..16].
820            for i in 0..3 {
821                t1[8 + (i << 1) + 0] = t2[i];
822                t1[8 + (i << 1) + 1] = t2[4 + i] + t2[4 + i + 1];
823            }
824
825            t1[16 - 2] = t2[4 - 1];
826            t1[16 - 1] = t2[8 - 1];
827        }
828
829        // Recombine t1[0..8] and t1[8..16], overwriting t0[0..16].
830        for i in 0..7 {
831            t0[16 + (i << 1) + 0] = t1[i];
832            t0[16 + (i << 1) + 1] = t1[8 + i] + t1[8 + i + 1];
833        }
834
835        t0[32 - 2] = t1[8 - 1];
836        t0[32 - 1] = t1[16 - 1];
837    }
838
839    // Recombine t1[0..16] and t1[16..32] into y.
840    for i in 0..15 {
841        y[(i << 1) + 0] = t0[i];
842        y[(i << 1) + 1] = t0[16 + i] + t0[16 + i + 1];
843    }
844
845    y[32 - 2] = t0[16 - 1];
846    y[32 - 1] = t0[32 - 1];
847}
848
849#[cfg(test)]
850mod tests {
851    use super::dct32;
852    use std::f64;
853
854    fn dct32_analytical(x: &[f32; 32]) -> [f32; 32] {
855        const PI_32: f64 = f64::consts::PI / 32.0;
856
857        let mut result = [0f32; 32];
858        for (i, item) in result.iter_mut().enumerate() {
859            *item = x
860                .iter()
861                .enumerate()
862                .map(|(j, &jtem)| jtem * (PI_32 * (i as f64) * ((j as f64) + 0.5)).cos() as f32)
863                .sum();
864        }
865
866        result
867    }
868
869    #[test]
870    fn verify_dct32() {
871        const TEST_VECTOR: [f32; 32] = [
872            0.1710, 0.1705, 0.3476, 0.1866, 0.4784, 0.6525, 0.2690, 0.9996, //
873            0.1864, 0.7277, 0.1163, 0.6620, 0.0911, 0.3225, 0.1126, 0.5344, //
874            0.7839, 0.9741, 0.8757, 0.5763, 0.5926, 0.2756, 0.1757, 0.6531, //
875            0.7101, 0.7376, 0.1924, 0.0351, 0.8044, 0.2409, 0.9347, 0.9417, //
876        ];
877
878        let mut test_result = [0f32; 32];
879        dct32(&TEST_VECTOR, &mut test_result);
880
881        let actual_result = dct32_analytical(&TEST_VECTOR);
882        for i in 0..32 {
883            assert!((actual_result[i] - test_result[i]).abs() < 0.00001);
884        }
885    }
886}