// lewton/imdct.rs

1// Vorbis decoder written in Rust
2//
3// Copyright (c) 2016 est31 <MTest31@outlook.com>
4// and contributors. All rights reserved.
5// Licensed under MIT license, or Apache 2 license,
6// at your option. Please see the LICENSE file
7// attached to this source distribution for details.
8
9// This file is a very close translation of the
10// implementation of the algorithm from stb_vorbis.
11
12use ::header_cached::CachedBlocksizeDerived;
13
// Iteration 0 of "step 3" of the IMDCT kernel: one butterfly pass over
// the working buffer `e`. Close translation of stb_vorbis'
// imdct_step3_iter0_loop.
//
// * `n`     - number of element pairs to process; must be a multiple of
//             4 because the loop body is unrolled 4x (asserted below).
// * `e`     - working buffer, read and written in place, walking
//             downward from `i_off`.
// * `k_off` - (negative) offset from the first butterfly operand to the
//             second one.
// * `a`     - precomputed twiddle factor table, read in pairs with a
//             fixed stride of 8.
fn imdct_step3_iter0_loop(n :usize, e :&mut[f32], i_off :usize, k_off :isize, a :&[f32]) {
	let mut a_offs = 0;
	let mut i_offs = i_off;
	// Kept signed: k_off is negative, so the sum is computed in isize.
	let mut k_offs = i_off as isize + k_off;

	// First butterfly operand: e[i_offs ± x].
	macro_rules! ee0 {
		(-$x:expr) => {e[i_offs - ($x as usize)]};
		($x:expr) => {e[i_offs + ($x as usize)]}
	}

	// Second butterfly operand: e[k_offs ± x] (signed offset arithmetic).
	macro_rules! ee2 {
		(-$x:expr) => {e[(k_offs - $x) as usize]};
		($x:expr) => {e[(k_offs + $x) as usize]}
	}

	// Twiddle factor: a[a_offs + x].
	macro_rules! aa {
		($x:expr) => {a[a_offs + ($x as usize)]}
	}

	// The loop body below is unrolled 4 times.
	assert_eq!((n & 3), 0);

	for _ in 0 .. n >> 2 {
		// For each element pair: the sum replaces the ee0 element, the
		// difference is rotated by the twiddle pair (complex multiply)
		// and replaces the ee2 element.
		let mut k00_20 = ee0![ 0] - ee2![ 0];
		let mut k01_21 = ee0![-1] - ee2![-1];
		ee0![ 0] += ee2![ 0];
		ee0![-1] += ee2![-1];
		ee2![ 0] = k00_20 * aa![0] - k01_21 * aa![1];
		ee2![-1] = k01_21 * aa![0] + k00_20 * aa![1];
		a_offs += 8;

		k00_20  = ee0![-2] - ee2![-2];
		k01_21  = ee0![-3] - ee2![-3];
		ee0![-2] += ee2![-2];
		ee0![-3] += ee2![-3];
		ee2![-2] = k00_20 * aa![0] - k01_21 * aa![1];
		ee2![-3] = k01_21 * aa![0] + k00_20 * aa![1];
		a_offs += 8;

		k00_20  = ee0![-4] - ee2![-4];
		k01_21  = ee0![-5] - ee2![-5];
		ee0![-4] += ee2![-4];
		ee0![-5] += ee2![-5];
		ee2![-4] = k00_20 * aa![0] - k01_21 * aa![1];
		ee2![-5] = k01_21 * aa![0] + k00_20 * aa![1];
		a_offs += 8;

		k00_20  = ee0![-6] - ee2![-6];
		k01_21  = ee0![-7] - ee2![-7];
		ee0![-6] += ee2![-6];
		ee0![-7] += ee2![-7];
		ee2![-6] = k00_20 * aa![0] - k01_21 * aa![1];
		ee2![-7] = k01_21 * aa![0] + k00_20 * aa![1];

		a_offs += 8;
		i_offs -= 8;
		k_offs -= 8;
	}
}
72
// Inner "r" loop of step 3 of the IMDCT kernel, for iterations >= 1.
// Close translation of stb_vorbis' imdct_step3_inner_r_loop.
//
// Same butterfly as imdct_step3_iter0_loop, but the stride `k1` with
// which the twiddle factor table `a` is traversed is a parameter
// instead of the fixed 8. `lim` is the number of element pairs to
// process (the loop body is unrolled 4x), `e` the working buffer,
// `d0` the start index (walking downward), and `k_off` the (negative)
// offset from the first butterfly operand to the second one.
fn imdct_step3_inner_r_loop(lim :usize, e :&mut [f32],
		d0 :usize, k_off :isize, a :&[f32], k1 :usize) {
	let mut a_offs = 0;
	let mut d0_offs = d0;
	// Kept signed: k_off is negative.
	let mut k_offs = d0 as isize + k_off;

	// First butterfly operand: e[d0_offs ± x].
	macro_rules! e0 {
		(-$x:expr) => {e[d0_offs - ($x as usize)]};
		($x:expr) => {e[d0_offs + ($x as usize)]}
	}

	// Second butterfly operand: e[k_offs ± x] (signed offset arithmetic).
	macro_rules! e2 {
		(-$x:expr) => {e[(k_offs - $x) as usize]};
		($x:expr) => {e[(k_offs + $x) as usize]}
	}

	// Twiddle factor: a[a_offs + x].
	macro_rules! aa {
		($x:expr) => {a[a_offs + ($x as usize)]}
	}

	for _ in 0 .. lim >> 2 {
		// Per pair: the sum replaces the e0 element, the difference is
		// rotated by the twiddle pair (complex multiply) into e2.
		let mut k00_20 = e0![-0] - e2![-0];
		let mut k01_21 = e0![-1] - e2![-1];
		e0![-0] += e2![-0];
		e0![-1] += e2![-1];
		e2![-0] = (k00_20) * aa![0] - (k01_21) * aa![1];
		e2![-1] = (k01_21) * aa![0] + (k00_20) * aa![1];

		a_offs += k1;

		k00_20 = e0![-2] - e2![-2];
		k01_21 = e0![-3] - e2![-3];
		e0![-2] += e2![-2];
		e0![-3] += e2![-3];
		e2![-2] = (k00_20) * aa![0] - (k01_21) * aa![1];
		e2![-3] = (k01_21) * aa![0] + (k00_20) * aa![1];

		a_offs += k1;

		k00_20 = e0![-4] - e2![-4];
		k01_21 = e0![-5] - e2![-5];
		e0![-4] += e2![-4];
		e0![-5] += e2![-5];
		e2![-4] = (k00_20) * aa![0] - (k01_21) * aa![1];
		e2![-5] = (k01_21) * aa![0] + (k00_20) * aa![1];

		a_offs += k1;

		k00_20 = e0![-6] - e2![-6];
		k01_21 = e0![-7] - e2![-7];
		e0![-6] += e2![-6];
		e0![-7] += e2![-7];
		e2![-6] = (k00_20) * aa![0] - (k01_21) * aa![1];
		e2![-7] = (k01_21) * aa![0] + (k00_20) * aa![1];

		d0_offs -= 8;
		k_offs -= 8;

		a_offs += k1;
	}
}
134
// Inner "s" loop of step 3 of the IMDCT kernel. Close translation of
// stb_vorbis' imdct_step3_inner_s_loop.
//
// Used for the later step 3 iterations where the s loop iterates more
// often than the r loop: the four twiddle factor pairs are constant
// across all `n` iterations, so they are loaded once up front. `e` is
// the working buffer, `i_off` the start index (walking downward in
// steps of `k0`), `k_off` the (negative) offset to the second
// butterfly operand, and `a_off` the spacing of the twiddle pairs
// inside `a`.
fn imdct_step3_inner_s_loop(n :usize, e :&mut [f32], i_off :usize, k_off :isize,
		a :&[f32], a_off :usize, k0 :usize) {
	// The four twiddle factor pairs, hoisted out of the loop.
	let a0 = a[0];
	let a1 = a[0+1];
	let a2 = a[0+a_off];
	let a3 = a[0+a_off+1];
	let a4 = a[0+a_off*2+0];
	let a5 = a[0+a_off*2+1];
	let a6 = a[0+a_off*3+0];
	let a7 = a[0+a_off*3+1];

	let mut i_offs = i_off;
	// i_off + k_off is non-negative here, so it can be cast back to usize.
	let mut k_offs = (i_off as isize + k_off) as usize;

	// First butterfly operand: e[i_offs ± x].
	macro_rules! ee0 {
		(-$x:expr) => {e[i_offs - ($x as usize)]};
		($x:expr) => {e[i_offs + ($x as usize)]}
	}

	// Second butterfly operand: e[k_offs ± x].
	macro_rules! ee2 {
		(-$x:expr) => {e[k_offs - ($x as usize)]};
		($x:expr) => {e[k_offs + ($x as usize)]}
	}

	let mut i = 0;
	loop {
		// Same butterfly as the other step 3 loops: sums stay in ee0,
		// twiddle-rotated differences go to ee2.
		let mut k00 = ee0![ 0] - ee2![ 0];
		let mut k11 = ee0![-1] - ee2![-1];
		ee0![ 0] =  ee0![ 0] + ee2![ 0];
		ee0![-1] =  ee0![-1] + ee2![-1];
		ee2![ 0] = (k00) * a0 - (k11) * a1;
		ee2![-1] = (k11) * a0 + (k00) * a1;

		k00      = ee0![-2] - ee2![-2];
		k11      = ee0![-3] - ee2![-3];
		ee0![-2] =  ee0![-2] + ee2![-2];
		ee0![-3] =  ee0![-3] + ee2![-3];
		ee2![-2] = (k00) * a2 - (k11) * a3;
		ee2![-3] = (k11) * a2 + (k00) * a3;

		k00      = ee0![-4] - ee2![-4];
		k11      = ee0![-5] - ee2![-5];
		ee0![-4] =  ee0![-4] + ee2![-4];
		ee0![-5] =  ee0![-5] + ee2![-5];
		ee2![-4] = (k00) * a4 - (k11) * a5;
		ee2![-5] = (k11) * a4 + (k00) * a5;

		k00      = ee0![-6] - ee2![-6];
		k11      = ee0![-7] - ee2![-7];
		ee0![-6] =  ee0![-6] + ee2![-6];
		ee0![-7] =  ee0![-7] + ee2![-7];
		ee2![-6] = (k00) * a6 - (k11) * a7;
		ee2![-7] = (k11) * a6 + (k00) * a7;

		i += 1;
		// we have this check instead of a for loop
		// over an iterator because otherwise we
		// overflow: the offsets must not be decremented
		// after the final iteration.
		if i >= n {
			break;
		}
		i_offs -= k0;
		k_offs -= k0;
	}
}
200
#[inline]
fn iter_54(zm7 :&mut [f32]) {
	// Length-8 butterfly kernel used by the merged final IMDCT passes
	// (see imdct_step3_inner_s_loop_ld654).
	//
	// Difference from the stb_vorbis implementation: there, z points at
	// the top element and is indexed with negative offsets. Rust
	// disallows negative indices, so zm7 points to z minus 7 and
	// zm7[7 - k] corresponds to stb's z[-k].

	// Load z[0] .. z[-7] once up front. In the original every cell is
	// read before it is overwritten, so hoisting all loads preserves
	// the results exactly (same operations, same association).
	let z0 = zm7[7];
	let z1 = zm7[6];
	let z2 = zm7[5];
	let z3 = zm7[4];
	let z4 = zm7[3];
	let z5 = zm7[2];
	let z6 = zm7[1];
	let z7 = zm7[0];

	zm7[7] = (z0 + z4) + (z2 + z6); // z0 + z4 + z2 + z6
	zm7[6] = (z1 + z5) + (z3 + z7); // z1 + z5 + z3 + z7
	zm7[5] = (z0 + z4) - (z2 + z6); // z0 + z4 - z2 - z6
	zm7[4] = (z1 + z5) - (z3 + z7); // z1 + z5 - z3 - z7
	zm7[3] = (z0 - z4) + (z3 - z7); // z0 - z4 + z3 - z7
	zm7[2] = (z1 - z5) - (z2 - z6); // z1 - z5 + z2 - z6
	zm7[1] = (z0 - z4) - (z3 - z7); // z0 - z4 - z3 + z7
	zm7[0] = (z1 - z5) + (z2 - z6); // z1 - z5 - z2 + z6
}
233
// The last three iterations (counts ld-6, ld-5, ld-4) of step 3 of the
// IMDCT kernel, interleaved. Close translation of stb_vorbis'
// imdct_step3_inner_s_loop_ld654.
//
// For these passes most twiddle factors are 1 and 0, so `a2` is the
// only table value needed; the trailing length-8 sub-butterflies are
// delegated to iter_54. `e` is the working buffer, processed in groups
// of 16 walking downward from `i_off`, `n` the number of groups, and
// `base_n` the full transform size (used to locate `a2` in `a`).
fn imdct_step3_inner_s_loop_ld654(n :usize, e :&mut [f32], i_off :usize,
	a :&[f32], base_n :usize)
{
	let a_off = base_n >> 3;
	let a2 = a[a_off];

	let mut z_offs = i_off;

	// Lowest z_offs that still gets processed. Note `n - 1 as usize`
	// parses as `n - 1`, so the loop below runs exactly n times.
	let basep16 = i_off - 16 * (n - 1 as usize);

	// z[-x] = e[z_offs - x]; only negative offsets occur here.
	macro_rules! z {
		(-$x:expr) => {e[z_offs - ($x as usize)]}
	}

	loop {
		// Butterflies with trivial twiddles (factors 1 and 0): the
		// rotated differences are stored directly.
		let mut k00 = z![-0] - z![-8];
		let mut k11 = z![-1] - z![-9];
		z![-0] = z![-0] + z![-8];
		z![-1] = z![-1] + z![-9];
		z![-8] =  k00;
		z![-9] =  k11;

		// This pair needs the one nontrivial twiddle factor a2.
		k00     = z![ -2] - z![-10];
		k11     = z![ -3] - z![-11];
		z![ -2] = z![ -2] + z![-10];
		z![ -3] = z![ -3] + z![-11];
		z![-10] = (k00+k11) * a2;
		z![-11] = (k11-k00) * a2;

		k00     = z![-12] - z![ -4];  // reverse to avoid a unary negation
		k11     = z![ -5] - z![-13];
		z![ -4] = z![ -4] + z![-12];
		z![ -5] = z![ -5] + z![-13];
		z![-12] = k11;
		z![-13] = k00;

		k00     = z![-14] - z![ -6];  // reverse to avoid a unary negation
		k11     = z![ -7] - z![-15];
		z![ -6] = z![ -6] + z![-14];
		z![ -7] = z![ -7] + z![-15];
		z![-14] = (k00+k11) * a2;
		z![-15] = (k00-k11) * a2;

		// Finish the 16-element group with two length-8 butterflies;
		// iter_54 expects a slice starting at "z minus 7".
		iter_54(e.split_at_mut(z_offs - 7).1);
		iter_54(e.split_at_mut(z_offs - 7 - 8).1);
		// We need to compare with basep16 here
		// in order to prevent a possible overflow
		// in calculation of base, and in calculation
		// of z_offs.
		if z_offs <= basep16 {
			break;
		}
		z_offs -= 16;
	}
}
289
/// Computes the inverse MDCT of `buffer`, in place.
///
/// Optimized version, a close translation of stb_vorbis' `inverse_mdct`.
/// `buffer.len()` must equal `1 << bs` (asserted below), and `cached_bd`
/// must hold the twiddle factor tables and bit reversal table derived
/// for that blocksize.
#[allow(dead_code)]
pub fn inverse_mdct(cached_bd :&CachedBlocksizeDerived, buffer :&mut [f32], bs :u8) {
	let n = buffer.len();
	// Pre-condition.
	assert_eq!(n, 1 << bs);

	let n2 = n >> 1;
	let n4 = n >> 2;
	let n8 = n >> 3;

	// TODO later on we might want to do Vec::with_capacity here,
	// and use buf2.push everywhere...
	let mut buf2 :Vec<f32> = vec![0.0; n2];

	// The precomputed twiddle factor tables.
	let ctf = &cached_bd.twiddle_factors;
	let a :&[f32] = &ctf.a;
	let b :&[f32] = &ctf.b;
	let c :&[f32] = &ctf.c;

	// `$i -= $x`, but breaks out of the enclosing loop instead of
	// panicking when the subtraction would underflow. This replaces
	// the signed "index >= 0" loop conditions of the C original.
	macro_rules! break_if_sub_overflows {
		($i:ident, $x:expr) => {
			$i = match $i.checked_sub($x) {
				Some(v) => v,
				None => break,
			};
		}
	}

	// IMDCT algorithm from "The use of multirate filter banks for coding of high quality digital audio"
	// See notes about bugs in that paper in less-optimal implementation 'inverse_mdct_old' in stb_vorbis original.

	// kernel from paper


	// merged:
	//   copy and reflect spectral data
	//   step 0

	// note that it turns out that the items added together during
	// this step are, in fact, being added to themselves (as reflected
	// by step 0). inexplicable inefficiency! this became obvious
	// once I combined the passes.

	// so there's a missing 'times 2' here (for adding X to itself).
	// this propogates through linearly to the end, where the numbers
	// are 1/2 too small, and need to be compensated for.

	{
		let mut a_offs = 0;
		let mut d_offs = n2 - 2;
		let mut e_offs = 0;
		let e_stop = n2;

		macro_rules! d {
			($x:expr) => {buf2[d_offs + ($x as usize)]}
		}
		macro_rules! aa {
			($x:expr) => {a[a_offs + ($x as usize)]}
		}
		macro_rules! e {
			($x:expr) => {buffer[e_offs + ($x as usize)]}
		}

		// First half of the spectral data, d walking down while e
		// walks up.
		// TODO replace the while with a for once step_by on iterators
		// is stabilized
		while e_offs != e_stop {
			d![1] = e![0] * aa![0] - e![2]*aa![1];
			d![0] = e![0] * aa![1] + e![2]*aa![0];
			d_offs -= 2;
			a_offs += 2;
			e_offs += 4;
		}

		// Second half: the reflected (negated) spectral data.
		e_offs = n2 - 3;
		loop {
			d![1] = -e![2] * aa![0] - -e![0]*aa![1];
			d![0] = -e![2] * aa![1] + -e![0]*aa![0];
			break_if_sub_overflows!(d_offs, 2);
			a_offs += 2;
			e_offs -= 4;
		}
	}


	{
		// now we use symbolic names for these, so that we can
		// possibly swap their meaning as we change which operations
		// are in place

		let u = &mut *buffer;
		let v = &mut *buf2;

		// step 2    (paper output is w, now u)
		// this could be in place, but the data ends up in the wrong
		// place... _somebody_'s got to swap it, so this is nominated
		{
			let mut a_offs = n2 - 8;
			let mut d0_offs = n4;
			let mut d1_offs = 0;
			let mut e0_offs = n4;
			let mut e1_offs = 0;

			macro_rules! aa {
				($x:expr) => {a[a_offs + ($x as usize)]}
			}
			macro_rules! d0 {
				($x:expr) => {u[d0_offs + ($x as usize)]}
			}
			macro_rules! d1 {
				($x:expr) => {u[d1_offs + ($x as usize)]}
			}
			macro_rules! e0 {
				($x:expr) => {v[e0_offs + ($x as usize)]}
			}
			macro_rules! e1 {
				($x:expr) => {v[e1_offs + ($x as usize)]}
			}

			loop {
				// Sums of the two halves go to d0, twiddle-rotated
				// differences to d1. The a table is walked backwards.
				let mut v41_21 = e0![1] - e1![1];
				let mut v40_20 = e0![0] - e1![0];
				d0![1]  = e0![1] + e1![1];
				d0![0]  = e0![0] + e1![0];
				d1![1]  = v41_21*aa![4] - v40_20*aa![5];
				d1![0]  = v40_20*aa![4] + v41_21*aa![5];

				v41_21 = e0![3] - e1![3];
				v40_20 = e0![2] - e1![2];
				d0![3]  = e0![3] + e1![3];
				d0![2]  = e0![2] + e1![2];
				d1![3]  = v41_21*aa![0] - v40_20*aa![1];
				d1![2]  = v40_20*aa![0] + v41_21*aa![1];

				break_if_sub_overflows!(a_offs, 8);

				d0_offs += 4;
				d1_offs += 4;
				e0_offs += 4;
				e1_offs += 4;
			}
		}


		// step 3

		let ld = bs as usize;

		// optimized step 3:

		// the original step3 loop can be nested r inside s or s inside r;
		// it's written originally as s inside r, but this is dumb when r
		// iterates many times, and s few. So I have two copies of it and
		// switch between them halfway.

		// this is iteration 0 of step 3
		imdct_step3_iter0_loop(n >> 4, u, n2-1-n4*0, -(n as isize >> 3), a);
		imdct_step3_iter0_loop(n >> 4, u, n2-1-n4*1, -(n as isize >> 3), a);

		// this is iteration 1 of step 3
		imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*0, -(n as isize >> 4), a, 16);
		imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*1, -(n as isize >> 4), a, 16);
		imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*2, -(n as isize >> 4), a, 16);
		imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*3, -(n as isize >> 4), a, 16);

		// iterations 2 .. (ld-3)/2: r loop nested inside the s iteration
		for l in 2 .. (ld - 3) >> 1 {
			let k0 = n >> (l + 2);
			let k0_2 = k0 as isize >> 1;
			let lim = 1 << (l+1);
			for i in 0 .. lim {
				imdct_step3_inner_r_loop(n >> (l + 4),
					u, n2-1 - k0*i, -k0_2, a, 1 << (l+3));
			}
		}
		// remaining iterations up to ld-6: s loop nested inside the
		// r iteration
		for l in (ld - 3) >> 1 .. ld - 6 {
			let k0 = n >> (l + 2);
			let k1 = 1 << (l + 3);
			let k0_2 = k0 as isize >> 1;
			let rlim = n >> (l + 6);
			let lim = 1 << (l + 1);
			let mut i_off = n2 - 1;
			let mut a_off = 0;
			for _ in 0 .. rlim {
				// Pass a subslice so the s loop sees its twiddle
				// factors starting at index 0.
				let a0 = a.split_at(a_off).1;
				imdct_step3_inner_s_loop(lim, u, i_off, -k0_2, a0, k1, k0);
				a_off += k1 * 4;
				i_off -= 8;
			}
		}

		// iterations with count:
		//   ld-6,-5,-4 all interleaved together
		//       the big win comes from getting rid of needless flops
		//         due to the constants on pass 5 & 4 being all 1 and 0;
		//       combining them to be simultaneous to improve cache made little difference
		imdct_step3_inner_s_loop_ld654(n >> 5, u, n2 - 1, a, n);

		// output is u

		// step 4, 5, and 6
		// cannot be in-place because of step 5
		{
			let bitrev_vec = &cached_bd.bitrev;
			// weirdly, I'd have thought reading sequentially and writing
			// erratically would have been better than vice-versa, but in
			// fact that's not what my testing showed. (That is, with
			// j = bitreverse(i), do you read i and write j, or read j and write i.)

			let mut d0_offs = n4 - 4;
			let mut d1_offs = n2 - 4;
			let mut bitrev_offs = 0;

			macro_rules! d0 {
				($x:expr) => {v[d0_offs + ($x as usize)]}
			}
			macro_rules! d1 {
				($x:expr) => {v[d1_offs + ($x as usize)]}
			}
			macro_rules! bitrev {
				($x:expr) => {bitrev_vec[bitrev_offs + ($x as usize)]}
			}

			loop {
				// Read from the bit-reversed position in u, write
				// sequentially (downward) into v.
				let mut k4 = bitrev![0] as usize;
				d1![3] = u[k4 + 0];
				d1![2] = u[k4 + 1];
				d0![3] = u[k4 + 2];
				d0![2] = u[k4 + 3];

				k4 = bitrev![1] as usize;
				d1![1] = u[k4 + 0];
				d1![0] = u[k4 + 1];
				d0![1] = u[k4 + 2];
				d0![0] = u[k4 + 3];

				break_if_sub_overflows!(d0_offs, 4);
				d1_offs -= 4;
				bitrev_offs += 2;
			}
		}
		// (paper output is u, now v)

		// step 7   (paper output is v, now v)
		// this is now in place
		{
			let mut c_offs = 0;
			let mut d_offs = 0;
			let mut e_offs = n2 - 4;

			macro_rules! cc {
				($x:expr) => {c[c_offs + ($x as usize)]}
			}
			macro_rules! d {
				($x:expr) => {v[d_offs + ($x as usize)]}
			}
			macro_rules! e {
				($x:expr) => {v[e_offs + ($x as usize)]}
			}
			// d walks up from the start, e walks down from the end;
			// mirrored 4-element groups are combined until they meet.
			while d_offs < e_offs {
				let mut a02 = d![0] - e![2];
				let mut a11 = d![1] + e![3];

				let mut b0 = cc![1]*a02 + cc![0]*a11;
				let mut b1 = cc![1]*a11 - cc![0]*a02;

				let mut b2 = d![0] + e![ 2];
				let mut b3 = d![1] - e![ 3];

				d![0] = b2 + b0;
				d![1] = b3 + b1;
				e![2] = b2 - b0;
				e![3] = b1 - b3;

				a02 = d![2] - e![0];
				a11 = d![3] + e![1];

				b0 = cc![3]*a02 + cc![2]*a11;
				b1 = cc![3]*a11 - cc![2]*a02;

				b2 = d![2] + e![ 0];
				b3 = d![3] - e![ 1];

				d![2] = b2 + b0;
				d![3] = b3 + b1;
				e![0] = b2 - b0;
				e![1] = b1 - b3;

				c_offs += 4;
				d_offs += 4;
				e_offs -= 4;
			}
		}
	}

	// step 8+decode   (paper output is X, now buffer)
	// this generates pairs of data a la 8 and pushes them directly through
	// the decode kernel (pushing rather than pulling) to avoid having
	// to make another pass later

	// this cannot POSSIBLY be in place, so we refer to the buffers directly
	{
		// The four output quarters are written simultaneously: d0/d2
		// walk up, d1/d3 walk down, consuming v (buf2) from the top.
		let mut d0_offs = 0;
		let mut d1_offs = n2 - 4;
		let mut d2_offs = n2;
		let mut d3_offs = n - 4;

		let mut b_offs = n2 - 8;
		let mut e_offs = n2 - 8;

		macro_rules! d0 {
			($x:expr) => {buffer[d0_offs + ($x as usize)]}
		}
		macro_rules! d1 {
			($x:expr) => {buffer[d1_offs + ($x as usize)]}
		}
		macro_rules! d2 {
			($x:expr) => {buffer[d2_offs + ($x as usize)]}
		}
		macro_rules! d3 {
			($x:expr) => {buffer[d3_offs + ($x as usize)]}
		}

		macro_rules! b {
			($x:expr) => {b[b_offs + ($x as usize)]}
		}
		macro_rules! e {
			($x:expr) => {buf2[e_offs + ($x as usize)]}
		}

		loop {
			let mut p3 =  e![6]*b![7] - e![7]*b![6];
			let mut p2 = -e![6]*b![6] - e![7]*b![7];

			d0![0] =   p3;
			d1![3] = - p3;
			d2![0] =   p2;
			d3![3] =   p2;

			let mut p1 =  e![4]*b![5] - e![5]*b![4];
			let mut p0 = -e![4]*b![4] - e![5]*b![5];

			d0![1] =   p1;
			d1![2] = - p1;
			d2![1] =   p0;
			d3![2] =   p0;

			p3 =  e![2]*b![3] - e![3]*b![2];
			p2 = -e![2]*b![2] - e![3]*b![3];

			d0![2] =   p3;
			d1![1] = - p3;
			d2![2] =   p2;
			d3![1] =   p2;

			p1 =  e![0]*b![1] - e![1]*b![0];
			p0 = -e![0]*b![0] - e![1]*b![1];

			d0![3] =   p1;
			d1![0] = - p1;
			d2![3] =   p0;
			d3![0] =   p0;

			break_if_sub_overflows!(e_offs, 8);
			b_offs -= 8;
			d0_offs += 4;
			d2_offs += 4;
			d1_offs -= 4;
			d3_offs -= 4;
		}
	}
}
660
/// Computes the inverse MDCT of `buffer`, in place — naive version.
///
/// Straightforward translation of the algorithm's pseudocode from the
/// paper (with the paper bugs fixed, see the comments below), kept for
/// testing against the optimized `inverse_mdct`. The fixed-size scratch
/// arrays of `1 << 13` elements bound the supported buffer length —
/// NOTE(review): presumably callers never exceed that; verify.
#[allow(dead_code)]
pub fn inverse_mdct_naive(cached_bd :&CachedBlocksizeDerived, buffer :&mut[f32]) {
	let n = buffer.len();
	let n2 = n >> 1;
	let n4 = n >> 2;
	let n8 = n >> 3;
	let n3_4 = n - n4;

	// Scratch buffers, sized for the largest supported blocksize.
	let mut u = [0.0; 1 << 13];
	let mut xa = [0.0; 1 << 13];
	let mut v = [0.0; 1 << 13];
	let mut w = [0.0; 1 << 13];

	// retrieve the cached twiddle factors
	let ctf = &cached_bd.twiddle_factors;
	let a :&[f32] = &ctf.a;
	let b :&[f32] = &ctf.b;
	let c :&[f32] = &ctf.c;

	// IMDCT algorithm from "The use of multirate filter banks for coding of high quality digital audio"
	// Note there are bugs in that pseudocode, presumably due to them attempting
	// to rename the arrays nicely rather than representing the way their actual
	// implementation bounces buffers back and forth. As a result, even in the
	// "some formulars corrected" version, a direct implementation fails. These
	// are noted below as "paper bug".

	// copy and reflect spectral data
	for k in 0 .. n2 {
		u[k] = buffer[k];
	}
	for k in n2 .. n {
		u[k] = -buffer[n - k - 1];
	}

	let mut k2 = 0;
	let mut k4 = 0;

	// kernel from paper
	// step 1
	while k2 < n2 { // n4 iterations
		v[n-k4-1] = (u[k4] - u[n-k4-1]) * a[k2]   - (u[k4+2] - u[n-k4-3])*a[k2+1];
		v[n-k4-3] = (u[k4] - u[n-k4-1]) * a[k2+1] + (u[k4+2] - u[n-k4-3])*a[k2];
		k2 += 2;
		k4 += 4;
	}
	// step 2
	k4 = 0;
	while k4 < n2 { // n8 iterations
		w[n2+3+k4] = v[n2+3+k4] + v[k4+3];
		w[n2+1+k4] = v[n2+1+k4] + v[k4+1];
		w[k4+3]    = (v[n2+3+k4] - v[k4+3])*a[n2-4-k4] - (v[n2+1+k4]-v[k4+1])*a[n2-3-k4];
		w[k4+1]    = (v[n2+1+k4] - v[k4+1])*a[n2-4-k4] + (v[n2+3+k4]-v[k4+3])*a[n2-3-k4];
		k4 += 4;
	}

	// step 3
	// ld = log2(n); ld-3 butterfly iterations in total.
	let ld :usize = (::ilog(n as u64) - 1) as usize;
	for l in 0 .. ld - 3 {
		let k0 = n >> (l+2);
		let k1 = 1 << (l+3);
		let rlim = n >> (l+4);
		let slim = 1 << (l+1);
		let mut r4 = 0;
		for r in 0 .. rlim {
			let mut s2 = 0;
			for _ in 0 .. slim {
				u[n-1-k0*s2-r4] = w[n-1-k0*s2-r4] + w[n-1-k0*(s2+1)-r4];
				u[n-3-k0*s2-r4] = w[n-3-k0*s2-r4] + w[n-3-k0*(s2+1)-r4];
				u[n-1-k0*(s2+1)-r4] = (w[n-1-k0*s2-r4] - w[n-1-k0*(s2+1)-r4]) * a[r*k1]
					- (w[n-3-k0*s2-r4] - w[n-3-k0*(s2+1)-r4]) * a[r*k1+1];
				u[n-3-k0*(s2+1)-r4] = (w[n-3-k0*s2-r4] - w[n-3-k0*(s2+1)-r4]) * a[r*k1]
					+ (w[n-1-k0*s2-r4] - w[n-1-k0*(s2+1)-r4]) * a[r*k1+1];
				s2 += 2;
			}
			r4 += 4;
		}
		if l+1 < ld-3 {
			// paper bug: ping-ponging of u&w here is omitted
			w.copy_from_slice(&u);
		}
	}

	// step 4: bit-reverse permutation of the n8 8-element groups
	for i in 0 .. n8 {
		let j = (::bit_reverse(i as u32) >> (32-ld+3)) as usize;
		assert!(j < n8);
		if i == j {
			// paper bug: original code probably swapped in place; if copying,
			//            need to directly copy in this case
			let ii = i << 3;
			v[ii+1] = u[ii+1];
			v[ii+3] = u[ii+3];
			v[ii+5] = u[ii+5];
			v[ii+7] = u[ii+7];
		} else if i < j {
			// Swap the groups at i and j; only handled once (i < j).
			let ii = i << 3;
			let j8 = j << 3;
			v[j8+1] = u[ii+1];
			v[ii+1] = u[j8 + 1];
			v[j8+3] = u[ii+3];
			v[ii+3] = u[j8 + 3];
			v[j8+5] = u[ii+5];
			v[ii+5] = u[j8 + 5];
			v[j8+7] = u[ii+7];
			v[ii+7] = u[j8 + 7];
		}
	}

	// step 5: keep only the odd-indexed elements
	for k in 0 .. n2 {
		w[k] = v[k*2+1];
	}
	// step 6
	let mut k2 = 0;
	let mut k4 = 0;
	while k2 < n4 { // n8 iterations
		u[n-1-k2] = w[k4];
		u[n-2-k2] = w[k4+1];
		u[n3_4 - 1 - k2] = w[k4+2];
		u[n3_4 - 2 - k2] = w[k4+3];
		k2 += 2;
		k4 += 4;
	}
	// step 7
	k2 = 0;
	while k2 < n4 { // n8 iterations
		v[n2 + k2 ] = ( u[n2 + k2] + u[n-2-k2] + c[k2+1]*(u[n2+k2]-u[n-2-k2]) + c[k2]*(u[n2+k2+1]+u[n-2-k2+1]))/2.0;
		v[n-2 - k2] = ( u[n2 + k2] + u[n-2-k2] - c[k2+1]*(u[n2+k2]-u[n-2-k2]) - c[k2]*(u[n2+k2+1]+u[n-2-k2+1]))/2.0;
		v[n2+1+ k2] = ( u[n2+1+k2] - u[n-1-k2] + c[k2+1]*(u[n2+1+k2]+u[n-1-k2]) - c[k2]*(u[n2+k2]-u[n-2-k2]))/2.0;
		v[n-1 - k2] = (-u[n2+1+k2] + u[n-1-k2] + c[k2+1]*(u[n2+1+k2]+u[n-1-k2]) - c[k2]*(u[n2+k2]-u[n-2-k2]))/2.0;
		k2 += 2;
	}
	// step 8
	k2 = 0;
	for k in 0 .. n4 {
		xa[k]      = v[k2+n2]*b[k2  ] + v[k2+1+n2]*b[k2+1];
		xa[n2-1-k] = v[k2+n2]*b[k2+1] - v[k2+1+n2]*b[k2  ];
		k2 += 2;
	}

	// decode kernel to output
	// (the final reflection/negation that maps the n2 transform
	// outputs onto the n output samples)

	for i in 0 .. n4 {
		buffer[i] = xa[i + n4];
	}
	for i in n4 .. n3_4 {
		buffer[i] = -xa[n3_4 - i - 1];
	}
	for i in n3_4 .. n {
		buffer[i] = -xa[i - n3_4];
	}
}
813
#[cfg(test)]
#[test]
fn test_imdct_naive() {
	use imdct_test::*;
	// Run the naive IMDCT on reference input and fuzzily compare the
	// result against the precomputed reference output.
	let mut arr_1 = imdct_prepare(&IMDCT_INPUT_TEST_ARR_1);
	let cbd = CachedBlocksizeDerived::from_blocksize(8);
	inverse_mdct_naive(&cbd, &mut arr_1);
	let mismatches = fuzzy_compare_array(
		&arr_1, &IMDCT_OUTPUT_TEST_ARR_1,
		0.00005, true);
	let mismatches_limit = 0;
	if mismatches > mismatches_limit {
		// Fixed typo in the panic message ("Numer" -> "Number").
		panic!("Number of mismatches {} was larger than limit of {}",
			mismatches, mismatches_limit);
	}
}
830
#[cfg(test)]
#[test]
fn test_imdct() {
	use imdct_test::*;
	// Run the optimized IMDCT on reference input and fuzzily compare
	// the result against the precomputed reference output.
	let mut arr_1 = imdct_prepare(&IMDCT_INPUT_TEST_ARR_1);
	let blocksize = 8;
	let cbd = CachedBlocksizeDerived::from_blocksize(blocksize);
	inverse_mdct(&cbd, &mut arr_1, blocksize);
	let mismatches = fuzzy_compare_array(
		&arr_1, &IMDCT_OUTPUT_TEST_ARR_1,
		0.00005, true);
	let mismatches_limit = 0;
	if mismatches > mismatches_limit {
		// Fixed typo in the panic message ("Numer" -> "Number").
		panic!("Number of mismatches {} was larger than limit of {}",
			mismatches, mismatches_limit);
	}
}