summaryrefslogtreecommitdiffstats
path: root/meta-oe/recipes-multimedia/libmad/files/libmad-0.15.1b-avr32-optimization.patch
diff options
context:
space:
mode:
Diffstat (limited to 'meta-oe/recipes-multimedia/libmad/files/libmad-0.15.1b-avr32-optimization.patch')
-rw-r--r--meta-oe/recipes-multimedia/libmad/files/libmad-0.15.1b-avr32-optimization.patch2922
1 files changed, 2922 insertions, 0 deletions
diff --git a/meta-oe/recipes-multimedia/libmad/files/libmad-0.15.1b-avr32-optimization.patch b/meta-oe/recipes-multimedia/libmad/files/libmad-0.15.1b-avr32-optimization.patch
new file mode 100644
index 0000000000..b74eea3225
--- /dev/null
+++ b/meta-oe/recipes-multimedia/libmad/files/libmad-0.15.1b-avr32-optimization.patch
@@ -0,0 +1,2922 @@
1diff --git a/bit.c b/bit.c
2index c2bfb24..262ce3a 100644
3--- a/bit.c
4+++ b/bit.c
5@@ -25,12 +25,6 @@
6
7 # include "global.h"
8
9-# ifdef HAVE_LIMITS_H
10-# include <limits.h>
11-# else
12-# define CHAR_BIT 8
13-# endif
14-
15 # include "bit.h"
16
17 /*
18@@ -81,6 +75,8 @@ unsigned short const crc_table[256] = {
19
20 # define CRC_POLY 0x8005
21
22+#ifndef FPM_AVR32
23+
24 /*
25 * NAME: bit->init()
26 * DESCRIPTION: initialize bit pointer struct
27@@ -190,6 +186,8 @@ void mad_bit_write(struct mad_bitptr *bitptr, unsigned int len,
28 }
29 # endif
30
31+#endif
32+
33 /*
34 * NAME: bit->crc()
35 * DESCRIPTION: compute CRC-check word
36diff --git a/bit.h b/bit.h
37index 5a51570..70f550a 100644
38--- a/bit.h
39+++ b/bit.h
40@@ -22,6 +22,92 @@
41 # ifndef LIBMAD_BIT_H
42 # define LIBMAD_BIT_H
43
44+# ifdef HAVE_LIMITS_H
45+# include <limits.h>
46+# else
47+# define CHAR_BIT 8
48+# endif
49+
50+#ifdef FPM_AVR32
51+
52+struct mad_bitptr {
53+ unsigned char const *byte; /* byte the next bits are taken from */
54+ unsigned int read_bytes; /* despite the name: number of BITS already consumed in *byte; skip/read mask it to 0..7 */
55+};
56+
57+/*
58+ * NAME: bit->init()
59+ * DESCRIPTION: initialize bit pointer struct
60+ */
61+static void mad_bit_init(struct mad_bitptr *bitptr, unsigned char const *byte)
62+{
63+ bitptr->byte = byte; /* start reading at the given byte... */
64+ bitptr->read_bytes = 0; /* ...with none of its bits consumed yet */
65+}
66+
67+/*
68+ * NAME: bit->length()
69+ * DESCRIPTION: return number of bits between start and end points
70+ */
71+static unsigned int mad_bit_length(struct mad_bitptr const *begin,
72+ struct mad_bitptr const *end)
73+{
74+ return (end->read_bytes - begin->read_bytes) + /* in-byte bit offset delta (unsigned arithmetic, may wrap before the sum) */
75+ 8 * (end->byte - begin->byte); /* whole bytes between the two pointers, expressed in bits */
76+}
77+
78+/*
79+ * NAME: bit->nextbyte()
80+ * DESCRIPTION: return pointer to next unprocessed byte
81+ */
82+static unsigned char const *mad_bit_nextbyte(struct mad_bitptr const *bitptr)
83+{
84+ return bitptr->byte + ((bitptr->read_bytes + 0x7) >> 3); /* rounds up: a partly consumed byte is stepped over */
85+}
86+
87+/*
88+ * NAME: bit->skip()
89+ * DESCRIPTION: advance bit pointer
90+ */
91+static void mad_bit_skip(struct mad_bitptr *bitptr, unsigned int len)
92+{
93+ bitptr->read_bytes += len; /* total bit offset measured from *byte */
94+ bitptr->byte += (bitptr->read_bytes >> 3); /* advance by the whole bytes that offset covers */
95+ bitptr->read_bytes &= 0x7; /* keep only the leftover bit offset */
96+}
97+
98+/*
99+ * NAME: bit->read()
100+ * DESCRIPTION: read up to 32 bits (bit offset + len must fit in one 32-bit word) and return their UIMSBF value
101+ */
102+static unsigned long mad_bit_read(struct mad_bitptr *bitptr, unsigned int len)
103+{
104+ register unsigned long value;
105+
106+ if (!len) /* len == 0 would otherwise reach the ">> (32 - len)" below as a shift by 32 */
107+ return 0;
108+
109+ value = *(unsigned int *)bitptr->byte; /* single word load at the byte pointer; NOTE(review): cast drops const and presumably relies on a big-endian target that tolerates unaligned loads -- confirm */
110+
111+ value <<= bitptr->read_bytes; /* push out the bits already consumed in the first byte */
112+ value >>= (32 - len); /* right-align the next len bits; NOTE(review): assumes 32-bit unsigned long, and read_bytes + len > 32 would lose bits */
113+
114+ bitptr->read_bytes += len; /* same pointer update as mad_bit_skip() */
115+ bitptr->byte += (bitptr->read_bytes >> 3);
116+ bitptr->read_bytes &= 0x7;
117+
118+ return value;
119+}
120+
121+# define mad_bit_finish(bitptr) /* nothing */
122+
123+static unsigned long mad_bit_bitsleft(struct mad_bitptr *bitptr)
124+{
125+ return (8 - (bitptr)->read_bytes); /* bits not yet consumed in the current byte; 8 when read_bytes is 0 */
126+}
127+
128+#else /* #ifdef FPM_AVR32 */
129+
130 struct mad_bitptr {
131 unsigned char const *byte;
132 unsigned short cache;
133@@ -42,6 +128,8 @@ void mad_bit_skip(struct mad_bitptr *, unsigned int);
134 unsigned long mad_bit_read(struct mad_bitptr *, unsigned int);
135 void mad_bit_write(struct mad_bitptr *, unsigned int, unsigned long);
136
137+#endif
138+
139 unsigned short mad_bit_crc(struct mad_bitptr, unsigned int, unsigned short);
140
141 # endif
142diff --git a/configure.ac b/configure.ac
143index 9b79399..063cb9b 100644
144--- a/configure.ac
145+++ b/configure.ac
146@@ -274,13 +274,14 @@ fi
147 AC_MSG_CHECKING(for architecture-specific fixed-point math routines)
148 AC_ARG_ENABLE(fpm, AC_HELP_STRING([--enable-fpm=ARCH],
149 [use ARCH-specific fixed-point math routines
150- (one of: intel, arm, mips, sparc, ppc, 64bit, default)]),
151+ (one of: intel, arm, avr32, mips, sparc, ppc, 64bit, default)]),
152 [
153 case "$enableval" in
154 yes) ;;
155 no|default|approx) FPM="DEFAULT" ;;
156 intel|i?86) FPM="INTEL" ;;
157 arm) FPM="ARM" ;;
158+ avr32) FPM="AVR32" ;;
159 mips) FPM="MIPS" ;;
160 sparc) FPM="SPARC" ;;
161 ppc|powerpc) FPM="PPC" ;;
162@@ -298,6 +299,7 @@ then
163 case "$host" in
164 i?86-*) FPM="INTEL" ;;
165 arm*-*) FPM="ARM" ;;
166+ avr32*-*) FPM="AVR32" ;;
167 mips*-*) FPM="MIPS" ;;
168 sparc*-*) FPM="SPARC" ;;
169 powerpc*-*) FPM="PPC" ;;
170@@ -343,6 +345,11 @@ then
171 ASO="$ASO -DASO_IMDCT"
172 ASO_OBJS="imdct_l_arm.lo"
173 ;;
174+ avr32*-*)
175+ ASO="$ASO -DASO_INTERLEAVE2"
176+ ASO="$ASO -DASO_ZEROCHECK"
177+ ASO_OBJS="dct32_avr32.lo synth_avr32.lo imdct_avr32.lo"
178+ ;;
179 mips*-*)
180 ASO="$ASO -DASO_INTERLEAVE2"
181 ASO="$ASO -DASO_ZEROCHECK"
182diff --git a/configure b/configure
183index ee421cc..7a9f0c8 100755
184--- a/configure
185+++ b/configure
186@@ -1048,7 +1048,7 @@ Optional Features:
187 --enable-speed optimize for speed over accuracy
188 --enable-accuracy optimize for accuracy over speed
189 --enable-fpm=ARCH use ARCH-specific fixed-point math routines (one of:
190- intel, arm, mips, sparc, ppc, 64bit, default)
191+ intel, arm, avr32, mips, sparc, ppc, 64bit, default)
192 --enable-sso use subband synthesis optimization
193 --disable-aso disable architecture-specific optimizations
194 --enable-strict-iso use strict ISO/IEC interpretations
195@@ -21477,6 +21477,7 @@ if test "${enable_fpm+set}" = set; then
196 no|default|approx) FPM="DEFAULT" ;;
197 intel|i?86) FPM="INTEL" ;;
198 arm) FPM="ARM" ;;
199+ avr32) FPM="AVR32" ;;
200 mips) FPM="MIPS" ;;
201 sparc) FPM="SPARC" ;;
202 ppc|powerpc) FPM="PPC" ;;
203@@ -21498,6 +21499,7 @@ then
204 case "$host" in
205 i?86-*) FPM="INTEL" ;;
206 arm*-*) FPM="ARM" ;;
207+ avr32*-*) FPM="AVR32" ;;
208 mips*-*) FPM="MIPS" ;;
209 sparc*-*) FPM="SPARC" ;;
210 powerpc*-*) FPM="PPC" ;;
211@@ -21554,6 +21556,11 @@ then
212 ASO="$ASO -DASO_IMDCT"
213 ASO_OBJS="imdct_l_arm.lo"
214 ;;
215+ avr32*-*)
216+ ASO="$ASO -DASO_INTERLEAVE2"
217+ ASO="$ASO -DASO_ZEROCHECK"
218+ ASO_OBJS="dct32_avr32.lo synth_avr32.lo imdct_avr32.lo"
219+ ;;
220 mips*-*)
221 ASO="$ASO -DASO_INTERLEAVE2"
222 ASO="$ASO -DASO_ZEROCHECK"
223diff --git a/dct32_avr32.S b/dct32_avr32.S
224new file mode 100644
225index 0000000..7513340
226--- /dev/null
227+++ b/dct32_avr32.S
228@@ -0,0 +1,780 @@
229+/*
230+ Optimized 32-point Discrete Cosine Transform (DCT)
231+ Copyright 2003-2006 Atmel Corporation.
232+
233+ Written by Ronny Pedersen, Atmel Norway
234+
235+ This program is free software; you can redistribute it and/or modify
236+ it under the terms of the GNU General Public License as published by
237+ the Free Software Foundation; either version 2 of the License, or
238+ (at your option) any later version.
239+
240+ This program is distributed in the hope that it will be useful,
241+ but WITHOUT ANY WARRANTY; without even the implied warranty of
242+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
243+ GNU General Public License for more details.
244+
245+ You should have received a copy of the GNU General Public License
246+ along with this program; if not, write to the Free Software
247+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
248+
249+#define SHIFT 12
250+#define MAD_F_SCALEBITS 28
251+#define SLOTS 8
252+
253+#define MAD_F(x) ((x + (1 << 15)) >> 16)
254+
255+# define costab1 MAD_F(0x7fd8878e)
256+# define costab2 MAD_F(0x7f62368f)
257+# define costab3 MAD_F(0x7e9d55fc)
258+# define costab4 MAD_F(0x7d8a5f40)
259+# define costab5 MAD_F(0x7c29fbee)
260+# define costab6 MAD_F(0x7a7d055b)
261+# define costab7 MAD_F(0x78848414)
262+# define costab8 MAD_F(0x7641af3d)
263+# define costab9 MAD_F(0x73b5ebd1)
264+# define costab10 MAD_F(0x70e2cbc6)
265+# define costab11 MAD_F(0x6dca0d14)
266+# define costab12 MAD_F(0x6a6d98a4)
267+# define costab13 MAD_F(0x66cf8120)
268+# define costab14 MAD_F(0x62f201ac)
269+# define costab15 MAD_F(0x5ed77c8a)
270+# define costab16 MAD_F(0x5a82799a)
271+# define costab17 MAD_F(0x55f5a4d2)
272+# define costab18 MAD_F(0x5133cc94)
273+# define costab19 MAD_F(0x4c3fdff4)
274+# define costab20 MAD_F(0x471cece7)
275+# define costab21 MAD_F(0x41ce1e65)
276+# define costab22 MAD_F(0x3c56ba70)
277+# define costab23 MAD_F(0x36ba2014)
278+# define costab24 MAD_F(0x30fbc54d)
279+# define costab25 MAD_F(0x2b1f34eb)
280+# define costab26 MAD_F(0x25280c5e)
281+# define costab27 MAD_F(0x1f19f97b)
282+# define costab28 MAD_F(0x18f8b83c)
283+# define costab29 MAD_F(0x12c8106f)
284+# define costab30 MAD_F(0x0c8bd35e)
285+# define costab31 MAD_F(0x0647d97c)
286+
287+
288+ .macro butterfly2_in out1, out2, out3, out4, in, idx_in1, idx_in2, idx_in3, idx_in4, coeff1, coeff2, tmplo, tmphi /* load two input pairs, then per pair produce the sum and the coefficient-scaled difference */
289+ mov \tmplo, \coeff1 /* coefficient for the first pair */
290+ ld.w \out1, \in[\idx_in1 * 4] /* indices are multiplied by 4 to form the byte offset of each word load */
291+ ld.w \out2, \in[\idx_in2 * 4]
292+ ld.w \out3, \in[\idx_in3 * 4]
293+ ld.w \out4, \in[\idx_in4 * 4]
294+ sub \tmphi, \out1, \out2 /* first pair: difference */
295+ add \out1, \out2 /* out1 = sum of the first pair */
296+ mulsatrndwh.w \out2, \tmphi, \tmplo:b /* out2 = difference * coeff1 (coefficient taken from the bottom halfword of tmplo) */
297+
298+ sub \tmphi, \out3, \out4 /* second pair: difference */
299+ mov \tmplo, \coeff2 /* coefficient for the second pair */
300+ add \out3, \out4 /* out3 = sum of the second pair */
301+ mulsatrndwh.w \out4, \tmphi, \tmplo:b /* out4 = difference * coeff2 */
302+ .endm
303+
304+ .macro butterfly2 in1, in2, in3, in4, coeff1, tmplo, tmphi, tmp /* two in-place butterflies sharing one coefficient (tmplo is accepted but not used in the body) */
305+ mov \tmp, \coeff1 /* shared coefficient */
306+ sub \tmphi, \in1, \in2 /* first pair: difference */
307+ add \in1, \in2 /* in1 = sum */
308+ mulsatrndwh.w \in2, \tmphi, \tmp:b /* in2 = difference * coeff1 */
309+
310+ sub \tmphi, \in3, \in4 /* second pair: difference */
311+ add \in3, \in4 /* in3 = sum */
312+ mulsatrndwh.w \in4, \tmphi, \tmp:b /* in4 = difference * coeff1 */
313+ .endm
314+
315+ .macro butterfly4 in1, in2, in3, in4, in5, in6, in7, in8, coeff1, tmplo, tmphi, tmp /* four in-place butterflies sharing one coefficient (tmplo is accepted but not used in the body) */
316+ mov \tmp, \coeff1 /* shared coefficient */
317+ sub \tmphi, \in1, \in2 /* pair 1: difference */
318+ add \in1, \in2 /* in1 = sum */
319+ mulsatrndwh.w \in2, \tmphi, \tmp:b /* in2 = difference * coeff1 */
320+
321+ sub \tmphi, \in3, \in4 /* pair 2: difference */
322+ add \in3, \in4 /* in3 = sum */
323+ mulsatrndwh.w \in4, \tmphi, \tmp:b /* in4 = difference * coeff1 */
324+
325+ sub \tmphi, \in5, \in6 /* pair 3: difference */
326+ add \in5, \in6 /* in5 = sum */
327+ mulsatrndwh.w \in6, \tmphi, \tmp:b /* in6 = difference * coeff1 */
328+
329+ sub \tmphi, \in7, \in8 /* pair 4: difference */
330+ add \in7, \in8 /* in7 = sum */
331+ mulsatrndwh.w \in8, \tmphi, \tmp:b /* in8 = difference * coeff1 */
332+ .endm
333+
334+ .macro scale reg /* empty body: every "scale rN" in dct32_avr32 assembles to nothing */
335+ .endm
336+
337+/*void dct32( mad_fixed_t const in[32], unsigned int slot,
338+ mad_fixed_t lo[16][8], mad_fixed_t hi[16][8]) */
339+
340+ .global dct32_avr32
341+dct32_avr32:
342+ stm --sp, r0-r7, r9-r11, lr
343+
344+ sub sp, 32*4
345+
346+/* t0 = in[0] + in[31]; t16 = MUL(in[0] - in[31], costab1);
347+ t1 = in[15] + in[16]; t17 = MUL(in[15] - in[16], costab31); */
348+ butterfly2_in r4/*t0*/, r5/*t16*/, r6/*t1*/, r7/*t17*/, r12, 0, 31, 15, 16, costab1, costab31, r10, r11
349+
350+/* t41 = t16 + t17;
351+ t59 = MUL(t16 - t17, costab2);
352+ t33 = t0 + t1;
353+ t50 = MUL(t0 - t1, costab2);*/
354+ butterfly2 r5/*t41*/, r7/*t59*/, r4/*t33*/, r6/*t50*/, costab2, r10, r11, lr
355+
356+/* t2 = in[7] + in[24]; t18 = MUL(in[7] - in[24], costab15);
357+ t3 = in[8] + in[23]; t19 = MUL(in[8] - in[23], costab17); */
358+ butterfly2_in r0/*t2*/, r1/*t18*/, r2/*t3*/, r3/*t19*/, r12, 7, 24, 8, 23, costab15, costab17, r10, r11
359+
360+/* t42 = t18 + t19;
361+ t60 = MUL(t18 - t19, costab30);
362+ t34 = t2 + t3;
363+ t51 = MUL(t2 - t3, costab30); */
364+ butterfly2 r1/*t42*/, r3/*t60*/, r0/*t34*/, r2/*t51*/, costab30, r10, r11, lr
365+
366+/* t73 = t41 + t42; t94 = MUL(t41 - t42, costab4);
367+ t83 = t59 + t60; t106 = MUL(t59 - t60, costab4); */
368+
369+
370+/* t69 = t33 + t34; t89 = MUL(t33 - t34, costab4);
371+ t78 = t50 + t51; t100 = MUL(t50 - t51, costab4); */
372+ butterfly4 r5/*t73*/, r1/*t94*/, r7/*t83*/, r3/*t106*/,r4/*t69*/, r0/*t89*/, r6/*t78*/, r2/*t100*/, costab4, r10, r11, lr
373+
374+/* Store away the computed butterflies:
375+ sp[0-7] = t83, t78, t73, t69, t106, t100, t94, t89 */
376+ stm sp, r0-r7
377+
378+
379+/* t4 = in[3] + in[28]; t20 = MUL(in[3] - in[28], costab7);
380+ t5 = in[12] + in[19]; t21 = MUL(in[12] - in[19], costab25); */
381+ butterfly2_in r4/*t4*/, r5/*t20*/, r6/*t5*/, r7/*t21*/, r12, 3, 28, 12, 19, costab7, costab25, r10, r11
382+
383+/* t43 = t20 + t21;
384+ t61 = MUL(t20 - t21, costab14);
385+ t35 = t4 + t5;
386+ t52 = MUL(t4 - t5, costab14); */
387+ butterfly2 r5/*t43*/, r7/*t61*/, r4/*t35*/, r6/*t52*/, costab14, r10, r11, lr
388+
389+/* t6 = in[4] + in[27]; t22 = MUL(in[4] - in[27], costab9);
390+ t7 = in[11] + in[20]; t23 = MUL(in[11] - in[20], costab23); */
391+ butterfly2_in r0/*t6*/, r1/*t22*/, r2/*t7*/, r3/*t23*/, r12, 4, 27, 11, 20, costab9, costab23, r10, r11
392+
393+/* t44 = t22 + t23;
394+ t62 = MUL(t22 - t23, costab18);
395+ t36 = t6 + t7;
396+ t53 = MUL(t6 - t7, costab18); */
397+ butterfly2 r1/*t44*/, r3/*t62*/, r0/*t36*/, r2/*t53*/, costab18, r10, r11, lr
398+
399+/* t74 = t43 + t44; t95 = MUL(t43 - t44, costab28);
400+ t84 = t61 + t62; t107 = MUL(t61 - t62, costab28); */
401+
402+/* t70 = t35 + t36; t90 = MUL(t35 - t36, costab28);
403+ t79 = t52 + t53; t101 = MUL(t52 - t53, costab28); */
404+ butterfly4 r5/*t74*/, r1/*t95*/, r7/*t84*/, r3/*t107*/, r4/*t70*/, r0/*t90*/, r6/*t79*/, r2/*t101*/, costab28, r10, r11, lr
405+
406+/* Store away the computed butterflies:
407+ sp[8-15] = t84, t79, t74, t70, t107, t101, t95, t90 */
408+ sub r10, sp, -8*4
409+ stm r10, r0-r7
410+
411+
412+/* t8 = in[1] + in[30]; t24 = MUL(in[1] - in[30], costab3);
413+ t9 = in[14] + in[17]; t25 = MUL(in[14] - in[17], costab29); */
414+ butterfly2_in r4/*t8*/, r5/*t24*/, r6/*t9*/, r7/*t25*/, r12, 1, 30, 14, 17, costab3, costab29, r10, r11
415+
416+
417+/* t45 = t24 + t25;
418+ t63 = MUL(t24 - t25, costab6);
419+ t37 = t8 + t9;
420+ t54 = MUL(t8 - t9, costab6); */
421+ butterfly2 r5/*t45*/, r7/*t63*/, r4/*t37*/, r6/*t54*/, costab6, r10, r11, lr
422+
423+/* t10 = in[6] + in[25]; t26 = MUL(in[6] - in[25], costab13);
424+ t11 = in[9] + in[22]; t27 = MUL(in[9] - in[22], costab19); */
425+ butterfly2_in r0/*t10*/, r1/*t26*/, r2/*t11*/, r3/*t27*/, r12, 6, 25, 9, 22, costab13, costab19, r10, r11
426+
427+/* t46 = t26 + t27;
428+ t64 = MUL(t26 - t27, costab26);
429+ t38 = t10 + t11;
430+ t55 = MUL(t10 - t11, costab26); */
431+ butterfly2 r1/*t46*/, r3/*t64*/, r0/*t38*/, r2/*t55*/, costab26, r10, r11, lr
432+
433+/* t75 = t45 + t46; t96 = MUL(t45 - t46, costab12);
434+ t85 = t63 + t64; t108 = MUL(t63 - t64, costab12); */
435+
436+/* t71 = t37 + t38; t91 = MUL(t37 - t38, costab12);
437+ t80 = t54 + t55; t102 = MUL(t54 - t55, costab12); */
438+ butterfly4 r5/*t75*/, r1/*t96*/, r7/*t85*/, r3/*t108*/, r4/*t71*/, r0/*t91*/, r6/*t80*/, r2/*t102*/, costab12, r10, r11, lr
439+
440+/* Store away the computed butterflies:
441+ sp[16-23] = t85, t80, t75, t71, t108, t102, t96, t91 */
442+ sub r10, sp, -16*4
443+ stm r10, r0-r7
444+
445+/* t12 = in[2] + in[29]; t28 = MUL(in[2] - in[29], costab5);
446+ t13 = in[13] + in[18]; t29 = MUL(in[13] - in[18], costab27); */
447+ butterfly2_in r4/*t12*/, r5/*t28*/, r6/*t13*/, r7/*t29*/, r12, 2, 29, 13, 18, costab5, costab27, r10, r11
448+
449+/* t47 = t28 + t29;
450+ t65 = MUL(t28 - t29, costab10);
451+ t39 = t12 + t13;
452+ t56 = MUL(t12 - t13, costab10); */
453+ butterfly2 r5/*t47*/, r7/*t65*/, r4/*t39*/, r6/*t56*/, costab10, r10, r11, lr
454+
455+/* t14 = in[5] + in[26]; t30 = MUL(in[5] - in[26], costab11);
456+ t15 = in[10] + in[21]; t31 = MUL(in[10] - in[21], costab21);*/
457+ butterfly2_in r0/*t14*/, r1/*t30*/, r2/*t15*/, r3/*t31*/, r12, 5, 26, 10, 21, costab11, costab21, r10, r11
458+
459+/* t48 = t30 + t31;
460+ t66 = MUL(t30 - t31, costab22);
461+ t40 = t14 + t15;
462+ t57 = MUL(t14 - t15, costab22);*/
463+ butterfly2 r1/*t48*/, r3/*t66*/, r0/*t40*/, r2/*t57*/, costab22, r10, r11, lr
464+
465+/* t76 = t47 + t48; t97 = MUL(t47 - t48, costab20);
466+ t86 = t65 + t66; t109 = MUL(t65 - t66, costab20);*/
467+
468+/* t72 = t39 + t40; t92 = MUL(t39 - t40, costab20);
469+ t81 = t56 + t57; t103 = MUL(t56 - t57, costab20);*/
470+ butterfly4 r5/*t76*/, r1/*t97*/, r7/*t86*/, r3/*t109*/,r4/*t72*/, r0/*t92*/, r6/*t81*/, r2/*t103*/, costab20, r10, r11, lr
471+
472+/* Store away the computed butterflies:
473+ sp[24-31] = t86, t81, t76, t72, t109, t103, t97, t92 */
474+ sub r10, sp, -24*4
475+ stm r10, r0-r7
476+
477+/* We now have the following on the stack:
478+
479+ sp[0-7] = t83, t78, t73, t69, t106, t100, t94, t89
480+ sp[8-15] = t84, t79, t74, t70, t107, t101, t95, t90
481+ sp[16-23] = t85, t80, t75, t71, t108, t102, t96, t91
482+ sp[24-31] = t86, t81, t76, t72, t109, t103, t97, t92 */
483+
484+/* Load {r0...r7} = { t72, t76, t71, t75, t70, t74, t69, t73 } */
485+ ld.d r6, sp[2*4]
486+ ld.d r4, sp[10*4]
487+ ld.d r2, sp[18*4]
488+ ld.d r0, sp[26*4]
489+
490+
491+/* t113 = t69 + t70;
492+ t141 = MUL(t69 - t70, costab8);
493+
494+ t115 = t73 + t74;
495+ t144 = MUL(t73 - t74, costab8); */
496+ butterfly2 r6/*t113*/, r4/*t141*/, r7/*t115*/, r5/*t144*/, costab8, r10, r11, lr
497+
498+/* t114 = t71 + t72;
499+ t142 = MUL(t71 - t72, costab24);
500+
501+ t116 = t75 + t76;
502+ t145 = MUL(t75 - t76, costab24); */
503+ butterfly2 r2/*t114*/, r0/*t142*/, r3/*t116*/, r1/*t145*/, costab24, r10, r11, lr
504+
505+
506+/*
507+ t191 = t113 + t114;
508+ t192 = MUL(t113 - t114, costab16)
509+
510+ t32 = t115 + t116;
511+ t177 = MUL(t115 - t116, costab16) ;
512+
513+ t143 = t141 + t142;
514+ t190 = MUL(t141 - t142, costab16) ;
515+
516+ t146 = t144 + t145;
517+ t184 = MUL(t144 - t145, costab16) ; */
518+ butterfly4 r6/*t191*/, r2/*t192*/, r7/*t32*/, r3/*t177*/, r4/*t143*/, r0/*t190*/, r5/*t146*/, r1/*t184*/, costab16, r10, r11, lr
519+
520+/* Store away the computed butterflies:
521+ sp[2-3] = t32, t191
522+ sp[10-11] = t146, t143
523+ sp[18-19] = t177, t192
524+ sp[26-27] = t184, t190 */
525+ st.d sp[2*4] , r6
526+ st.d sp[10*4], r4
527+ st.d sp[18*4], r2
528+ st.d sp[26*4], r0
529+
530+/* Load {r0...r7} = { t81, t86, t80, t85, t79, t84, t78, t83 } */
531+ ld.d r6, sp[0*4]
532+ ld.d r4, sp[8*4]
533+ ld.d r2, sp[16*4]
534+ ld.d r0, sp[24*4]
535+
536+
537+/* t118 = t78 + t79;
538+ t148 = MUL(t78 - t79, costab8);
539+
540+ t121 = t83 + t84;
541+ t152 = MUL(t83 - t84, costab8); */
542+ butterfly2 r6/*t118*/, r4/*t148*/, r7/*t121*/, r5/*t152*/, costab8, r10, r11, lr
543+
544+/* t119 = t80 + t81;
545+ t149 = MUL(t80 - t81, costab24);
546+
547+ t122 = t85 + t86;
548+ t153 = MUL(t85 - t86, costab24); */
549+ butterfly2 r2/*t119*/, r0/*t149*/, r3/*t122*/, r1/*t153*/, costab24, r10, r11, lr
550+
551+
552+
553+/* t58 = t118 + t119;
554+ t178 = MUL(t118 - t119, costab16) ;
555+
556+ t67 = t121 + t122;
557+ t179 = MUL(t121 - t122, costab16) ;
558+
559+ t150 = t148 + t149;
560+ t185 = MUL(t148 - t149, costab16) ;
561+
562+ t154 = t152 + t153;
563+ t186 = MUL(t152 - t153, costab16) ; */
564+ butterfly4 r6/*t58*/, r2/*t178*/, r7/*t67*/, r3/*t179*/, r4/*t150*/, r0/*t185*/, r5/*t154*/, r1/*t186*/, costab16, r10, r11, lr
565+
566+/* Store away the computed butterflies:
567+ sp[0-1] = t67, t58
568+ sp[8-9] = t154, t150
569+ sp[16-17] = t179, t178
570+ sp[24-25] = t186, t185 */
571+ st.d sp[0*4] , r6
572+ st.d sp[8*4], r4
573+ st.d sp[16*4], r2
574+ st.d sp[24*4], r0
575+
576+/* Load {r0...r7} = { t92, t97, t91, t96, t90, t95, t89, t94 } */
577+ ld.d r6, sp[6*4]
578+ ld.d r4, sp[14*4]
579+ ld.d r2, sp[22*4]
580+ ld.d r0, sp[30*4]
581+
582+
583+/* t125 = t89 + t90;
584+ t157 = MUL(t89 - t90, costab8);
585+
586+ t128 = t94 + t95;
587+ t161 = MUL(t94 - t95, costab8); */
588+ butterfly2 r6/*t125*/, r4/*t157*/, r7/*t128*/, r5/*t161*/, costab8, r10, r11, lr
589+
590+/* t126 = t91 + t92;
591+ t158 = MUL(t91 - t92, costab24);
592+
593+ t129 = t96 + t97;
594+ t162 = MUL(t96 - t97, costab24); */
595+ butterfly2 r2/*t126*/, r0/*t158*/, r3/*t129*/, r1/*t162*/, costab24, r10, r11, lr
596+
597+
598+/*
599+ t93 = t125 + t126;
600+ t180 = MUL(t125 - t126, costab16) ;
601+
602+ t98 = t128 + t129;
603+ t181 = MUL(t128 - t129, costab16) ;
604+
605+ t159 = t157 + t158;
606+ t187 = MUL(t157 - t158, costab16) ;
607+
608+ t163 = t161 + t162;
609+ t188 = MUL(t161 - t162, costab16) ; */
610+ butterfly4 r6/*t93*/, r2/*t180*/, r7/*t98*/, r3/*t181*/, r4/*t159*/, r0/*t187*/, r5/*t163*/, r1/*t188*/, costab16, r10, r11, lr
611+
612+
613+/* Store away the computed butterflies:
614+ sp[6-7] = t98, t93
615+ sp[14-15] = t163, t159
616+ sp[22-23] = t181, t180
617+ sp[30-31] = t188, t187 */
618+ st.d sp[6*4] , r6
619+ st.d sp[14*4], r4
620+ st.d sp[22*4], r2
621+ st.d sp[30*4], r0
622+
623+/* Load {r0...r7} = { t103, t109, t102, t108, t101, t107, t100, t106 } */
624+ ld.d r6, sp[4*4]
625+ ld.d r4, sp[12*4]
626+ ld.d r2, sp[20*4]
627+ ld.d r0, sp[28*4]
628+
629+
630+
631+/* t132 = t100 + t101;
632+ t166 = MUL(t100 - t101, costab8);
633+
634+ t136 = t106 + t107;
635+ t171 = MUL(t106 - t107, costab8); */
636+ butterfly2 r6/*t132*/, r4/*t166*/, r7/*t136*/, r5/*t171*/, costab8, r10, r11, lr
637+
638+/* t133 = t102 + t103;
639+ t167 = MUL(t102 - t103, costab24);
640+
641+ t137 = t108 + t109;
642+ t172 = MUL(t108 - t109, costab24);*/
643+ butterfly2 r2/*t133*/, r0/*t167*/, r3/*t137*/, r1/*t172*/, costab24, r10, r11, lr
644+
645+
646+/* t104 = t132 + t133;
647+ t182 = MUL(t132 - t133, costab16) ;
648+
649+ t110 = t136 + t137;
650+ t183 = MUL(t136 - t137, costab16) ;
651+
652+ t168 = t166 + t167;
653+ t189 = MUL(t166 - t167, costab16) ;
654+
655+ t173 = t171 + t172;
656+ t208 = MUL(t171 - t172, costab16) ; */
657+ butterfly4 r6/*t104*/, r2/*t182*/, r7/*t110*/, r3/*t183*/, r4/*t168*/, r0/*t189*/, r5/*t173*/, r1/*t208*/, costab16, r10, r11, lr
658+
659+/* Store away the computed butterflies:
660+ sp[4-5] = t110, t104
661+ sp[12-13] = t173, t168
662+ sp[20-21] = t183, t182
663+ sp[28-29] = t208, t189 */
664+ st.d sp[4*4] , r6
665+ st.d sp[12*4], r4
666+ st.d sp[20*4], r2
667+ st.d sp[28*4], r0
668+
669+/* Now we have the following stack
670+
671+ sp[0-7] = t67, t58 , t32, t191, t110, t104, t98, t93
672+ sp[8-15] = t154, t150, t146, t143, t173, t168, t163, t159
673+ sp[16-23] = t179, t178, t177, t192, t183, t182, t181, t180
674+ sp[24-31] = t186, t185, t184, t190, t208, t189, t188, t187
675+*/
676+
677+ /* Get slot, lo and hi from stack */
678+ lddsp lr, sp[32*4 + 4] /*slot*/
679+ lddsp r12, sp[32*4 + 8] /*lo*/
680+ lddsp r11, sp[32*4 + 12] /*hi*/
681+
682+ add r12, r12, lr << 2
683+ add r11, r11, lr << 2
684+
685+
686+/* t49 = -(t67 * 2) + t32;
687+ hi[14][slot] = SHIFT(t32);
688+ t87 = -(t110 * 2) + t67;
689+ t138 = -(t173 * 2) + t110;
690+ t203 = -(t208 * 2) + t173; */
691+
692+ lddsp r0/*t67*/, sp[0]
693+ lddsp r1/*t32*/, sp[2*4]
694+ lddsp r2/*t110*/, sp[4*4]
695+ lddsp r3/*t173*/, sp[12*4]
696+ lddsp r5/*t208*/, sp[28*4]
697+
698+ sub r4/*t49*/, r1, r0 << 1
699+ scale r1
700+ sub r0/*t87*/, r0, r2 << 1
701+ st.w r11[14*SLOTS*4], r1
702+ sub r2/*t138*/, r2, r3 << 1
703+ sub r1/*t203*/, r3, r5 << 1
704+
705+/* Live: r0 = t87, r1= t203, r2= t138, r4 = t49
706+ Free: r3, r5, r6, r7, r8, r9, r10, lr */
707+
708+/* t68 = (t98 * 2) + t49;
709+ hi[12][slot] = SHIFT(-t49);
710+ t130 = -(t163 * 2) + t98;
711+ t201 = -(t188 * 2) + t163;
712+ t200 = -(t186 * 2) + t154;
713+ t111 = (t154 * 2) + t87;
714+ t77 = -(-(t87 * 2) - t68);
715+ t88 = (t146 * 2) + t77;
716+ t199 = -(t184 * 2) + t146;
717+ hi[ 8][slot] = SHIFT(-t77);
718+ hi[10][slot] = SHIFT(t68);*/
719+ lddsp r3/*t98*/, sp[6*4]
720+ lddsp r5/*t163*/, sp[14*4]
721+ lddsp r6/*t188*/, sp[30*4]
722+ lddsp r10/*t186*/, sp[24*4]
723+
724+ add r7/*t68*/, r4, r3 << 1
725+ neg r4
726+ scale r4
727+ lddsp r9/*t154*/, sp[8*4]
728+ sub r3/*t130*/, r3, r5 << 1
729+ st.w r11[12*SLOTS*4], r4
730+ sub r8/*t201*/, r5, r6 << 1
731+ sub r4/*t200*/, r9, r10 << 1
732+ lddsp lr/*t146*/, sp[10*4]
733+ lddsp r6/*t184*/, sp[26*4]
734+ add r10/*t111*/, r0, r9 << 1
735+ add r5/*t77*/,r7, r0 << 1
736+ add r0/*t88*/, r5, lr << 1
737+ sub r6/*t199*/, lr, r6 << 1
738+ neg r5
739+ scale r5
740+ scale r7
741+ st.w r11[8*SLOTS*4], r5
742+ st.w r11[10*SLOTS*4], r7
743+
744+/* Live: r0 = t88, r1= t203, r2= t138, r3 = t130, r4 = t200,
745+ r6 = t199, r8 = t201, r10 = t111
746+ Free: r5, r7, r9, lr */
747+
748+
749+/*
750+ t123 = -(-(t138 * 2) - t111);
751+ t174 = (t183 * 2) + t138;
752+ t99 = -(t111 * 2) + t88;
753+ hi[ 6][slot] = SHIFT(t88); */
754+ lddsp r5/*t183*/, sp[20*4]
755+
756+ add r7/*t123*/, r10, r2 << 1
757+ sub r10/*t99*/, r0, r10 << 1
758+ scale r0
759+ add r2/*t174*/, r2, r5 << 1
760+ st.w r11[6*SLOTS*4], r0
761+
762+/* Live: r1 = t203, r2 = t174, r3 = t130, r4 = t200,
763+ r6 = t199, r7 = t123, r8 = t201, r10 = t99
764+ Free: r0, r5, r9, lr */
765+
766+/* t112 = -(t130 * 2) + t99;
767+ t164 = (t181 * 2) + t130;
768+ hi[ 4][slot] = SHIFT(-t99); */
769+ lddsp r0/*t181*/, sp[22*4]
770+
771+ sub r5/*t112*/, r10, r3 << 1
772+ neg r10
773+ scale r10
774+ add r3/*t164*/, r3, r0 << 1
775+ st.w r11[4*SLOTS*4], r10
776+
777+/* Live: r1 = t203, r2 = t174, r3 = t164, r4 = t200,
778+ r5 = t112, r6 = t199, r7 = t123, r8 = t201
779+ Free: r0, r9, r10, lr */
780+
781+
782+/* t117 = -(-(t123 * 2) - t112);
783+ t139 = (t179 * 2) + t123;
784+ hi[ 2][slot] = SHIFT(t112); */
785+ lddsp r0/*t179*/, sp[16*4]
786+
787+ add r9/*t117*/, r5, r7 << 1
788+ scale r5
789+ add r7/*t139*/, r7, r0 << 1
790+ st.w r11[2*SLOTS*4], r5
791+
792+/* Live: r1 = t203, r2 = t174, r3 = t164, r4 = t200,
793+ r6 = t199, r7 = t139, r8 = t201, r9 = t117
794+ Free: r0, r5, r10, lr */
795+
796+/* t155 = -(t174 * 2) + t139;
797+ t204 = -(-(t203 * 2) - t174);
798+ t124 = (t177 * 2) + t117;
799+ hi[ 0][slot] = SHIFT(-t117);
800+ t131 = -(t139 * 2) + t124;
801+ lo[ 1][slot] = SHIFT(t124);*/
802+ lddsp r0/*t177*/, sp[18*4]
803+
804+ sub r5/*t155*/, r7, r2 << 1
805+ add r2/*t204*/, r2, r1 << 1
806+ add r0/*t124*/, r9, r0 << 1
807+ neg r9
808+ scale r9
809+ sub r7/*t131*/, r0, r7 << 1
810+ scale r0
811+ st.w r11[0*SLOTS*4], r9
812+ st.w r12[1*SLOTS*4], r0
813+
814+/* Live: r2 = t204, r3 = t164, r4 = t200,
815+ r5 = t155, r6 = t199, r7 = t131, r8 = t201
816+ Free: r0, r1, r9, r10, lr */
817+
818+/* t140 = (t164 * 2) + t131;
819+ lo[ 3][slot] = SHIFT(-t131);
820+ t202 = -(-(t201 * 2) - t164); */
821+ add r0/*t140*/, r7, r3 << 1
822+ neg r7
823+ scale r7
824+ add r3/*t202*/, r3, r8 << 1
825+ st.w r12[3*SLOTS*4], r7
826+
827+/* Live: r0 = t140, r2 = t204, r3 = t202, r4 = t200,
828+ r5 = t155, r6 = t199
829+ Free: r1, r7, r8, r9, r10, lr */
830+
831+
832+/* t147 = -(-(t155 * 2) - t140);
833+ lo[ 5][slot] = SHIFT(t140);
834+ t175 = -(t200 * 2) + t155;
835+ t156 = -(t199 * 2) + t147;
836+ lo[ 7][slot] = SHIFT(-t147); */
837+ add r1/*t147*/, r0, r5 << 1
838+ scale r0
839+ sub r5/*t175*/, r5, r4 << 1
840+ sub r4/*t156*/, r1, r6 << 1
841+ neg r1
842+ scale r1
843+ st.w r12[5*SLOTS*4], r0
844+ st.w r12[7*SLOTS*4], r1
845+
846+/* Live: r2 = t204, r3 = t202,
847+ r4 = t156, r5 = t175
848+ Free: r0, r1, r6, r7, r8, r9, r10, lr */
849+
850+
851+/* t205 = -(-(t204 * 2) - t175);
852+ t165 = -(t175 * 2) + t156;
853+ lo[ 9][slot] = SHIFT(t156);
854+ t176 = -(t202 * 2) + t165;
855+ lo[11][slot] = SHIFT(-t165);
856+ t206 = -(-(t205 * 2) - t176);
857+ lo[15][slot] = SHIFT(-t206)
858+ lo[13][slot] = SHIFT(t176) */
859+ add r0/*t205*/, r5, r2 << 1
860+ sub r1/*t165*/, r4, r5 << 1
861+ scale r4
862+ sub r3/*t176*/, r1, r3 << 1
863+ st.w r12[9*SLOTS*4], r4
864+ neg r1
865+ scale r1
866+ add r6/*t206*/, r3, r0 << 1
867+ neg r6
868+ scale r6
869+ scale r3
870+ st.w r12[11*SLOTS*4], r1
871+ st.w r12[15*SLOTS*4], r6
872+ st.w r12[13*SLOTS*4], r3
873+
874+/* t193 = -((t190 * 2) - t143)
875+ hi[ 7][slot] = SHIFT(t143);
876+ lo[ 8][slot] = SHIFT(-t193);
877+ t82 = -(t104 * 2) + t58;
878+ hi[13][slot] = SHIFT(t58);
879+ t134 = -(t168 * 2) + t104;
880+ t196 = -(t189 * 2) + t168; */
881+
882+ lddsp r0/*t190*/, sp[27*4]
883+ lddsp r1/*t143*/, sp[11*4]
884+ lddsp r2/*t104*/, sp[5*4]
885+ lddsp r3/*t58*/, sp[1*4]
886+ lddsp r4/*t168*/, sp[13*4]
887+ lddsp r5/*t189*/, sp[29*4]
888+ sub r0/*t193*/, r1, r0 << 1
889+ neg r0
890+ scale r1
891+ scale r0
892+ st.w r11[7*SLOTS*4], r1
893+ st.w r12[8*SLOTS*4], r0
894+ sub r0/*t82*/, r3, r2 << 1
895+ scale r3
896+ sub r2/*t134*/, r2, r4 << 1
897+ sub r4/*t196*/, r4, r5 << 1
898+ st.w r11[13*SLOTS*4], r3
899+
900+/* Live: r0 = t82, r2 = t134,
901+ r4 = t196
902+ Free: r1, r3, r5, r6, r7, r8, r9, r10, lr */
903+
904+
905+
906+/*
907+
908+ t207 = -(t185 * 2) + t150;
909+ t105 = (t150 * 2) + t82;
910+ hi[ 9][slot] = SHIFT(-t82);
911+ t120 = -(-(t134 * 2) - t105);
912+ hi[ 5][slot] = SHIFT(t105);
913+ t169 = (t182 * 2) + t134;
914+
915+ t135 = (t178 * 2) + t120;
916+ hi[ 1][slot] = SHIFT(-t120);
917+ t197 = -(-(t196 * 2) - t169);
918+ t151 = -(t169 * 2) + t135;
919+ lo[ 2][slot] = SHIFT(t135); */
920+ lddsp r1/*t185*/, sp[25*4]
921+ lddsp r3/*t150*/, sp[9*4]
922+ lddsp r5/*t182*/, sp[21*4]
923+ lddsp r8/*t178*/, sp[17*4]
924+
925+ sub r6/*t207*/, r3, r1 << 1
926+ add r3/*t105*/, r0, r3 << 1
927+ neg r0
928+ scale r0
929+ add r7/*t120*/, r3, r2 << 1
930+ scale r3
931+ st.w r11[9*SLOTS*4], r0
932+ st.w r11[5*SLOTS*4], r3
933+ add r2/*t169*/, r2, r5 << 1
934+ add r8/*t135*/, r7, r8 << 1
935+ neg r7
936+ scale r7
937+ add r4/*t197*/, r2, r4 << 1
938+ sub r2/*t151*/, r8, r2 << 1
939+ scale r8
940+ st.w r11[1*SLOTS*4], r7
941+ st.w r12[2*SLOTS*4], r8
942+
943+/* Live: r2 = t151, r4 = t197, r6 = t207
944+
945+ Free: r0, r1, r3, r5, r7, r8, r9, r10, lr */
946+
947+
948+
949+/* t170 = -(t207 * 2) + t151;
950+ lo[ 6][slot] = SHIFT(-t151);
951+
952+ t198 = -(-(t197 * 2) - t170);
953+ lo[10][slot] = SHIFT(t170);
954+ lo[14][slot] = SHIFT(-t198);
955+
956+ t127 = -(t159 * 2) + t93;
957+ hi[11][slot] = SHIFT(t93);
958+ t194 = -(t187 * 2) + t159; */
959+ lddsp r0/*t159*/, sp[15*4]
960+ lddsp r1/*t93*/, sp[7*4]
961+ lddsp r3/*t187*/, sp[31*4]
962+ sub r5/*t170*/, r2, r6 << 1
963+ neg r2
964+ scale r2
965+ add r4/*t198*/,r5, r4 << 1
966+ neg r4
967+ scale r5
968+ scale r4
969+ st.w r12[6*SLOTS*4], r2
970+ st.w r12[10*SLOTS*4], r5
971+ st.w r12[14*SLOTS*4], r4
972+ sub r7/*t127*/, r1, r0 << 1
973+ scale r1
974+ sub r0/*t194*/, r0, r3 << 1
975+ st.w r11[11*SLOTS*4], r1
976+
977+
978+/* Live: r0 = t194, r7 = t127
979+ Free: r1, r2, r3, r4, r6, r5, r8, r9, r10, lr */
980+
981+/* t160 = (t180 * 2) + t127;
982+ hi[ 3][slot] = SHIFT(-t127);
983+ t195 = -(-(t194 * 2) - t160);
984+ lo[ 4][slot] = SHIFT(t160);
985+ lo[12][slot] = SHIFT(-t195);
986+
987+ hi[15][slot] = SHIFT(t191);
988+ lo[ 0][slot] = SHIFT(t192); */
989+ lddsp r1/*t180*/, sp[23*4]
990+ lddsp r2/*t191*/, sp[3*4]
991+ lddsp r3/*t192*/, sp[19*4]
992+ add r4/*t160*/, r7, r1 << 1
993+ neg r7
994+ scale r7
995+ add r6/*t195*/, r4, r0 << 1
996+ scale r4
997+ neg r6
998+ scale r6
999+ st.w r11[3*SLOTS*4], r7
1000+ st.w r12[4*SLOTS*4], r4
1001+ st.w r12[12*SLOTS*4], r6
1002+ scale r2
1003+ scale r3
1004+ st.w r11[15*SLOTS*4], r2
1005+ st.w r12[0*SLOTS*4], r3
1006+
1007+ sub sp, -32*4
1008+ ldm sp++,r0-r7, r9-r11, pc
1009diff --git a/fixed.h b/fixed.h
1010index 4b58abf..0a1350a 100644
1011--- a/fixed.h
1012+++ b/fixed.h
1013@@ -237,6 +237,46 @@ mad_fixed_t mad_f_mul_inline(mad_fixed_t x, mad_fixed_t y)
1014 # define MAD_F_SCALEBITS MAD_F_FRACBITS
1015 # endif
1016
1017+/* --- AVR32 ----------------------------------------------------------------- */
1018+
1019+# elif defined(FPM_AVR32)
1020+
1021+typedef signed short mad_coeff_t;
1022+
1023+struct DWstruct {int high, low;}; /* two-int view of a long long; NOTE(review): high-first order presumably matches a big-endian layout -- confirm */
1024+
1025+typedef union {
1026+ struct DWstruct s; /* access as separate high/low words */
1027+ long long ll; /* access as a single long long */
1028+} DWunion;
1029+
1030+# define MAD_F_MLX(hi, lo, x, y) /* hi:lo = long long product of x and y, split through DWunion */ \
1031+ { register DWunion __res; \
1032+ __res.ll = (long long)(x) * (long long)(y); /* operands parenthesized so an expression argument is cast as a whole */ \
1033+ /* asm ("muls.d\t%0, %1, %2" : "=r" (__res.ll) : "r" (x), "r" (y));*/ \
1034+ (hi) = __res.s.high; \
1035+ (lo) = __res.s.low; }
1036+
1037+# define MAD_F_MLA(hi, lo, x, y) /* hi:lo += long long product of x and y, accumulated through DWunion */ \
1038+ { register DWunion __res; \
1039+ __res.s.high = (hi); \
1040+ __res.s.low = (lo); \
1041+ __res.ll += (long long)(x) * (long long)(y); /* operands parenthesized so an expression argument is cast as a whole */ \
1042+/* asm ("macs.d\t%0, %1, %2" : "+r" (__res.ll) : "r" (x), "r" (y));*/ \
1043+ (hi) = __res.s.high; \
1044+ (lo) = __res.s.low; }
1045+
1046+
1047+# define MAD_F_MLN(hi, lo) \
1048+ asm ("neg %0\n" \
1049+ "acr %1\n" \
1050+ "neg %1" \
1051+ : "+r" (lo), "+r" (hi) \
1052+ :: "cc")
1053+
1054+
1055+# define MAD_F_SCALEBITS MAD_F_FRACBITS
1056+
1057 /* --- ARM ----------------------------------------------------------------- */
1058
1059 # elif defined(FPM_ARM)
1060@@ -433,6 +473,8 @@ mad_fixed_t mad_f_mul_inline(mad_fixed_t x, mad_fixed_t y)
1061 *
1062 * Pre-rounding is required to stay within the limits of compliance.
1063 */
1064+typedef signed int mad_coeff_t;
1065+
1066 # if defined(OPT_SPEED)
1067 # define mad_f_mul(x, y) (((x) >> 12) * ((y) >> 16))
1068 # else
1069diff --git a/imdct_avr32.S b/imdct_avr32.S
1070new file mode 100644
1071index 0000000..d0ee6b4
1072--- /dev/null
1073+++ b/imdct_avr32.S
1074@@ -0,0 +1,789 @@
1075+/*
1076+ Optimized 36-point Inverse Modified Discrete Cosine Transform (IMDCT)
1077+ Copyright 2003-2006 Atmel Corporation.
1078+
1079+ Written by Ronny Pedersen, Atmel Norway
1080+
1081+ This program is free software; you can redistribute it and/or modify
1082+ it under the terms of the GNU General Public License as published by
1083+ the Free Software Foundation; either version 2 of the License, or
1084+ (at your option) any later version.
1085+
1086+ This program is distributed in the hope that it will be useful,
1087+ but WITHOUT ANY WARRANTY; without even the implied warranty of
1088+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1089+ GNU General Public License for more details.
1090+
1091+ You should have received a copy of the GNU General Public License
1092+ along with this program; if not, write to the Free Software
1093+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
1094+
1095+#define MAD_F(x) ((x + (1 << 13)) >> 14)
1096+
1097+ .public imdct36_avr32
1098+
1099+/*
1100+ void imdct36(mad_fixed_t const x[18], mad_fixed_t y[36])
1101+ {
1102+ mad_fixed_t tmp[18];
1103+ int i;
1104+*/
1105+/* DCT-IV */
1106+imdct36_avr32:
1107+ pushm r0-r7,r11,lr
1108+ sub sp, 4*18
1109+/*
1110+ {
1111+ mad_fixed_t tmp2[18];
1112+ int i;
1113+
1114+ /* scale[i] = 2 * cos(PI * (2 * i + 1) / (4 * 18)) */
1115+/*
1116+ static mad_fixed_t const scale[18] = {
1117+ MAD_F(0x1ff833fa), MAD_F(0x1fb9ea93), MAD_F(0x1f3dd120),
1118+ MAD_F(0x1e84d969), MAD_F(0x1d906bcf), MAD_F(0x1c62648b),
1119+ MAD_F(0x1afd100f), MAD_F(0x1963268b), MAD_F(0x1797c6a4),
1120+ MAD_F(0x159e6f5b), MAD_F(0x137af940), MAD_F(0x11318ef3),
1121+ MAD_F(0x0ec6a507), MAD_F(0x0c3ef153), MAD_F(0x099f61c5),
1122+ MAD_F(0x06ed12c5), MAD_F(0x042d4544), MAD_F(0x0165547c)
1123+ };
1124+*/
1125+
1126+ /* scaling */
1127+
1128+/*
1129+ for (i = 0; i < 18; i += 3) {
1130+ tmp2[i + 0] = mad_f_mul(x[i + 0], scale[i + 0]);
1131+ tmp2[i + 1] = mad_f_mul(x[i + 1], scale[i + 1]);
1132+ tmp2[i + 2] = mad_f_mul(x[i + 2], scale[i + 2]);
1133+ }
1134+*/
1135+ /* even input butterfly */
1136+
1137+/*
1138+ for (i = 0; i < 9; i += 3) {
1139+ tmp3[i + 0] = tmp2[i + 0] + tmp2[18 - (i + 0) - 1];
1140+ tmp3[i + 1] = tmp2[i + 1] + tmp2[18 - (i + 1) - 1];
1141+ tmp3[i + 2] = tmp2[i + 2] + tmp2[18 - (i + 2) - 1];
1142+ }
1143+ for (i = 0; i < 9; i += 3) {
1144+ tmp4[i + 0] = tmp2[i + 0] - tmp2[18 - (i + 0) - 1];
1145+ tmp4[i + 1] = tmp2[i + 1] - tmp2[18 - (i + 1) - 1];
1146+ tmp4[i + 2] = tmp2[i + 2] - tmp2[18 - (i + 2) - 1];
1147+ }
1148+*/
1149+
1150+ ld.d r8, r12[0] /*r8 = x[1], r9 = x[0]*/
1151+ ld.d r0, pc[scale_dctIV - .] /*r0 = {scale[2], scale[3]}, r1 = { scale[0], scale[1] }*/
1152+ ld.d r2, r12[2*4] /*r2 = x[3], r3 = x[2]*/
1153+ ld.d r4, pc[scale_dctIV - . + 14*2] /*r4 = {scale[16], scale[17]}, r5 = { scale[14], scale[15] }*/
1154+ mulsatrndwh.w r9/*tmp2[0]*/, r9, r1:t /*tmp2[0] = mad_f_mul(x[0], scale[0]) */
1155+ ld.d r6, r12[16*4] /*r6 = x[17], r7 = x[16]*/
1156+ mulsatrndwh.w r8/*tmp2[1]*/, r8, r1:b /*tmp2[1] = mad_f_mul(x[1], scale[1]) */
1157+ mulsatrndwh.w r3/*tmp2[2]*/, r3, r0:t /*tmp2[2] = mad_f_mul(x[2], scale[2]) */
1158+ mulsatrndwh.w r2/*tmp2[3]*/, r2, r0:b /*tmp2[3] = mad_f_mul(x[3], scale[3]) */
1159+ ld.d r0, r12[14*4] /*r0 = x[15], r1 = x[14]*/
1160+ mulsatrndwh.w r7/*tmp2[16]*/, r7, r4:t /*tmp2[16] = mad_f_mul(x[16], scale[16]) */
1161+ mulsatrndwh.w r6/*tmp2[17]*/, r6, r4:b /*tmp2[17] = mad_f_mul(x[17], scale[17]) */
1162+ mulsatrndwh.w r1/*tmp2[14]*/, r1, r5:t /*tmp2[14] = mad_f_mul(x[14], scale[14]) */
1163+ mulsatrndwh.w r0/*tmp2[15]*/, r0, r5:b /*tmp2[15] = mad_f_mul(x[15], scale[15]) */
1164+
1165+ ld.d r4, r12[4*4] /*r4 = x[5], r5 = x[4]*/
1166+
1167+ sub lr/*tmp4[0]*/, r9, r6
1168+ add r6/*tmp3[0]*/, r9, r6
1169+ sub r10/*tmp4[1]*/, r8, r7
1170+ add r7/*tmp3[1]*/, r8, r7
1171+ sub r9/*tmp4[2]*/, r3, r0
1172+ add r0/*tmp3[2]*/, r3, r0
1173+ sub r8/*tmp4[3]*/, r2, r1
1174+ add r1/*tmp3[3]*/, r2, r1
1175+
1176+ ld.d r2, pc[scale_dctIV - . + 4*2] /*r2 = {scale[6], scale[7]}, r3 = { scale[4], scale[5] }*/
1177+
1178+ stm --sp, r8-r10, lr /*sp[0] = tmp4[0],sp[1] = tmp4[1],
1179+ sp[2] = tmp4[2],sp[3] = tmp4[3] */
1180+
1181+ /* Registers used: r0 = tmp3[2], r1 = tmp3[3], r6 = tmp3[0], r7 = tmp3[1], r12 = x
1182+ Free registers: r2-r5, r8-r11, lr
1183+ */
1184+ ld.d r8, r12[6*4] /*r8 = x[7], r9 = x[6]*/
1185+ ld.d r10, pc[scale_dctIV - . + 10*2] /*r10 = {scale[12], scale[13]}, r11 = { scale[10], scale[11] }*/
1186+ mulsatrndwh.w r5/*tmp2[4]*/, r5, r3:t /*tmp2[4] = mad_f_mul(x[4], scale[4]) */
1187+ mulsatrndwh.w r4/*tmp2[5]*/, r4, r3:b /*tmp2[5] = mad_f_mul(x[5], scale[5]) */
1188+ mulsatrndwh.w r9/*tmp2[6]*/, r9, r2:t /*tmp2[6] = mad_f_mul(x[6], scale[6]) */
1189+ mulsatrndwh.w r8/*tmp2[7]*/, r8, r2:b /*tmp2[7] = mad_f_mul(x[7], scale[7]) */
1190+
1191+ ld.d r2, r12[12*4] /*r2 = x[13], r3 = x[12]*/
1192+ ld.w lr, r12[11*4] /*lr = x[11] */
1193+ mulsatrndwh.w r3/*tmp2[12]*/, r3, r10:t /*tmp2[12] = mad_f_mul(x[12], scale[12]) */
1194+ mulsatrndwh.w r2/*tmp2[13]*/, r2, r10:b /*tmp2[13] = mad_f_mul(x[13], scale[13]) */
1195+ ld.w r10, r12[10*4] /*r10 = x[10] */
1196+ mulsatrndwh.w lr/*tmp2[11]*/, lr, r11:b /*tmp2[11] = mad_f_mul(x[11], scale[11]) */
1197+ mulsatrndwh.w r10/*tmp2[10]*/, r10, r11:t /*tmp2[10] = mad_f_mul(x[10], scale[10]) */
1198+
1199+ sub r11/*tmp4[4]*/, r5, r2
1200+ add r2/*tmp3[4]*/, r5, r2
1201+ sub r5/*tmp4[5]*/, r4, r3
1202+ add r3/*tmp3[5]*/, r4, r3
1203+ sub r4/*tmp4[6]*/, r9, lr
1204+ add lr/*tmp3[6]*/, r9, lr
1205+ sub r9/*tmp4[7]*/, r8, r10
1206+ add r10/*tmp3[7]*/, r8, r10
1207+ lddpc r8, scale_dctIV + 8*2 /*r8 = {scale[8], scale[9]} */
1208+
1209+ stm --sp, r4, r5, r9, r11 /*sp[0] = tmp4[4],sp[1] = tmp4[7],
1210+ sp[2] = tmp4[5],sp[3] = tmp4[6] */
1211+ ld.d r4, r12[8*4] /*r4 = x[9], r5 = x[8]*/
1212+ mulsatrndwh.w r5/*tmp2[8]*/, r5, r8:t /*tmp2[8] = mad_f_mul(x[8], scale[8]) */
1213+ mulsatrndwh.w r4/*tmp2[9]*/, r4, r8:b /*tmp2[9] = mad_f_mul(x[9], scale[9]) */
1214+ sub r9/*tmp4[8]*/, r5, r4
1215+ add r5/*tmp3[8]*/, r5, r4
1216+
1217+ st.w --sp, r9 /* sp[0] = tmp4[8] */
1218+
1219+ /* Registers used:
1220+
1221+ r0=tmp3[2], r1=tmp3[3], r2=tmp3[4], r3=tmp3[5], r5=tmp3[8], r6 = tmp3[0],
1222+ r7 = tmp3[1], r10=tmp3[7], lr=tmp3[6]
1223+ Free registers:
1224+ r4, r8, r9, r11, r12
1225+ */
1226+
1227+
1228+ /* SDCT-II */
1229+/*
1230+
1231+ {
1232+ mad_fixed_t tmp3[9];
1233+ int i;
1234+*/
1235+ /* scale[i] = 2 * cos(PI * (2 * i + 1) / (2 * 18)) */
1236+/*
1237+ static mad_fixed_t const scale[9] = {
1238+ MAD_F(0x1fe0d3b4), MAD_F(0x1ee8dd47), MAD_F(0x1d007930),
1239+ MAD_F(0x1a367e59), MAD_F(0x16a09e66), MAD_F(0x125abcf8),
1240+ MAD_F(0x0d8616bc), MAD_F(0x08483ee1), MAD_F(0x02c9fad7)
1241+ };
1242+*/
1243+ /* divide the 18-point SDCT-II into two 9-point SDCT-IIs */
1244+
1245+
1246+ /* fastdct */
1247+
1248+/*
1249+ {
1250+ mad_fixed_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12;
1251+ mad_fixed_t a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24, a25;
1252+ mad_fixed_t m0, m1, m2, m3, m4, m5, m6, m7;
1253+*/
1254+// enum {
1255+// c0 = MAD_F(0x1f838b8d), /* 2 * cos( 1 * PI / 18) */
1256+// c1 = MAD_F(0x1bb67ae8), /* 2 * cos( 3 * PI / 18) */
1257+// c2 = MAD_F(0x18836fa3), /* 2 * cos( 4 * PI / 18) */
1258+// c3 = MAD_F(0x1491b752), /* 2 * cos( 5 * PI / 18) */
1259+// c4 = MAD_F(0x0af1d43a), /* 2 * cos( 7 * PI / 18) */
1260+// c5 = MAD_F(0x058e86a0), /* 2 * cos( 8 * PI / 18) */
1261+// c6 = -MAD_F(0x1e11f642) /* 2 * cos(16 * PI / 18) */
1262+// };
1263+
1264+/*
1265+ a2 = tmp3[6] + tmp3[2];
1266+ a6 = tmp3[8] + tmp3[0];
1267+ a11 = a2 - a6;
1268+ m5 = mad_f_mul(a11, -c6) ;
1269+ a4 = tmp3[1] + tmp3[7];
1270+
1271+ a18 = tmp3[4] + a4;
1272+ a19 = -2 * tmp3[4] + a4;
1273+
1274+ a0 = tmp3[3] + tmp3[5];
1275+
1276+*/
1277+ add r11/*a4*/, r7, r10
1278+ add r12/*a18*/, r2, r11
1279+ sub r11/*a19*/, r11, r2<<1
1280+
1281+ add r4/*a2*/, lr, r0
1282+ add r8/*a6*/, r5, r6
1283+ sub r9/*a11*/, r4, r8
1284+
1285+ st.d --sp, r0 /* sp[0] = tmp3[3], sp[1] = tmp3[2]*/
1286+
1287+ mov r2, MAD_F(0x1e11f642)
1288+ mulsatrndwh.w r9/*m5*/, r9, r2:b
1289+
1290+ add r2/*a0*/, r1, r3
1291+
1292+ /* Registers used:
1293+
1294+ r2=a0, r3=tmp3[5], r4=a2, r5=tmp3[8], r6 = tmp3[0],
1295+ r7 = tmp3[1], r8=a6, r10=tmp3[7], r9=m5, r11=a19, r12=a18,lr=tmp3[6]
1296+ Free registers:
1297+ r0, r1
1298+ */
1299+
1300+/*
1301+ a8 = a0 + a2;
1302+ a12 = a8 + a6;
1303+ a10 = a0 - a6;
1304+ a9 = a0 - a2;
1305+ m7 = mad_f_mul(a9, -c2) ;
1306+ m6 = mad_f_mul(a10, -c5) ;
1307+*/
1308+
1309+ add r0/*a8*/, r2, r4
1310+ add r0/*a12*/, r8
1311+ rsub r8/*a10*/, r2
1312+ sub r2/*a9*/, r4
1313+ mov r1, -MAD_F(0x18836fa3)
1314+ mulsatrndwh.w r2/*m7*/, r2, r1:b
1315+ mov r1, -MAD_F(0x058e86a0)
1316+ mulsatrndwh.w r8/*m6*/, r8, r1:b
1317+
1318+ /* Registers used:
1319+
1320+ r0=a12, r2=m7, r3=tmp3[5], r5=tmp3[8], r6 = tmp3[0],
1321+ r7 = tmp3[1], r8=m6, r10=tmp3[7], r9=m5, r11=a19, r12=a18,lr=tmp3[6]
1322+ Free registers:
1323+ r1, r4
1324+ */
1325+
1326+
1327+/*
1328+ a21 = -a19 - (m5 << 1);
1329+ tmp[ 8] = a21 - (m6 << 1);
1330+
1331+ a20 = a19 - (m5 << 1);
1332+ tmp[ 4] = (m7 << 1) + a20;
1333+ a22 = -a19 + (m6 << 1);
1334+ tmp[16] = a22 + (m7 << 1);
1335+ tmp[ 0] = a18 + a12;
1336+ tmp[12] = a12 - 2 * a18;
1337+*/
1338+ add r1/*a21*/, r11, r9 << 1
1339+ neg r1
1340+ sub r1/*tmp[8]*/, r1, r8 << 1
1341+ stdsp sp[4*11/*tmp3[..] on the stack*/ + 8*4], r1
1342+ sub r4/*a20*/, r11, r9 << 1
1343+ add r4/*tmp[4]*/, r4, r2 << 1
1344+ stdsp sp[4*11/*tmp3[..] on the stack*/ + 4*4], r4
1345+ neg r11
1346+ add r1/*a22*/, r11, r8 << 1
1347+ add r1/*tmp[16]*/, r1, r2 << 1
1348+ stdsp sp[4*11/*tmp3[..] on the stack*/ + 16*4], r1
1349+ add r4, r12, r0
1350+ sub r1, r0, r12 << 1
1351+ stdsp sp[4*11/*tmp3[..] on the stack*/ + 0*4], r4
1352+ stdsp sp[4*11/*tmp3[..] on the stack*/ + 12*4], r1
1353+
1354+ ld.d r0, sp++
1355+
1356+ /* Registers used:
1357+
1358+ r0 = tmp3[2], r1 = tmp3[3], r3=tmp3[5], r5=tmp3[8], r6 = tmp3[0],
1359+ r7 = tmp3[1], r10=tmp3[7], r11=a19, lr=tmp3[6]
1360+ Free registers:
1361+ r2,r4,r8,r9,r12
1362+ */
1363+
1364+/*
1365+ a5 = tmp3[1] - tmp3[7];
1366+ a7 = tmp3[8] - tmp3[0];
1367+ a3 = tmp3[6] - tmp3[2];
1368+ a1 = tmp3[3] - tmp3[5];
1369+ a13 = a1 - a3;
1370+ a14 = a13 + a7;
1371+ m3 = mad_f_mul(a14, -c1) ;
1372+ m4 = mad_f_mul(a5, -c1) ;
1373+ tmp[ 6] = m3 << 1;
1374+*/
1375+ sub r7/*a5*/, r10
1376+ sub r2/*a7*/, r5, r6
1377+ sub r4/*a3*/, lr, r0
1378+ sub r8/*a1*/, r1, r3
1379+ sub r9/*a13*/, r8, r4
1380+ add r12/*a14*/, r9, r2
1381+ mov r0, -MAD_F(0x1bb67ae8)
1382+ mulsatrndwh.w r12/*m3*/, r12, r0:b
1383+ mulsatrndwh.w r7/*m4*/, r7, r0:b
1384+ lsl r12, 1
1385+ stdsp sp[4*9/*tmp3[..] on the stack*/ + 6*4], r12
1386+
1387+ /* Registers used:
1388+ r2 = a7, r4 = a3, r7 = m4, r8 = a1, r12 = m3
1389+
1390+ Free registers:
1391+ r0, r1, r3, r5, r6, r10, r9, r11, lr
1392+ */
1393+
1394+
1395+/*
1396+ a15 = a3 + a7;
1397+ m2 = mad_f_mul(a15, -c4) ;
1398+ a17 = a1 + a3;
1399+ m0 = mad_f_mul(a17, -c3) ;
1400+ a23 = (m4 << 1) + (m2 << 1);
1401+ tmp[14] = a23 + (m0 << 1); */
1402+ add r0/*a15*/, r4, r2
1403+ mov r1, -MAD_F(0x0af1d43a)
1404+ mulsatrndwh.w r0/*m2*/, r0, r1:b
1405+ mov r3, -MAD_F(0x1491b752)
1406+ add r5/*a17*/, r8, r4
1407+ mulsatrndwh.w r5/*m0*/, r5, r3:b
1408+ lsl r7, 1
1409+ add r6/*a23*/, r7, r0 << 1
1410+ add r6/*tmp[14]*/, r6, r5 << 1
1411+ stdsp sp[4*9/*tmp3[..] on the stack*/ + 14*4], r6
1412+
1413+ /* Registers used:
1414+ r0 = m2, r2 = a7, r5 = m0, r7 = m4, r8 = a1
1415+
1416+ Free registers:
1417+ r1, r3, r4, r6, r10, r9, r11, lr
1418+ */
1419+
1420+/*
1421+ a16 = a1 - a7;
1422+ m1 = mad_f_mul(a16, -c0) ;
1423+ a24 = (m4 << 1) - (m2 << 1);
1424+ tmp[10] = a24 - (m1 << 1);
1425+
1426+ a25 = (m4 << 1) + (m1 << 1);
1427+ tmp[ 2] = (m0 << 1) - a25;
1428+*/
1429+ sub r3/*a16*/, r8, r2
1430+ mov r4, -MAD_F(0x1f838b8d)
1431+ mulsatrndwh.w r3/*m1*/, r3, r4:b
1432+ sub r1/*a24*/, r7, r0 << 1
1433+ sub r1/*tmp[10]*/, r1, r3 << 1
1434+ stdsp sp[4*9/*tmp3[..] on the stack*/ + 10*4], r1
1435+ add r7/*a25*/, r7, r3 << 1
1436+ sub r7, r7, r5 << 1
1437+ neg r7
1438+ stdsp sp[4*9/*tmp3[..] on the stack*/ + 2*4], r7
1439+
1440+
1441+
1442+
1443+ /* output to every other slot for convenience */
1444+
1445+ /*} */
1446+ /* End fastdct */
1447+
1448+ /* odd input butterfly and scaling */
1449+
1450+
1451+ /* On the stack:
1452+ sp[0] = tmp4[8], sp[1] = tmp4[4],sp[2] = tmp4[7], sp[3] = tmp4[5],sp[4] = tmp4[6]
1453+ sp[5] = tmp4[0], sp[6] = tmp4[1],sp[7] = tmp4[2],sp[8] = tmp4[3]
1454+ */
1455+
1456+ /*
1457+ tmp3[0] = mad_f_mul(tmp4[0], scale[0]);
1458+ tmp3[1] = mad_f_mul(tmp4[1], scale[1]) << 1;
1459+ tmp3[2] = mad_f_mul(tmp4[2], scale[2]);
1460+ tmp3[3] = mad_f_mul(tmp4[3], scale[3]) << 1;
1461+ tmp3[4] = mad_f_mul(tmp4[4], scale[4]);
1462+ tmp3[5] = mad_f_mul(tmp4[5], scale[5]);
1463+ tmp3[6] = mad_f_mul(tmp4[6], scale[6]) << 1;
1464+ tmp3[7] = mad_f_mul(tmp4[7], scale[7]);
1465+ tmp3[8] = mad_f_mul(tmp4[8], scale[8]) << 1;
1466+ */
1467+ /* Registers used:
1468+ r1 = tmp4[3], r2 = tmp4[2], r3 = tmp4[1], r4 = tmp4[0], r7 = tmp4[6]
1469+ r10 = tmp4[5], r11 = tmp4[7], r12 = tmp4[4], lr = tmp4[8]
1470+
1471+ Free registers:
1472+ r0, r5, r6, r8, r9
1473+ */
1474+ ld.d r8, pc[ scale_sdctII - . + 4*2] /* r8 = { scale[6], scale[7] }, r9 = { scale[4], scale[5]} */
1475+ ldm sp++, r1, r2, r3, r4, r7, r10, r11, r12, lr
1476+ mov r5, MAD_F(0x02c9fad7) /* r5 = scale[8] */
1477+ mulsatrndwh.w r5/*tmp3[8]*/, lr, r5:b
1478+ mulsatrndwh.w lr/*tmp3[6]*/, r7, r8:t
1479+ ld.d r6, pc[ scale_sdctII - . + 0*2] /* r6 = { scale[2], scale[3] }, r7 = { scale[0], scale[1]} */
1480+ lsl lr, 1
1481+ lsl r5, 1
1482+ mulsatrndwh.w r0/*tmp3[2]*/, r2, r6:t
1483+ mulsatrndwh.w r1/*tmp3[3]*/, r1, r6:b
1484+ mulsatrndwh.w r6/*tmp3[0]*/, r4, r7:t
1485+ mulsatrndwh.w r7/*tmp3[1]*/, r3, r7:b
1486+ mulsatrndwh.w r3/*tmp3[5]*/, r10, r9:b
1487+ mulsatrndwh.w r2/*tmp3[4]*/, r12, r9:t
1488+ mulsatrndwh.w r9/*tmp3[7]*/, r11, r8:b
1489+ lsl r1, 1
1490+ lsl r7, 1
1491+
1492+
1493+ /* fastdct */
1494+
1495+/*
1496+ {
1497+ mad_fixed_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12;
1498+ mad_fixed_t a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24, a25;
1499+ mad_fixed_t m0, m1, m2, m3, m4, m5, m6, m7;
1500+*/
1501+// enum {
1502+// c0 = MAD_F(0x1f838b8d), /* 2 * cos( 1 * PI / 18) */
1503+// c1 = MAD_F(0x1bb67ae8), /* 2 * cos( 3 * PI / 18) */
1504+// c2 = MAD_F(0x18836fa3), /* 2 * cos( 4 * PI / 18) */
1505+// c3 = MAD_F(0x1491b752), /* 2 * cos( 5 * PI / 18) */
1506+// c4 = MAD_F(0x0af1d43a), /* 2 * cos( 7 * PI / 18) */
1507+// c5 = MAD_F(0x058e86a0), /* 2 * cos( 8 * PI / 18) */
1508+// c6 = -MAD_F(0x1e11f642) /* 2 * cos(16 * PI / 18) */
1509+// };
1510+
1511+ /* Registers used:
1512+
1513+ r0=tmp3[2], r1=tmp3[3], r2=tmp3[4], r3=tmp3[5], r5=tmp3[8], r6 = tmp3[0],
1514+ r7 = tmp3[1], r9=tmp3[7], lr=tmp3[6]
1515+ Free registers:
1516+ r4, r8, r10, r11, r12
1517+ */
1518+
1519+/*
1520+ a2 = tmp3[6] + (tmp3[2] << 1);
1521+ a6 = tmp3[8] + (tmp3[0] << 1);
1522+ a11 = a2 - a6;
1523+ m5 = mad_f_mul(a11, c6) ;
1524+ a4 = tmp3[1] + (tmp3[7] << 1);
1525+
1526+ a18 = (tmp3[4] << 1) + a4;
1527+ a19 = -2 * (tmp3[4] << 1) + a4;
1528+
1529+ a0 = tmp3[3] + (tmp3[5] << 1);
1530+
1531+*/
1532+ add r11/*a4*/, r7, r9 << 1
1533+ add r12/*a18*/, r11, r2 << 1
1534+ sub r11/*a19*/, r11, r2 << 2
1535+
1536+ add r4/*a2*/, lr, r0 << 1
1537+ add r8/*a6*/, r5, r6 << 1
1538+ sub r10/*a11*/, r4, r8
1539+
1540+ st.d --sp, r0 /* sp[0] = tmp3[3], sp[1] = tmp3[2]*/
1541+
1542+ mov r2, -MAD_F(0x1e11f642)
1543+ mulsatrndwh.w r10/*m5*/, r10, r2:b
1544+
1545+ add r2/*a0*/, r1, r3 << 1
1546+
1547+ /* Registers used:
1548+
1549+ r2=a0, r3=tmp3[5], r4=a2, r5=tmp3[8], r6 = tmp3[0],
1550+ r7 = tmp3[1], r8=a6, r9=tmp3[7], r10=m5, r11=a19, r12=a18,lr=tmp3[6]
1551+ Free registers:
1552+ r0, r1
1553+ */
1554+
1555+/*
1556+ a8 = a0 + a2;
1557+ a12 = a8 + a6;
1558+ a10 = a0 - a6;
1559+ a9 = a0 - a2;
1560+ m7 = mad_f_mul(a9, -c2) ;
1561+ m6 = mad_f_mul(a10, -c5) ;
1562+*/
1563+
1564+ add r0/*a8*/, r2, r4
1565+ add r0/*a12*/, r8
1566+ rsub r8/*a10*/, r2
1567+ sub r2/*a9*/, r4
1568+ mov r1, -MAD_F(0x18836fa3)
1569+ mulsatrndwh.w r2/*m7*/, r2, r1:b
1570+ mov r1, -MAD_F(0x058e86a0)
1571+ mulsatrndwh.w r8/*m6*/, r8, r1:b
1572+
1573+ /* Registers used:
1574+
1575+ r0=a12, r2=m7, r3=tmp3[5], r5=tmp3[8], r6 = tmp3[0],
1576+ r7 = tmp3[1], r8=m6, r9=tmp3[7], r10=m5, r11=a19, r12=a18,lr=tmp3[6]
1577+ Free registers:
1578+ r1, r4
1579+ */
1580+
1581+
1582+/*
1583+ a21 = -a19 + (m5 << 1);
1584+ tmp[ 9] = a21 - (m6 << 1);
1585+
1586+ a20 = -(-a19 - (m5 << 1));
1587+ tmp[ 5] = (m7 << 1) + a20;
1588+ a22 = -a19 + (m6 << 1);
1589+ tmp[17] = a22 + (m7 << 1);
1590+ tmp[ 1] = a18 + a12;
1591+ tmp[13] = a12 - 2 * a18;
1592+*/
1593+ sub r1/*a21*/, r11, r10 << 1
1594+ neg r1
1595+ sub r1/*tmp[9]*/, r1, r8 << 1
1596+ stdsp sp[4*2/*tmp3[..] on the stack*/ + 9*4], r1
1597+ add r4/*a20*/, r11, r10 << 1
1598+ add r4/*tmp[5]*/, r4, r2 << 1
1599+ stdsp sp[4*2/*tmp3[..] on the stack*/ + 5*4], r4
1600+ neg r11
1601+ add r1/*a22*/, r11, r8 << 1
1602+ add r1/*tmp[17]*/, r1, r2 << 1
1603+ stdsp sp[4*2/*tmp3[..] on the stack*/ + 17*4], r1
1604+ add r4, r12, r0
1605+ sub r1, r0, r12 << 1
1606+ stdsp sp[4*2/*tmp3[..] on the stack*/ + 1*4], r4
1607+ stdsp sp[4*2/*tmp3[..] on the stack*/ + 13*4], r1
1608+
1609+ ld.d r0, sp++
1610+
1611+ /* Registers used:
1612+
1613+ r0 = tmp3[2], r1 = tmp3[3], r3=tmp3[5], r5=tmp3[8], r6 = tmp3[0],
1614+ r7 = tmp3[1], r9=tmp3[7], r11=a19, lr=tmp3[6]
1615+ Free registers:
1616+ r2,r4,r8,r10,r12
1617+ */
1618+
1619+/*
1620+ a5 = tmp3[1] - (tmp3[7] << 1);
1621+ a7 = tmp3[8] - (tmp3[0] << 1);
1622+ a3 = tmp3[6] - (tmp3[2] << 1);
1623+ a1 = tmp3[3] - (tmp3[5] << 1);
1624+ a13 = a1 - a3;
1625+ a14 = a13 + a7;
1626+ m3 = mad_f_mul(a14, -c1) ;
1627+ m4 = mad_f_mul(a5, -c1) ;
1628+ tmp[ 7] = m3 << 1;
1629+*/
1630+ sub r7/*a5*/, r7, r9 << 1
1631+ sub r2/*a7*/, r5, r6 << 1
1632+ sub r4/*a3*/, lr, r0 << 1
1633+ sub r8/*a1*/, r1, r3 << 1
1634+ sub r10/*a13*/, r8, r4
1635+ add r12/*a14*/, r10, r2
1636+ mov r0, -MAD_F(0x1bb67ae8)
1637+ mulsatrndwh.w r12/*m3*/, r12, r0:b
1638+ mulsatrndwh.w r7/*m4*/, r7, r0:b
1639+ lsl r12, 1
1640+ stdsp sp[7*4], r12
1641+
1642+ /* Registers used:
1643+ r2 = a7, r4 = a3, r7 = m4, r8 = a1, r12 = m3
1644+
1645+ Free registers:
1646+ r0, r1, r3, r5, r6, r9, r10, r11, lr
1647+ */
1648+
1649+
1650+/*
1651+ a15 = a3 + a7;
1652+ m2 = mad_f_mul(a15, -c4) ;
1653+ a17 = a1 + a3;
1654+ m0 = mad_f_mul(a17, -c3) ;
1655+ a23 = (m4 << 1) + (m2 << 1);
1656+ tmp[15] = a23 + (m0 << 1); */
1657+ add r0/*a15*/, r4, r2
1658+ mov r1, -MAD_F(0x0af1d43a)
1659+ mulsatrndwh.w r0/*m2*/, r0, r1:b
1660+ mov r3, -MAD_F(0x1491b752)
1661+ add r5/*a17*/, r8, r4
1662+ mulsatrndwh.w r5/*m0*/, r5, r3:b
1663+ lsl r7, 1
1664+ add r6/*a23*/, r7, r0 << 1
1665+ add r6/*tmp[15]*/, r6, r5 << 1
1666+ stdsp sp[15*4], r6
1667+
1668+ /* Registers used:
1669+ r0 = m2, r2 = a7, r5 = m0, r7 = m4, r8 = a1
1670+
1671+ Free registers:
1672+ r1, r3, r4, r6, r9, r10, r11, lr
1673+ */
1674+
1675+/*
1676+ a16 = a1 - a7;
1677+ m1 = mad_f_mul(a16, -c0) ;
1678+ a24 = (m4 << 1) - (m2 << 1);
1679+ tmp[11] = a24 - (m1 << 1);
1680+
1681+ a25 = (m4 << 1) + (m1 << 1);
1682+ tmp[ 3] = (m0 << 1) - a25;
1683+*/
1684+ sub r3/*a16*/, r8, r2
1685+ mov r4, -MAD_F(0x1f838b8d)
1686+ mulsatrndwh.w r3/*m1*/, r3, r4:b
1687+ sub r1/*a24*/, r7, r0 << 1
1688+ sub r1/*tmp[11]*/, r1, r3 << 1
1689+ stdsp sp[11*4], r1
1690+ add r7/*a25*/, r7, r3 << 1
1691+ sub r7, r7, r5 << 1
1692+ neg r7
1693+ lddsp r12, sp[4*18+4] /* Get y from stack */
1694+ stdsp sp[3*4], r7
1695+
1696+
1697+ /* output to every other slot for convenience */
1698+
1699+ /* End fastdct */
1700+
1701+ /* output accumulation */
1702+
1703+/* for (i = 3; i < 18; i += 8) {
1704+ tmp[i + 0] -= tmp[(i + 0) - 2];
1705+ tmp[i + 2] -= tmp[(i + 2) - 2];
1706+ tmp[i + 4] -= tmp[(i + 4) - 2];
1707+ tmp[i + 6] -= tmp[(i + 6) - 2];
1708+ }
1709+ }
1710+*/
1711+
1712+/* End SDCT-II */
1713+
1714+
1715+
1716+ /* scale reduction and output accumulation */
1717+
1718+/*
1719+ for (i = 1; i < 17; i += 4) {
1720+ tmp[i + 0] = tmp[i + 0] - tmp[(i + 0) - 1];
1721+ tmp[i + 1] = tmp[i + 1] - tmp[(i + 1) - 1];
1722+ tmp[i + 2] = tmp[i + 2] - tmp[(i + 2) - 1];
1723+ tmp[i + 3] = tmp[i + 3] - tmp[(i + 3) - 1];
1724+ }
1725+ tmp[17] = tmp[17] - tmp[16];
1726+ }
1727+*/
1728+/* End DCT-IV */
1729+
1730+
1731+ /* convert 18-point DCT-IV to 36-point IMDCT */
1732+
1733+/*
1734+ for (i = 0; i < 9; i += 3) {
1735+ y[i + 0] = tmp[9 + (i + 0)];
1736+ y[i + 1] = tmp[9 + (i + 1)];
1737+ y[i + 2] = tmp[9 + (i + 2)];
1738+ }
1739+ for (i = 9; i < 27; i += 3) {
1740+ y[i + 0] = -tmp[36 - (9 + (i + 0)) - 1];
1741+ y[i + 1] = -tmp[36 - (9 + (i + 1)) - 1];
1742+ y[i + 2] = -tmp[36 - (9 + (i + 2)) - 1];
1743+ }
1744+ for (i = 27; i < 36; i += 3) {
1745+ y[i + 0] = -tmp[(i + 0) - 27];
1746+ y[i + 1] = -tmp[(i + 1) - 27];
1747+ y[i + 2] = -tmp[(i + 2) - 27];
1748+ }
1749+ }
1750+*/
1751+
1752+ /* Registers used:
1753+ r0 = tmp[8], r1 = tmp[7], r2 = tmp[6], r3 = tmp[5], r4 = tmp[4]
1754+ r5 = tmp[3], r6 = tmp[2], r7 = tmp[1], r8 = tmp[0], r12 = y
1755+
1756+ Free registers:
1757+ r9, r10, r11, lr
1758+ */
1759+
1760+ ldm sp++, r0-r8 /* Get tmp[0]-tmp[8] from stack */
1761+ sub r5, r7 /* tmp[3] -= tmp[1]*/
1762+ sub r3, r5 /* tmp[5] -= tmp[3]*/
1763+ sub r1, r3 /* tmp[7] -= tmp[5]*/
1764+
1765+ sub r7, r8 /* tmp[1] -= tmp[0]*/
1766+ sub r6, r7 /* tmp[2] -= tmp[1]*/
1767+ sub r5, r6 /* tmp[3] -= tmp[2]*/
1768+ neg r8
1769+ st.w r12[26*4], r8 /* y[26] = -tmp[0] */
1770+ st.w r12[27*4], r8 /* y[27] = -tmp[0] */
1771+ neg r7
1772+ neg r6
1773+ st.w r12[25*4], r7 /* y[25] = -tmp[1] */
1774+ st.w r12[24*4], r6 /* y[24] = -tmp[2] */
1775+ st.d r12[28*4], r6 /* y[28] = -tmp[1], y[29] = -tmp[2]*/
1776+
1777+ sub r4, r5 /* tmp[4] -= tmp[3]*/
1778+ sub r3, r4 /* tmp[5] -= tmp[4]*/
1779+ neg r5
1780+ neg r4
1781+ st.w r12[23*4], r5 /* y[23] = -tmp[3] */
1782+ st.w r12[22*4], r4 /* y[22] = -tmp[4] */
1783+ st.d r12[30*4], r4 /* y[30] = -tmp[3], y[31] = -tmp[4]*/
1784+
1785+ ldm sp++, r4-r11,lr /* Get tmp[9]-tmp[17] from stack */
1786+
1787+ sub r2, r3 /* tmp[6] -= tmp[5]*/
1788+
1789+ sub lr, r1 /* tmp[9] -= tmp[7]*/
1790+ sub r10, lr /* tmp[11] -= tmp[9]*/
1791+ sub r8, r10 /* tmp[13] -= tmp[11]*/
1792+ sub r6, r8 /* tmp[15] -= tmp[13]*/
1793+ sub r4, r6 /* tmp[17] -= tmp[15]*/
1794+
1795+ sub r1, r2 /* tmp[7] -= tmp[6]*/
1796+ sub r0, r1 /* tmp[8] -= tmp[7]*/
1797+ neg r3
1798+ neg r2
1799+ st.w r12[21*4], r3 /* y[21] = -tmp[5] */
1800+ st.w r12[20*4], r2 /* y[20] = -tmp[6] */
1801+ st.d r12[32*4], r2 /* y[32] = -tmp[5], y[33] = -tmp[6]*/
1802+
1803+ sub lr, r0 /* tmp[9] -= tmp[8]*/
1804+ sub r11, lr /* tmp[10] -= tmp[9]*/
1805+ neg r1
1806+ neg r0
1807+ st.w r12[19*4], r1 /* y[19] = -tmp[7] */
1808+ st.w r12[18*4], r0 /* y[18] = -tmp[8] */
1809+ st.d r12[34*4], r0 /* y[34] = -tmp[7], y[35] = -tmp[8]*/
1810+
1811+ sub r10, r11 /* tmp[11] -= tmp[10]*/
1812+ sub r9, r10 /* tmp[12] -= tmp[11]*/
1813+
1814+ st.w r12[0*4], lr /* y[0] = tmp[9]*/
1815+ neg lr
1816+ st.w r12[17*4], lr /* y[17] = -tmp[9]*/
1817+ st.d r12[1*4], r10 /* y[1] = tmp[10], y[2] = tmp[11] */
1818+ neg r11
1819+ neg r10
1820+ st.w r12[16*4], r11 /* y[16] = -tmp[10] */
1821+ st.w r12[15*4], r10 /* y[15] = -tmp[11] */
1822+
1823+
1824+ sub r8, r9 /* tmp[13] -= tmp[12]*/
1825+ sub r7, r8 /* tmp[14] -= tmp[13]*/
1826+ st.d r12[3*4], r8 /* y[3] = tmp[12], y[4] = tmp[13] */
1827+ neg r9
1828+ neg r8
1829+ st.w r12[14*4], r9 /* y[14] = -tmp[12] */
1830+ st.w r12[13*4], r8 /* y[13] = -tmp[13] */
1831+
1832+ sub r6, r7 /* tmp[15] -= tmp[14]*/
1833+ sub r5, r6 /* tmp[16] -= tmp[15]*/
1834+ sub r4, r5 /* tmp[17] -= tmp[16]*/
1835+
1836+ st.d r12[5*4], r6 /* y[5] = tmp[14], y[6] = tmp[15] */
1837+ neg r7
1838+ neg r6
1839+ st.w r12[12*4], r7 /* y[12] = -tmp[14] */
1840+ st.w r12[11*4], r6 /* y[11] = -tmp[15] */
1841+
1842+ st.d r12[7*4], r4 /* y[7] = tmp[16], y[8] = tmp[17] */
1843+ neg r5
1844+ neg r4
1845+ st.w r12[10*4], r5 /* y[10] = -tmp[16] */
1846+ st.w r12[9*4], r4 /* y[9] = -tmp[17] */
1847+
1848+ popm r0-r7,r11,pc
1849+
1850+ .align 2
1851+scale_dctIV:
1852+ .short MAD_F(0x1ff833fa), MAD_F(0x1fb9ea93), MAD_F(0x1f3dd120)
1853+ .short MAD_F(0x1e84d969), MAD_F(0x1d906bcf), MAD_F(0x1c62648b)
1854+ .short MAD_F(0x1afd100f), MAD_F(0x1963268b), MAD_F(0x1797c6a4)
1855+ .short MAD_F(0x159e6f5b), MAD_F(0x137af940), MAD_F(0x11318ef3)
1856+ .short MAD_F(0x0ec6a507), MAD_F(0x0c3ef153), MAD_F(0x099f61c5)
1857+ .short MAD_F(0x06ed12c5), MAD_F(0x042d4544), MAD_F(0x0165547c)
1858+
1859+ .align 2
1860+scale_sdctII:
1861+ .short MAD_F(0x1fe0d3b4), MAD_F(0x1ee8dd47), MAD_F(0x1d007930)
1862+ .short MAD_F(0x1a367e59), MAD_F(0x16a09e66), MAD_F(0x125abcf8)
1863+ .short MAD_F(0x0d8616bc), MAD_F(0x08483ee1), MAD_F(0x02c9fad7)
1864diff --git a/layer3.c b/layer3.c
1865index 4e5d3fa..dffdab3 100644
1866--- a/layer3.c
1867+++ b/layer3.c
1868@@ -378,6 +378,11 @@ mad_fixed_t const ca[8] = {
1869 -MAD_F(0x003a2847) /* -0.014198569 */, -MAD_F(0x000f27b4) /* -0.003699975 */
1870 };
1871
1872+#ifdef FPM_AVR32
1873+# undef MAD_F
1874+# define MAD_F(x) ((x + (1 << 12)) >> 13)
1875+#endif
1876+
1877 /*
1878 * IMDCT coefficients for short blocks
1879 * derived from section 2.4.3.4.10.2 of ISO/IEC 11172-3
1880@@ -386,7 +391,7 @@ mad_fixed_t const ca[8] = {
1881 * imdct_s[i /odd][k] = cos((PI / 24) * (2 * (6 + (i-1)/2) + 7) * (2 * k + 1))
1882 */
1883 static
1884-mad_fixed_t const imdct_s[6][6] = {
1885+mad_coeff_t const imdct_s[6][6] = {
1886 # include "imdct_s.dat"
1887 };
1888
1889@@ -398,7 +403,7 @@ mad_fixed_t const imdct_s[6][6] = {
1890 * window_l[i] = sin((PI / 36) * (i + 1/2))
1891 */
1892 static
1893-mad_fixed_t const window_l[36] = {
1894+mad_coeff_t const window_l[36] = {
1895 MAD_F(0x00b2aa3e) /* 0.043619387 */, MAD_F(0x0216a2a2) /* 0.130526192 */,
1896 MAD_F(0x03768962) /* 0.216439614 */, MAD_F(0x04cfb0e2) /* 0.300705800 */,
1897 MAD_F(0x061f78aa) /* 0.382683432 */, MAD_F(0x07635284) /* 0.461748613 */,
1898@@ -429,7 +434,7 @@ mad_fixed_t const window_l[36] = {
1899 * window_s[i] = sin((PI / 12) * (i + 1/2))
1900 */
1901 static
1902-mad_fixed_t const window_s[12] = {
1903+mad_coeff_t const window_s[12] = {
1904 MAD_F(0x0216a2a2) /* 0.130526192 */, MAD_F(0x061f78aa) /* 0.382683432 */,
1905 MAD_F(0x09bd7ca0) /* 0.608761429 */, MAD_F(0x0cb19346) /* 0.793353340 */,
1906 MAD_F(0x0ec835e8) /* 0.923879533 */, MAD_F(0x0fdcf549) /* 0.991444861 */,
1907@@ -438,6 +443,11 @@ mad_fixed_t const window_s[12] = {
1908 MAD_F(0x061f78aa) /* 0.382683432 */, MAD_F(0x0216a2a2) /* 0.130526192 */,
1909 };
1910
1911+#ifdef FPM_AVR32
1912+# undef MAD_F
1913+# define MAD_F(x) ((mad_fixed_t) (x##L))
1914+#endif
1915+
1916 /*
1917 * coefficients for intensity stereo processing
1918 * derived from section 2.4.3.4.9.3 of ISO/IEC 11172-3
1919@@ -879,6 +889,42 @@ void III_exponents(struct channel const *channel,
1920 * NAME: III_requantize()
1921 * DESCRIPTION: requantize one (positive) value
1922 */
1923+
1924+#if 0
1925+/*static*/
1926+mad_fixed_t III_requantize(unsigned int value, signed int exp)
1927+{
1928+ register mad_fixed_t tmp2, tmp3;
1929+ long long tmp_d;
1930+
1931+ asm ("asr\t%0, %1, 2\n"
1932+ "ld.w\t%2, %4[%5 << 2]\n"
1933+ "sub\t%1, %1, %0 << 2\n"
1934+ "asr\t%3, %2, 7\n"
1935+ "andl\t%2, 0x7f, COH\n"
1936+ "add\t%0, %2\n"
1937+ "lsl\t%m0,%3,%0\n"
1938+ "neg\t%0\n"
1939+ "asr\t%3,%3,%0\n"
1940+ "add\t%2, %6, %1 << 2\n"
1941+ "ld.w\t%2, %2[12]\n"
1942+ "cp.w\t%0, 0\n"
1943+ "movlt\t%3, %m0\n"
1944+ "muls.d\t%0, %3, %2\n"
1945+ "cp.w\t%1, 0\n"
1946+ "breq\t0f\n"
1947+ "lsr\t%0, %0, 28\n"
1948+ "or\t%3, %0, %m0 << 4\n"
1949+ "0:\n"
1950+ : "=&r"(tmp_d), "+r"(exp), "=&r"(tmp2), "=&r"(tmp3)
1951+ : "r"(&rq_table), "r"(value), "r"(root_table));
1952+
1953+
1954+ return tmp3;
1955+}
1956+
1957+#else
1958+
1959 static
1960 mad_fixed_t III_requantize(unsigned int value, signed int exp)
1961 {
1962@@ -918,6 +964,7 @@ mad_fixed_t III_requantize(unsigned int value, signed int exp)
1963
1964 return frac ? mad_f_mul(requantized, root_table[3 + frac]) : requantized;
1965 }
1966+#endif
1967
1968 /* we must take care that sz >= bits and sz < sizeof(long) lest bits == 0 */
1969 # define MASK(cache, sz, bits) \
1970@@ -2054,27 +2101,42 @@ void imdct36(mad_fixed_t const X[18], mad_fixed_t x[36])
1971 }
1972 # endif
1973
1974+
1975+#ifdef FPM_AVR32
1976+# undef mad_f_mul
1977+# define mad_f_mul(x, y) __builtin_mulsatrndwh_w(x, y)
1978+#endif
1979+
1980 /*
1981 * NAME: III_imdct_l()
1982 * DESCRIPTION: perform IMDCT and windowing for long blocks
1983 */
1984 static
1985-void III_imdct_l(mad_fixed_t const X[18], mad_fixed_t z[36],
1986+void III_imdct_l(mad_fixed_t /*const*/ X[18], mad_fixed_t z[36],
1987 unsigned int block_type)
1988 {
1989 unsigned int i;
1990+ mad_fixed_t *z_ptr; /* write cursor into z[] for the windowing passes */
1991+ mad_coeff_t const *w_ptr; /* read-only cursor: window_l[] and window_s[] are declared const */
1992
1993 /* IMDCT */
1994
1995+#ifdef FPM_AVR32
1996+ imdct36_avr32(X, z);
1997+#else
1998 imdct36(X, z);
1999+#endif
2000
2001 /* windowing */
2002
2003+ z_ptr = &z[0];
2004+ w_ptr = &window_l[0];
2005+
2006 switch (block_type) {
2007 case 0: /* normal window */
2008 # if defined(ASO_INTERLEAVE1)
2009 {
2010- register mad_fixed_t tmp1, tmp2;
2011+ register mad_coeff_t tmp1, tmp2;
2012
2013 tmp1 = window_l[0];
2014 tmp2 = window_l[1];
2015@@ -2091,15 +2153,16 @@ void III_imdct_l(mad_fixed_t const X[18], mad_fixed_t z[36],
2016 }
2017 # elif defined(ASO_INTERLEAVE2)
2018 {
2019- register mad_fixed_t tmp1, tmp2;
2020+ register mad_fixed_t tmp1;
2021+ register mad_coeff_t tmp2;
2022
2023- tmp1 = z[0];
2024- tmp2 = window_l[0];
2025+ tmp1 = *z_ptr;
2026+ tmp2 = *w_ptr++;
2027
2028 for (i = 0; i < 35; ++i) {
2029- z[i] = mad_f_mul(tmp1, tmp2);
2030- tmp1 = z[i + 1];
2031- tmp2 = window_l[i + 1];
2032+ *z_ptr++ = mad_f_mul(tmp1, tmp2);
2033+ tmp1 = *z_ptr;
2034+ tmp2 = *w_ptr++;
2035 }
2036
2037 z[35] = mad_f_mul(tmp1, tmp2);
2038@@ -2118,23 +2181,28 @@ void III_imdct_l(mad_fixed_t const X[18], mad_fixed_t z[36],
2039
2040 case 1: /* start block */
2041 for (i = 0; i < 18; i += 3) {
2042- z[i + 0] = mad_f_mul(z[i + 0], window_l[i + 0]);
2043- z[i + 1] = mad_f_mul(z[i + 1], window_l[i + 1]);
2044- z[i + 2] = mad_f_mul(z[i + 2], window_l[i + 2]);
2045+ *z_ptr = mad_f_mul(*z_ptr, *w_ptr++); ++z_ptr; /* advance only after the store: reading and bumping z_ptr in one expression is unsequenced */
2046+ *z_ptr = mad_f_mul(*z_ptr, *w_ptr++); ++z_ptr;
2047+ *z_ptr = mad_f_mul(*z_ptr, *w_ptr++); ++z_ptr;
2048 }
2049+ z_ptr += 6; /* step over z[18..23], which are left unchanged */
2050+ w_ptr = &window_s[6];
2051 /* (i = 18; i < 24; ++i) z[i] unchanged */
2052- for (i = 24; i < 30; ++i) z[i] = mad_f_mul(z[i], window_s[i - 18]);
2053- for (i = 30; i < 36; ++i) z[i] = 0;
2054+ for (i = 24; i < 30; ++i, ++z_ptr) *z_ptr = mad_f_mul(*z_ptr, *w_ptr++);
2055+ for (i = 30; i < 36; ++i) *z_ptr++ = 0;
2056 break;
2057
2058 case 3: /* stop block */
2059- for (i = 0; i < 6; ++i) z[i] = 0;
2060- for (i = 6; i < 12; ++i) z[i] = mad_f_mul(z[i], window_s[i - 6]);
2061+ w_ptr = &window_s[0];
2062+ for (i = 0; i < 6; ++i) *z_ptr++ = 0;
2063+ for (i = 6; i < 12; ++i, ++z_ptr) *z_ptr = mad_f_mul(*z_ptr, *w_ptr++); /* advance in the loop header, after the store */
2064 /* (i = 12; i < 18; ++i) z[i] unchanged */
2065+ w_ptr = &window_l[18];
2066+ z_ptr += 6; /* step over z[12..17], which are left unchanged */
2067 for (i = 18; i < 36; i += 3) {
2068- z[i + 0] = mad_f_mul(z[i + 0], window_l[i + 0]);
2069- z[i + 1] = mad_f_mul(z[i + 1], window_l[i + 1]);
2070- z[i + 2] = mad_f_mul(z[i + 2], window_l[i + 2]);
2071+ *z_ptr = mad_f_mul(*z_ptr, *w_ptr++); ++z_ptr;
2072+ *z_ptr = mad_f_mul(*z_ptr, *w_ptr++); ++z_ptr;
2073+ *z_ptr = mad_f_mul(*z_ptr, *w_ptr++); ++z_ptr;
2074 }
2075 break;
2076 }
2077@@ -2146,10 +2214,10 @@ void III_imdct_l(mad_fixed_t const X[18], mad_fixed_t z[36],
2078 * DESCRIPTION: perform IMDCT and windowing for short blocks
2079 */
2080 static
2081-void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
2082+void III_imdct_s(mad_fixed_t /*const*/ X[18], mad_fixed_t z[36])
2083 {
2084 mad_fixed_t y[36], *yptr;
2085- mad_fixed_t const *wptr;
2086+ mad_coeff_t const *wptr; /* cursor into window_s[], whose elements are mad_coeff_t in this patch */
2087 int w, i;
2088 register mad_fixed64hi_t hi;
2089 register mad_fixed64lo_t lo;
2090@@ -2159,11 +2227,56 @@ void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
2091 yptr = &y[0];
2092
2093 for (w = 0; w < 3; ++w) {
2094- register mad_fixed_t const (*s)[6];
2095+ register mad_coeff_t const (*s)[6]; /* row cursor into imdct_s[] */
2096
2097 s = imdct_s;
2098
2099 for (i = 0; i < 3; ++i) {
2100+#ifdef FPM_AVR32
2101+ register long long int acc, tmp1, tmp2, tmp3, tmp4; /* 64-bit register pairs used by the asm below */
2102+ asm volatile ("ld.d\t%0, %5++\n" /* coefficients via s (post-increment), samples via X at byte offsets 0, 8, 16 */
2103+ "ld.d\t%1, %6[0]\n"
2104+ "ld.d\t%2, %6[2*4]\n"
2105+ "ld.d\t%3, %6[4*4]\n"
2106+ "mulwh.d\t%4, %m1, %m0:t\n" /* AVR32 stand-in for the first MAD_F_ML0/MAD_F_MLA chain of the #else branch (presumably; confirm against the ISA) */
2107+ "macwh.d\t%4, %1, %m0:b\n"
2108+ "ld.w\t%m0, %5++\n"
2109+ "macwh.d\t%4, %m2, %0:t\n"
2110+ "macwh.d\t%4, %2, %0:b\n"
2111+ "macwh.d\t%4, %m3, %m0:t\n"
2112+ "macwh.d\t%4, %3, %m0:b\n"
2113+ "ld.d\t%0, %5++\n" /* preload coefficients for the second accumulation below */
2114+ "rol\t%4\n"
2115+ "rol\t%m4\n"
2116+ : "=&r"(tmp1), "=&r"(tmp2), "=&r"(tmp3), "=&r"(tmp4),
2117+ "=&r"(acc), "+r"(s)
2118+ : "r"(X) : "memory"); /* "memory": the asm reads X[] and *s, which are not listed as operands */
2119+
2120+ asm volatile ("st.w\t%1[0], %m0\n" /* store to yptr[i] */
2121+ "neg\t%m0\n"
2122+ "st.w\t%2[5*4], %m0\n" /* negated value, 5 words past &yptr[-i] */
2123+ : "+r"(acc)
2124+ : "r"(&yptr[i]), "r"(&yptr[-i]) : "memory"); /* "memory": writes y[] behind the compiler's back */
2125+
2126+ asm volatile ("mulwh.d\t%4, %m1, %m0:t\n" /* second accumulation over the same samples with the next coefficient row */
2127+ "macwh.d\t%4, %1, %m0:b\n"
2128+ "ld.w\t%m0, %5++\n"
2129+ "macwh.d\t%4, %m2, %0:t\n"
2130+ "macwh.d\t%4, %2, %0:b\n"
2131+ "macwh.d\t%4, %m3, %m0:t\n"
2132+ "macwh.d\t%4, %3, %m0:b\n"
2133+ "rol\t%4\n"
2134+ "rol\t%m4\n"
2135+ : "+r"(tmp1), "+r"(tmp2), "+r"(tmp3), "+r"(tmp4),
2136+ "=&r"(acc), "+r"(s)
2137+ : "r"(X) : "memory"); /* "memory": reads *s through a pointer operand */
2138+
2139+ asm volatile ( "st.w\t%1[6*4], %m0\n" /* same value stored 6 words past &yptr[i] and 11 words past &yptr[-i] */
2140+ "st.w\t%2[11*4], %m0\n"
2141+ :: "r"(acc), "r"(&yptr[i]), "r"(&yptr[-i]) : "memory"); /* "memory": writes y[] */
2142+
2143+ /* s has already been advanced by the post-increment loads above, so no ++s on this path */
2144+#else
2145 MAD_F_ML0(hi, lo, X[0], (*s)[0]);
2146 MAD_F_MLA(hi, lo, X[1], (*s)[1]);
2147 MAD_F_MLA(hi, lo, X[2], (*s)[2]);
2148@@ -2187,6 +2300,7 @@ void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
2149 yptr[11 - i] = yptr[i + 6];
2150
2151 ++s;
2152+#endif /* FPM_AVR32 */
2153 }
2154
2155 yptr += 12;
2156@@ -2198,6 +2312,196 @@ void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
2157 yptr = &y[0];
2158 wptr = &window_s[0];
2159
2160+#ifdef FPM_AVR32
2161+ /* z[0] = 0;
2162+ z[1] = 0;
2163+ z[2] = 0;
2164+ z[3] = 0;
2165+ z[4] = 0;
2166+ z[5] = 0;
2167+ z[30] = 0;
2168+ z[31] = 0;
2169+ z[32] = 0;
2170+ z[33] = 0;
2171+ z[34] = 0;
2172+ z[35] = 0;
2173+ */
2174+ {
2175+ register long long int tmp, tmp2, tmp3, w0123, w4567, w891011; /* w* hold the bytes at wptr[0*4], [2*4], [4*4] loaded by the second asm */
2176+ asm volatile ("mov\t%m0, 0\n"
2177+ "mov\t%0, %m0\n"
2178+ "st.d\t%1[0], %0\n"
2179+ "st.d\t%1[2*4], %0\n"
2180+ "st.d\t%1[4*4], %0\n"
2181+ "st.d\t%1[30*4], %0\n"
2182+ "st.d\t%1[32*4], %0\n"
2183+ "st.d\t%1[34*4], %0\n"
2184+ : "=&r"(tmp) : "r"(z) : "memory"); /* "memory": stores to z[] through a pointer operand */
2185+
2186+ /* Each asm statement below follows the C reference shown in the comment above it. */
2187+
2188+ /*
2189+ z[6] = mad_f_mul(yptr [0], wptr[0]);
2190+ z[7] = mad_f_mul(yptr [1], wptr[1]);
2191+ z[8] = mad_f_mul(yptr [2], wptr[2]);
2192+ z[9] = mad_f_mul(yptr [3], wptr[3]);
2193+ z[10] = mad_f_mul(yptr[4], wptr[4]);
2194+ z[11] = mad_f_mul(yptr[5], wptr[5]);
2195+ z[24] = mad_f_mul(yptr [30], wptr[6]);
2196+ z[25] = mad_f_mul(yptr [31], wptr[7]);
2197+ z[26] = mad_f_mul(yptr [32], wptr[8]);
2198+ z[27] = mad_f_mul(yptr [33], wptr[9]);
2199+ z[28] = mad_f_mul(yptr[34], wptr[10]);
2200+ z[29] = mad_f_mul(yptr[35], wptr[11]);
2201+ */
2202+
2203+
2204+ asm volatile ("ld.d\t%0, %5[0*4]\n" /* window coefficients stay in %0..%2 and are reused by the asm statements below */
2205+ "ld.d\t%3, %6[0*4]\n"
2206+ "ld.d\t%1, %5[2*4]\n"
2207+ "ld.d\t%2, %5[4*4]\n"
2208+ "mulsatrndwh.w\t%m3, %m3, %m0:t\n"
2209+ "mulsatrndwh.w\t%3, %3, %m0:b\n"
2210+ "ld.d\t%4, %6[2*4]\n"
2211+ "st.d\t%7[6*4], %3\n"
2212+
2213+ "mulsatrndwh.w\t%m4, %m4, %0:t\n"
2214+ "mulsatrndwh.w\t%4, %4, %0:b\n"
2215+ "ld.d\t%3, %6[4*4]\n"
2216+ "st.d\t%7[8*4], %4\n"
2217+
2218+ "mulsatrndwh.w\t%m3, %m3, %m1:t\n"
2219+ "mulsatrndwh.w\t%3, %3, %m1:b\n"
2220+ "ld.d\t%4, %6[30*4]\n"
2221+ "st.d\t%7[10*4], %3\n"
2222+
2223+ "mulsatrndwh.w\t%m4, %m4, %1:t\n"
2224+ "mulsatrndwh.w\t%4, %4, %1:b\n"
2225+ "ld.d\t%3, %6[32*4]\n"
2226+ "st.d\t%7[24*4], %4\n"
2227+
2228+ "mulsatrndwh.w\t%m3, %m3, %m2:t\n"
2229+ "mulsatrndwh.w\t%3, %3, %m2:b\n"
2230+ "ld.d\t%4, %6[34*4]\n"
2231+ "st.d\t%7[26*4], %3\n"
2232+
2233+ "mulsatrndwh.w\t%m4, %m4, %2:t\n"
2234+ "mulsatrndwh.w\t%4, %4, %2:b\n"
2235+ "st.d\t%7[28*4], %4\n"
2236+
2237+ : "=&r"(w0123), "=&r"(w4567), "=&r"(w891011), "=&r"(tmp), "=&r"(tmp2)
2238+ : "r"(wptr), "r"(yptr), "r"(z) : "memory"); /* "memory": loads window_s[]/y[] and stores z[] */
2239+ /*
2240+ MAD_F_ML0(hi, lo, yptr[6], wptr[6]);
2241+ MAD_F_MLA(hi, lo, yptr[12], wptr[0]);
2242+ z[12] = MAD_F_MLZ(hi, lo);
2243+ MAD_F_ML0(hi, lo, yptr[7], wptr[7]);
2244+ MAD_F_MLA(hi, lo, yptr[13], wptr[1]);
2245+ z[13] = MAD_F_MLZ(hi, lo);
2246+ MAD_F_ML0(hi, lo, yptr[8], wptr[8]);
2247+ MAD_F_MLA(hi, lo, yptr[14], wptr[2]);
2248+ z[14] = MAD_F_MLZ(hi, lo);
2249+ MAD_F_ML0(hi, lo, yptr[9], wptr[9]);
2250+ MAD_F_MLA(hi, lo, yptr[15], wptr[3]);
2251+ z[15] = MAD_F_MLZ(hi, lo);
2252+ MAD_F_ML0(hi, lo, yptr[10], wptr[10]);
2253+ MAD_F_MLA(hi, lo, yptr[16], wptr[4]);
2254+ z[16] = MAD_F_MLZ(hi, lo);
2255+ MAD_F_ML0(hi, lo, yptr[11], wptr[11]);
2256+ MAD_F_MLA(hi, lo, yptr[17], wptr[5]);
2257+ z[17] = MAD_F_MLZ(hi, lo);
2258+
2259+ MAD_F_ML0(hi, lo, yptr[18], wptr[6]);
2260+ MAD_F_MLA(hi, lo, yptr[24], wptr[0]);
2261+ z[18] = MAD_F_MLZ(hi, lo);
2262+ MAD_F_ML0(hi, lo, yptr[19], wptr[7]);
2263+ MAD_F_MLA(hi, lo, yptr[25], wptr[1]);
2264+ z[19] = MAD_F_MLZ(hi, lo);
2265+ MAD_F_ML0(hi, lo, yptr[20], wptr[8]);
2266+ MAD_F_MLA(hi, lo, yptr[26], wptr[2]);
2267+ z[20] = MAD_F_MLZ(hi, lo);
2268+ MAD_F_ML0(hi, lo, yptr[21], wptr[9]);
2269+ MAD_F_MLA(hi, lo, yptr[27], wptr[3]);
2270+ z[21] = MAD_F_MLZ(hi, lo);
2271+ MAD_F_ML0(hi, lo, yptr[22], wptr[10]);
2272+ MAD_F_MLA(hi, lo, yptr[28], wptr[4]);
2273+ z[22] = MAD_F_MLZ(hi, lo);
2274+ MAD_F_ML0(hi, lo, yptr[23], wptr[11]);
2275+ MAD_F_MLA(hi, lo, yptr[29], wptr[5]);
2276+ z[23] = MAD_F_MLZ(hi, lo);*/
2277+
2278+
2279+ asm volatile ("ld.d\t%0, %3[6*4]\n" /* stores at %6[12*4] and %6[18*4] */
2280+ "ld.d\t%1, %3[12*4]\n"
2281+ "mulwh.d\t%2, %m0, %5:t\n"
2282+ "macwh.d\t%2, %m1, %m4:t\n"
2283+ "mulwh.d\t%0, %0, %5:b\n"
2284+ "macwh.d\t%0, %1, %m4:b\n"
2285+ "lsl\t%m2, 1\n"
2286+ "lsl\t%2, %m0, 1\n"
2287+ "st.d\t%6[12*4], %2\n"
2288+
2289+ "ld.d\t%0, %3[18*4]\n"
2290+ "ld.d\t%1, %3[24*4]\n"
2291+ "mulwh.d\t%2, %m0, %5:t\n"
2292+ "macwh.d\t%2, %m1, %m4:t\n"
2293+ "mulwh.d\t%0, %0, %5:b\n"
2294+ "macwh.d\t%0, %1, %m4:b\n"
2295+ "lsl\t%m2, 1\n"
2296+ "lsl\t%2, %m0, 1\n"
2297+ "st.d\t%6[18*4], %2\n"
2298+
2299+ : "=&r"(tmp), "=&r"(tmp2), "=&r"(tmp3)
2300+ : "r"(yptr), "r"(w0123), "r"(w4567), "r"(z) : "memory"); /* "memory": loads y[] and stores z[] */
2301+
2302+ asm volatile ("ld.d\t%0, %3[8*4]\n" /* stores at %6[14*4] and %6[20*4] */
2303+ "ld.d\t%1, %3[14*4]\n"
2304+ "mulwh.d\t%2, %m0, %m5:t\n"
2305+ "macwh.d\t%2, %m1, %4:t\n"
2306+ "mulwh.d\t%0, %0, %m5:b\n"
2307+ "macwh.d\t%0, %1, %4:b\n"
2308+ "lsl\t%m2, 1\n"
2309+ "lsl\t%2, %m0, 1\n"
2310+ "st.d\t%6[14*4], %2\n"
2311+
2312+ "ld.d\t%0, %3[20*4]\n"
2313+ "ld.d\t%1, %3[26*4]\n"
2314+ "mulwh.d\t%2, %m0, %m5:t\n"
2315+ "macwh.d\t%2, %m1, %4:t\n"
2316+ "mulwh.d\t%0, %0, %m5:b\n"
2317+ "macwh.d\t%0, %1, %4:b\n"
2318+ "lsl\t%m2, 1\n"
2319+ "lsl\t%2, %m0, 1\n"
2320+ "st.d\t%6[20*4], %2\n"
2321+
2322+ : "=&r"(tmp), "=&r"(tmp2), "=&r"(tmp3)
2323+ : "r"(yptr), "r"(w0123), "r"(w891011), "r"(z) : "memory"); /* "memory": loads y[] and stores z[] */
2324+
2325+ asm volatile ("ld.d\t%0, %3[10*4]\n" /* stores at %6[16*4] and %6[22*4] */
2326+ "ld.d\t%1, %3[16*4]\n"
2327+ "mulwh.d\t%2, %m0, %5:t\n"
2328+ "macwh.d\t%2, %m1, %m4:t\n"
2329+ "mulwh.d\t%0, %0, %5:b\n"
2330+ "macwh.d\t%0, %1, %m4:b\n"
2331+ "lsl\t%m2, 1\n"
2332+ "lsl\t%2, %m0, 1\n"
2333+ "st.d\t%6[16*4], %2\n"
2334+
2335+ "ld.d\t%0, %3[22*4]\n"
2336+ "ld.d\t%1, %3[28*4]\n"
2337+ "mulwh.d\t%2, %m0, %5:t\n"
2338+ "macwh.d\t%2, %m1, %m4:t\n"
2339+ "mulwh.d\t%0, %0, %5:b\n"
2340+ "macwh.d\t%0, %1, %m4:b\n"
2341+ "lsl\t%m2, 1\n"
2342+ "lsl\t%2, %m0, 1\n"
2343+ "st.d\t%6[22*4], %2\n"
2344+
2345+ : "=&r"(tmp), "=&r"(tmp2), "=&r"(tmp3)
2346+ : "r"(yptr), "r"(w4567), "r"(w891011), "r"(z) : "memory"); /* "memory": loads y[] and stores z[] */
2347+
2348+ }
2349+#else
2350 for (i = 0; i < 6; ++i) {
2351 z[i + 0] = 0;
2352 z[i + 6] = mad_f_mul(yptr[ 0 + 0], wptr[0]);
2353@@ -2218,8 +2522,15 @@ void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
2354 ++yptr;
2355 ++wptr;
2356 }
2357+#endif /* FPM_AVR32 */
2358 }
2359
2360+#ifdef FPM_AVR32
2361+# undef mad_f_mul /* the rest of this file uses the plain-C multiply defined next */
2362+# define mad_f_mul(x, y) ((((x) + (1L << 11)) >> 12) * \
2363+ (((y) + (1L << 15)) >> 16)) /* round x down by 12 bits and y by 16 bits, then multiply */
2364+#endif /* FPM_AVR32 */
2365+
2366 /*
2367 * NAME: III_overlap()
2368 * DESCRIPTION: perform overlap-add of windowed IMDCT outputs
2369diff --git a/synth.c b/synth.c
2370index 1d28d43..f42d49b 100644
2371--- a/synth.c
2372+++ b/synth.c
2373@@ -29,20 +29,6 @@
2374 # include "frame.h"
2375 # include "synth.h"
2376
2377-/*
2378- * NAME: synth->init()
2379- * DESCRIPTION: initialize synth struct
2380- */
2381-void mad_synth_init(struct mad_synth *synth)
2382-{
2383- mad_synth_mute(synth);
2384-
2385- synth->phase = 0;
2386-
2387- synth->pcm.samplerate = 0;
2388- synth->pcm.channels = 0;
2389- synth->pcm.length = 0;
2390-}
2391
2392 /*
2393 * NAME: synth->mute()
2394@@ -88,6 +74,10 @@ void mad_synth_mute(struct mad_synth *synth)
2395
2396 /* FPM_DEFAULT without OPT_SSO will actually lose accuracy and performance */
2397
2398+# if defined(FPM_AVR32) && !defined(OPT_SSO) /* AVR32 always takes the OPT_SSO path; the guard avoids redefining an OPT_SSO already set by the build */
2399+# define OPT_SSO
2400+# endif
2401+
2402 # if defined(FPM_DEFAULT) && !defined(OPT_SSO)
2403 # define OPT_SSO
2404 # endif
2405@@ -522,9 +512,15 @@ void dct32(mad_fixed_t const in[32], unsigned int slot,
2406 # endif
2407 # define ML0(hi, lo, x, y) ((lo) = (x) * (y))
2408 # define MLA(hi, lo, x, y) ((lo) += (x) * (y))
2409-# define MLN(hi, lo) ((lo) = -(lo))
2410-# define MLZ(hi, lo) ((void) (hi), (mad_fixed_t) (lo))
2411-# define SHIFT(x) ((x) >> 2)
2412+# if defined(FPM_AVR32) /* AVR32 variants of the negate / extract / scale steps */
2413+# define MLN(hi, lo) MAD_F_MLN((hi), (lo))
2414+# define MLZ(hi, lo) (hi) /* NOTE(review): ML0/MLA above write only lo, so any C path still using MLZ here would read an unset hi — confirm */
2415+# define SHIFT(x) ((x) << 2) /* left shift, matching the `lsl \d_hi, 2` of the scale macro in synth_avr32.S */
2416+# else /* original OPT_SSO definitions, unchanged */
2417+# define MLN(hi, lo) ((lo) = -(lo))
2418+# define MLZ(hi, lo) ((void) (hi), (mad_fixed_t) (lo))
2419+# define SHIFT(x) ((x) >> 2)
2420+# endif /* FPM_AVR32 */
2421 # define PRESHIFT(x) ((MAD_F(x) + (1L << 13)) >> 14)
2422 # else
2423 # define ML0(hi, lo, x, y) MAD_F_ML0((hi), (lo), (x), (y))
2424@@ -541,11 +537,54 @@ void dct32(mad_fixed_t const in[32], unsigned int slot,
2425 # endif
2426 # endif
2427
2428+/*
2429+ * NAME: synth->init()
2430+ * DESCRIPTION: initialize synth struct (on AVR32, also fill the Dmod[] table from D[])
2431+ */
2432+
2433+#ifdef FPM_AVR32
2434+short Dmod[17][33]; /* D[][] regrouped per row by mad_synth_init(); passed to synth_avr32() by synth_full() */
2435+#endif
2436+
2437 static
2438+#ifdef FPM_AVR32
2439+short const D[17][32] = {
2440+#else
2441 mad_fixed_t const D[17][32] = {
2442+#endif
2443 # include "D.dat"
2444 };
2445
2446+void mad_synth_init(struct mad_synth *synth)
2447+{
2448+ /* reset filter state and PCM bookkeeping */
2449+ mad_synth_mute(synth);
2450+
2451+ synth->phase = 0;
2452+
2453+ synth->pcm.samplerate = 0;
2454+ synth->pcm.channels = 0;
2455+ synth->pcm.length = 0;
2456+
2457+#ifdef FPM_AVR32
2458+ { /* de-interleave each row of D[] into Dmod[]; redone on every call */
2459+ int i, j;
2460+ for ( i = 0; i < 17; i++ ){
2461+ for ( j = 0; j < 32; j++ ){
2462+ if ( j & 1 ){
2463+ Dmod[i][17 + (j >> 1)]= D[i][j]; /* odd columns land in Dmod[i][17..32] */
2464+ } else {
2465+ Dmod[i][(j >> 1)]= D[i][j]; /* even columns land in Dmod[i][0..15] */
2466+ }
2467+ }
2468+ /* slot 16 is not written by the loop above; it receives a copy of slot 24 (= D[i][15]) */
2469+ Dmod[i][16]= Dmod[i][16+8];
2470+ }
2471+ }
2472+#endif /* FPM_AVR32 */
2473+
2474+}
2475+
2476 # if defined(ASO_SYNTH)
2477 void synth_full(struct mad_synth *, struct mad_frame const *,
2478 unsigned int, unsigned int);
2479@@ -560,9 +599,13 @@ void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
2480 {
2481 unsigned int phase, ch, s, sb, pe, po;
2482 mad_fixed_t *pcm1, *pcm2, (*filter)[2][2][16][8];
2483- mad_fixed_t const (*sbsample)[36][32];
2484+ mad_fixed_t /*const*/ (*sbsample)[36][32];
2485 register mad_fixed_t (*fe)[8], (*fx)[8], (*fo)[8];
2486+#ifdef FPM_AVR32 /* D[] elements are short on AVR32, so the row/element pointers follow */
2487+ register short const (*Dptr)[32], *ptr;
2488+#else
2489 register mad_fixed_t const (*Dptr)[32], *ptr;
2490+#endif /* FPM_AVR32 */
2491 register mad_fixed64hi_t hi;
2492 register mad_fixed64lo_t lo;
2493
2494@@ -573,6 +616,20 @@ void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
2495 pcm1 = synth->pcm.samples[ch];
2496
2497 for (s = 0; s < ns; ++s) {
2498+# ifdef FPM_AVR32 /* AVR32: DCT and windowing are done by dct32_avr32() and synth_avr32() */
2499+/*
2500+ int i;
2501+ for ( i = 0; i < 32; i++ ){
2502+ (*sbsample)[s][i] = ((*sbsample)[s][i] + (1 << 13)) & 0xFFFFC000;
2503+ }
2504+*/
2505+ dct32_avr32((*sbsample)[s], phase >> 1,
2506+ (*filter)[0][phase & 1], (*filter)[1][phase & 1]);
2507+ /* printf("dct32: %d\n", GET_CYCLES);*/
2508+ pcm1 = synth_avr32(phase, (mad_fixed_t *)filter, \
2509+ pcm1, (short *)&Dmod[0]); /* result replaces pcm1; NOTE(review): no prototype is visible in this hunk — confirm one is declared */
2510+ /* printf("synth_window: %d\n", GET_CYCLES);*/
2511+# else
2512 dct32((*sbsample)[s], phase >> 1,
2513 (*filter)[0][phase & 1], (*filter)[1][phase & 1]);
2514
2515@@ -679,6 +736,7 @@ void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
2516 MLA(hi, lo, (*fo)[7], ptr[ 2]);
2517
2518 *pcm1 = SHIFT(-MLZ(hi, lo));
2519+# endif /* FPM_AVR32 */
2520 pcm1 += 16;
2521
2522 phase = (phase + 1) % 16;
2523diff --git a/synth_avr32.S b/synth_avr32.S
2524new file mode 100644
2525index 0000000..701077b
2526--- /dev/null
2527+++ b/synth_avr32.S
2528@@ -0,0 +1,394 @@
2529+/*
2530+ Optimized function for speeding up synthesis filter
2531+ in MPEG Audio Decoding.
2532+ Copyright 2003-2006 Atmel Corporation.
2533+
2534+ Written by Ronny Pedersen and Lars Even Almås, Atmel Norway
2535+
2536+ This program is free software; you can redistribute it and/or modify
2537+ it under the terms of the GNU General Public License as published by
2538+ the Free Software Foundation; either version 2 of the License, or
2539+ (at your option) any later version.
2540+
2541+ This program is distributed in the hope that it will be useful,
2542+ but WITHOUT ANY WARRANTY; without even the implied warranty of
2543+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2544+ GNU General Public License for more details.
2545+
2546+ You should have received a copy of the GNU General Public License
2547+ along with this program; if not, write to the Free Software
2548+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
2549+
2550+
2551+/* *****************
2552+ Defining macros
2553+ ***************** */
2554+ /* window_1: eight multiply-accumulates of f[0..7] with ptr[ptr_offset + 0..7] into acc, paired as the per-line comments show; mul=1 starts a new sum, mul=0 adds to acc */
2555+ .macro window_1 f, ptr, acc, ptr_offset, mul, tmp1_lo, tmp1_hi, tmp2_lo, tmp2_hi, tmp3_lo, tmp3_hi
2556+ ld.d \tmp1_lo, \f[0*4] /* tmp1 = { f[0], f[1] } */
2557+ ld.w \tmp2_lo, \ptr[0*2+\ptr_offset*2] /* tmp2_lo = { ptr[0], ptr[1] }*/
2558+ ld.d \tmp3_lo, \f[6*4] /* tmp3 = { f[6], f[7] } */
2559+ ld.w \tmp2_hi, \ptr[6*2+\ptr_offset*2] /* tmp2_hi = { ptr[6], ptr[7] }*/
2560+ .if \mul /* first product overwrites acc ... */
2561+ mulwh.d \acc, \tmp1_hi, \tmp2_lo:t /* f[0] * ptr[0]*/
2562+ .else /* ... or is added to the caller's running sum */
2563+ macwh.d \acc, \tmp1_hi, \tmp2_lo:t /* f[0] * ptr[0]*/
2564+ .endif
2565+ macwh.d \acc, \tmp3_lo, \tmp2_lo:b /* f[7] * ptr[1]*/
2566+ ld.w \tmp2_lo, \ptr[2*2+\ptr_offset*2] /* tmp2_lo = { ptr[2], ptr[3] }*/
2567+ macwh.d \acc, \tmp1_lo, \tmp2_hi:b /* f[1] * ptr[7]*/
2568+ ld.d \tmp1_lo, \f[2*4] /* tmp1 = { f[2], f[3] } */
2569+
2570+ macwh.d \acc, \tmp3_hi, \tmp2_lo:t /* f[6] * ptr[2]*/
2571+ macwh.d \acc, \tmp1_hi, \tmp2_hi:t /* f[2] * ptr[6]*/
2572+ ld.d \tmp3_lo, \f[4*4] /* tmp3 = { f[4], f[5] } */
2573+ ld.w \tmp2_hi, \ptr[4*2+\ptr_offset*2] /* tmp2_hi = { ptr[4], ptr[5] }*/
2574+ macwh.d \acc, \tmp3_lo, \tmp2_lo:b /* f[5] * ptr[3]*/
2575+
2576+ macwh.d \acc, \tmp1_lo, \tmp2_hi:b /* f[3] * ptr[5]*/
2577+ macwh.d \acc, \tmp3_hi, \tmp2_hi:t /* f[4] * ptr[4]*/
2578+ .endm
2579+ /* window_2: eight multiply-accumulates of f[0..7] with ptr[ptr_offset + 7..14], in order, into acc; mul=1 starts a new sum, mul=0 adds to acc */
2580+ .macro window_2 f, ptr, acc, ptr_offset, mul, tmp1_lo, tmp1_hi, tmp2_lo, tmp2_hi, tmp3_lo, tmp3_hi
2581+ ld.d \tmp1_lo, \f[0*4] /* tmp1 = { f[0], f[1] } */
2582+ ld.w \tmp2_lo, \ptr[7*2+\ptr_offset*2] /* tmp2_lo = { ptr[7], ptr[8] }*/
2583+ ld.d \tmp3_lo, \f[2*4] /* tmp3 = { f[2], f[3] } */
2584+ ld.w \tmp2_hi, \ptr[9*2+\ptr_offset*2] /* tmp2_hi = { ptr[9], ptr[10] }*/
2585+ .if \mul /* first product overwrites acc ... */
2586+ mulwh.d \acc, \tmp1_hi, \tmp2_lo:t /* f[0] * ptr[7]*/
2587+ .else /* ... or is added to the caller's running sum */
2588+ macwh.d \acc, \tmp1_hi, \tmp2_lo:t /* f[0] * ptr[7]*/
2589+ .endif
2590+ macwh.d \acc, \tmp1_lo, \tmp2_lo:b /* f[1] * ptr[8]*/
2591+
2592+ ld.d \tmp1_lo, \f[4*4] /* tmp1 = { f[4], f[5] } */
2593+ ld.w \tmp2_lo, \ptr[11*2+\ptr_offset*2] /* tmp2_lo = { ptr[11], ptr[12] }*/
2594+
2595+ macwh.d \acc, \tmp3_hi, \tmp2_hi:t /* f[2] * ptr[9]*/
2596+ macwh.d \acc, \tmp3_lo, \tmp2_hi:b /* f[3] * ptr[10]*/
2597+
2598+ ld.d \tmp3_lo, \f[6*4] /* tmp3 = { f[6], f[7] } */
2599+ ld.w \tmp2_hi, \ptr[13*2+\ptr_offset*2] /* tmp2_hi = { ptr[13], ptr[14] }*/
2600+
2601+ macwh.d \acc, \tmp1_hi, \tmp2_lo:t /* f[4] * ptr[11]*/
2602+ macwh.d \acc, \tmp1_lo, \tmp2_lo:b /* f[5] * ptr[12]*/
2603+ macwh.d \acc, \tmp3_hi, \tmp2_hi:t /* f[6] * ptr[13]*/
2604+ macwh.d \acc, \tmp3_lo, \tmp2_hi:b /* f[7] * ptr[14]*/
2605+ .endm
2606+ /* scale: shifts d_hi left by 2 in place; res and d_lo are accepted but unused, and every caller passes the same register as res and d_hi */
2607+ .macro scale res, d_lo, d_hi
2608+ lsl \d_hi, 2
2609+ .endm
2610+
2611+/* **********************
2612+ Starting main function
2613+ ********************** */
2614+
2615+/* Function synth_avr32 is called from synth.c with arguments:
2616+ phase, filter, pcm1, &Dmod[0]; the advanced pcm1 pointer is returned */
2617+ /* synth_avr32: writes the 32 windowed PCM samples for one phase and returns pcm1 advanced by 16 words */
2618+ .global synth_avr32
2619+synth_avr32:
2620+ pushm r0-r7, lr /* saved here, restored (lr into pc) by the popm at synth_end */
2621+ sub sp, 8 /* two stack words: sp[0] = loop counter, sp[4] = pe/po offset */
2622+
2623+ /* R12 = phase, R11 = filter, R10 = pcm1, r9 = D*/
2624+ bld r12, 0 /* presumably copies bit 0 of phase into the carry flag — confirm */
2625+ brcc synth_even /* even phases take the second code path */
2626+
2627+ /* Filter for odd phases */
2628+
2629+ /* fe = &(*filter)[0][1][0];
2630+ fx = &(*filter)[0][0][0];
2631+ fo = &(*filter)[1][0][0]; */
2632+ sub lr /*fe*/, r11, -16*8*4
2633+ sub r8 /*fo*/, r11, -16*8*4*2
2634+
2635+ /* pe = phase >> 1; */
2636+ lsr r12, 1
2637+ stdsp sp[4], r12 /* keep pe for the ptr adjustments inside the loop */
2638+ /* ptr = (short const *)Dmod + pe; */
2639+ add r12, r9, r12 << 1
2640+
2641+ /* ML0(hi, lo, (*fx)[0], ptr[0 + 17]);
2642+ MLA(hi, lo, (*fx)[1], ptr[7 + 17]);
2643+ MLA(hi, lo, (*fx)[2], ptr[6 + 17]);
2644+ MLA(hi, lo, (*fx)[3], ptr[5 + 17]);
2645+ MLA(hi, lo, (*fx)[4], ptr[4 + 17]);
2646+ MLA(hi, lo, (*fx)[5], ptr[3 + 17]);
2647+ MLA(hi, lo, (*fx)[6], ptr[2 + 17]);
2648+ MLA(hi, lo, (*fx)[7], ptr[1 + 17]); */
2649+ window_1 r11/*fx*/,r12/*ptr*/,r0/*acc*/,17/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
2650+
2651+ /* MLN(hi, lo); */
2652+ neg r0
2653+ acr r1
2654+ neg r1
2655+
2656+ /* MLA(hi, lo, (*fe)[0], ptr[0]);
2657+ MLA(hi, lo, (*fe)[1], ptr[7]);
2658+ MLA(hi, lo, (*fe)[2], ptr[6]);
2659+ MLA(hi, lo, (*fe)[3], ptr[5]);
2660+ MLA(hi, lo, (*fe)[4], ptr[4]);
2661+ MLA(hi, lo, (*fe)[5], ptr[3]);
2662+ MLA(hi, lo, (*fe)[6], ptr[2]);
2663+ MLA(hi, lo, (*fe)[7], ptr[1]); */
2664+ window_1 lr/*fe*/,r12/*ptr*/,r0/*acc*/,0/*off*/,0/*mac*/,r2,r3,r4,r5,r6,r7
2665+
2666+ /* *pcm1++ = SHIFT(MLZ(hi, lo));
2667+
2668+ pcm2 = pcm1 + 31; */
2669+ scale r1, r0, r1
2670+ st.w r10/*pcm_1*/++, r1
2671+ sub r11/*pcm2*/, r10, -4*31 /* r11 is reused for pcm2: fx is not referenced after the first window */
2672+
2673+ /* for (sb = 1; sb < 16; ++sb) { */
2674+ mov r2, 15
2675+ stdsp sp[0], r2 /* loop counter is kept on the stack; r2 is reused as a temporary */
2676+odd_loop:
2677+ /* ++fe;
2678+ ptr += 33; */
2679+ sub lr /*fe*/, -8*4
2680+ sub r12, -33*2
2681+
2682+ /* ML0(hi, lo, (*fo)[0], ptr[0 + 17]);
2683+ MLA(hi, lo, (*fo)[1], ptr[7 + 17]);
2684+ MLA(hi, lo, (*fo)[2], ptr[6 + 17]);
2685+ MLA(hi, lo, (*fo)[3], ptr[5 + 17]);
2686+ MLA(hi, lo, (*fo)[4], ptr[4 + 17]);
2687+ MLA(hi, lo, (*fo)[5], ptr[3 + 17]);
2688+ MLA(hi, lo, (*fo)[6], ptr[2 + 17]);
2689+ MLA(hi, lo, (*fo)[7], ptr[1 + 17]); */
2690+ window_1 r8/*fo*/,r12/*ptr*/,r0/*acc*/,17/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
2691+ /* MLN(hi, lo); */
2692+
2693+ neg r0
2694+ acr r1
2695+ neg r1
2696+
2697+ /* MLA(hi, lo, (*fe)[7], ptr[1]);
2698+ MLA(hi, lo, (*fe)[6], ptr[2]);
2699+ MLA(hi, lo, (*fe)[5], ptr[3]);
2700+ MLA(hi, lo, (*fe)[4], ptr[4]);
2701+ MLA(hi, lo, (*fe)[3], ptr[5]);
2702+ MLA(hi, lo, (*fe)[2], ptr[6]);
2703+ MLA(hi, lo, (*fe)[1], ptr[7]);
2704+ MLA(hi, lo, (*fe)[0], ptr[0]); */
2705+ window_1 lr/*fe*/,r12/*ptr*/,r0/*acc*/,0/*off*/,0/*mac*/,r2,r3,r4,r5,r6,r7
2706+
2707+ /* ptr -= 2*pe; */
2708+ lddsp r2, sp[4]
2709+
2710+ /* *pcm1++ = SHIFT(MLZ(hi, lo)); */
2711+
2712+ scale r1, r0, r1
2713+ sub r12/*ptr*/, r12, r2/*pe*/<< 2
2714+ st.w r10/*pcm_1*/++, r1
2715+
2716+
2717+ /* ML0(hi, lo, (*fe)[0], ptr[7 + 17]);
2718+ MLA(hi, lo, (*fe)[1], ptr[8 + 17]);
2719+ MLA(hi, lo, (*fe)[2], ptr[9 + 17]);
2720+ MLA(hi, lo, (*fe)[3], ptr[10 + 17]);
2721+ MLA(hi, lo, (*fe)[4], ptr[11 + 17]);
2722+ MLA(hi, lo, (*fe)[5], ptr[12 + 17]);
2723+ MLA(hi, lo, (*fe)[6], ptr[13 + 17]);
2724+ MLA(hi, lo, (*fe)[7], ptr[14 + 17]); */
2725+ window_2 lr/*fe*/,r12/*ptr*/,r0/*acc*/,17/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
2726+ /* MLA(hi, lo, (*fo)[7], ptr[14]);
2727+ MLA(hi, lo, (*fo)[6], ptr[13]);
2728+ MLA(hi, lo, (*fo)[5], ptr[12]);
2729+ MLA(hi, lo, (*fo)[4], ptr[11]);
2730+ MLA(hi, lo, (*fo)[3], ptr[10]);
2731+ MLA(hi, lo, (*fo)[2], ptr[9]);
2732+ MLA(hi, lo, (*fo)[1], ptr[8]);
2733+ MLA(hi, lo, (*fo)[0], ptr[7]); */
2734+ window_2 r8/*fo*/,r12/*ptr*/,r0/*acc*/,0/*off*/,0/*mac*/,r2,r3,r4,r5,r6,r7
2735+
2736+
2737+ /* *pcm2-- = SHIFT(MLZ(hi, lo)); */
2738+ lddsp r3, sp[4]
2739+ lddsp r2, sp[0]
2740+ scale r1, r0, r1
2741+ st.w --r11/*pcm_2*/, r1
2742+
2743+ /* ptr += 2*pe; */
2744+ add r12/*ptr*/, r12, r3/*pe*/<< 2
2745+
2746+ /* ++fo;
2747+ } */
2748+ sub r8/*fo*/, -8*4
2749+
2750+ sub r2, 1
2751+ stdsp sp[0], r2 /* NOTE(review): the brne below relies on stdsp leaving the flags of the sub untouched — confirm */
2752+ brne odd_loop
2753+
2754+ /* ptr += 33; */
2755+ sub r12/*ptr*/, -33*2
2756+
2757+ /* ML0(hi, lo, (*fo)[0], ptr[0 + 17]);
2758+ MLA(hi, lo, (*fo)[1], ptr[7 + 17]);
2759+ MLA(hi, lo, (*fo)[2], ptr[6 + 17]);
2760+ MLA(hi, lo, (*fo)[3], ptr[5 + 17]);
2761+ MLA(hi, lo, (*fo)[4], ptr[4 + 17]);
2762+ MLA(hi, lo, (*fo)[5], ptr[3 + 17]);
2763+ MLA(hi, lo, (*fo)[6], ptr[2 + 17]);
2764+ MLA(hi, lo, (*fo)[7], ptr[1 + 17]); */
2765+ window_1 r8/*fo*/,r12/*ptr*/,r0/*acc*/,17/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
2766+
2767+ rjmp synth_end
2768+synth_even:
2769+ /* Filter for even phases */
2770+
2771+ /* fe = &(*filter)[0][0][0];
2772+ fx = &(*filter)[0][1][0];
2773+ fo = &(*filter)[1][1][0]; */
2774+ sub lr /*fx*/, r11, -16*8*4
2775+ sub r8 /*fo*/, r11, -(16*8*4*2 + 16*8*4)
2776+
2777+ /* po = ((phase - 1) & 0xF) >> 1; */
2778+ sub r12, 1
2779+ andl r12, 0xe, COH /* leaves po * 2, a byte offset; presumably COH clears the upper halfword — confirm */
2780+ stdsp sp[4], r12 /* stored pre-scaled, hence the << 1 (not << 2) adjustments in even_loop */
2781+ /* ptr = (short const *)Dmod + po; */
2782+ add r12, r9, r12
2783+
2784+ /* ML0(hi, lo, (*fx)[0], ptr[0 + 17]);
2785+ MLA(hi, lo, (*fx)[1], ptr[7 + 17]);
2786+ MLA(hi, lo, (*fx)[2], ptr[6 + 17]);
2787+ MLA(hi, lo, (*fx)[3], ptr[5 + 17]);
2788+ MLA(hi, lo, (*fx)[4], ptr[4 + 17]);
2789+ MLA(hi, lo, (*fx)[5], ptr[3 + 17]);
2790+ MLA(hi, lo, (*fx)[6], ptr[2 + 17]);
2791+ MLA(hi, lo, (*fx)[7], ptr[1 + 17]); */
2792+ window_1 lr/*fx*/,r12/*ptr*/,r0/*acc*/,17/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
2793+
2794+ /* MLN(hi, lo); */
2795+ neg r0
2796+ acr r1
2797+ neg r1
2798+
2799+ /* MLA(hi, lo, (*fe)[0], ptr[0 + 1]);
2800+ MLA(hi, lo, (*fe)[1], ptr[7 + 1]);
2801+ MLA(hi, lo, (*fe)[2], ptr[6 + 1]);
2802+ MLA(hi, lo, (*fe)[3], ptr[5 + 1]);
2803+ MLA(hi, lo, (*fe)[4], ptr[4 + 1]);
2804+ MLA(hi, lo, (*fe)[5], ptr[3 + 1]);
2805+ MLA(hi, lo, (*fe)[6], ptr[2 + 1]);
2806+ MLA(hi, lo, (*fe)[7], ptr[1 + 1]); */
2807+ window_1 r11/*fe*/,r12/*ptr*/,r0/*acc*/,1/*off*/,0/*mac*/,r2,r3,r4,r5,r6,r7
2808+
2809+ /* *pcm1++ = SHIFT(MLZ(hi, lo));
2810+
2811+ pcm2 = pcm1 + 31; */
2812+ scale r1, r0, r1
2813+ st.w r10/*pcm_1*/++, r1
2814+ sub lr/*pcm2*/, r10, -4*31 /* lr is reused for pcm2: fx is not referenced after the first window */
2815+
2816+ /* for (sb = 1; sb < 16; ++sb) { */
2817+ mov r2, 15
2818+ stdsp sp[0], r2 /* loop counter is kept on the stack; r2 is reused as a temporary */
2819+even_loop:
2820+ /* ++fe;
2821+ ptr += 33; */
2822+ sub r11 /*fe*/, -8*4
2823+ sub r12, -33*2
2824+
2825+ /* ML0(hi, lo, (*fo)[0], ptr[0 + 17]);
2826+ MLA(hi, lo, (*fo)[1], ptr[7 + 17]);
2827+ MLA(hi, lo, (*fo)[2], ptr[6 + 17]);
2828+ MLA(hi, lo, (*fo)[3], ptr[5 + 17]);
2829+ MLA(hi, lo, (*fo)[4], ptr[4 + 17]);
2830+ MLA(hi, lo, (*fo)[5], ptr[3 + 17]);
2831+ MLA(hi, lo, (*fo)[6], ptr[2 + 17]);
2832+ MLA(hi, lo, (*fo)[7], ptr[1 + 17]); */
2833+ window_1 r8/*fo*/,r12/*ptr*/,r0/*acc*/,17/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
2834+ /* MLN(hi, lo); */
2835+ neg r0
2836+ acr r1
2837+ neg r1
2838+
2839+ /* MLA(hi, lo, (*fe)[7], ptr[1 + 1]);
2840+ MLA(hi, lo, (*fe)[6], ptr[2 + 1]);
2841+ MLA(hi, lo, (*fe)[5], ptr[3 + 1]);
2842+ MLA(hi, lo, (*fe)[4], ptr[4 + 1]);
2843+ MLA(hi, lo, (*fe)[3], ptr[5 + 1]);
2844+ MLA(hi, lo, (*fe)[2], ptr[6 + 1]);
2845+ MLA(hi, lo, (*fe)[1], ptr[7 + 1]);
2846+ MLA(hi, lo, (*fe)[0], ptr[0 + 1]); */
2847+ window_1 r11/*fe*/,r12/*ptr*/,r0/*acc*/,1/*off*/,0/*mac*/,r2,r3,r4,r5,r6,r7
2848+
2849+ /* *pcm1++ = SHIFT(MLZ(hi, lo)); */
2850+ lddsp r2, sp[4]
2851+ scale r1, r0, r1
2852+ /* ptr -= 2*po; */
2853+ sub r12/*ptr*/, r12, r2/*po*/<< 1
2854+ st.w r10/*pcm_1*/++, r1
2855+
2856+
2857+ /* ML0(hi, lo, (*fe)[0], ptr[7 + 17 - 1]);
2858+ MLA(hi, lo, (*fe)[1], ptr[8 + 17 - 1]);
2859+ MLA(hi, lo, (*fe)[2], ptr[9 + 17 - 1]);
2860+ MLA(hi, lo, (*fe)[3], ptr[10 + 17 - 1]);
2861+ MLA(hi, lo, (*fe)[4], ptr[11 + 17 - 1]);
2862+ MLA(hi, lo, (*fe)[5], ptr[12 + 17 - 1]);
2863+ MLA(hi, lo, (*fe)[6], ptr[13 + 17 - 1]);
2864+ MLA(hi, lo, (*fe)[7], ptr[14 + 17 - 1]); */
2865+ window_2 r11/*fe*/,r12/*ptr*/,r0/*acc*/,16/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
2866+ /* MLA(hi, lo, (*fo)[7], ptr[14]);
2867+ MLA(hi, lo, (*fo)[6], ptr[13]);
2868+ MLA(hi, lo, (*fo)[5], ptr[12]);
2869+ MLA(hi, lo, (*fo)[4], ptr[11]);
2870+ MLA(hi, lo, (*fo)[3], ptr[10]);
2871+ MLA(hi, lo, (*fo)[2], ptr[9]);
2872+ MLA(hi, lo, (*fo)[1], ptr[8]);
2873+ MLA(hi, lo, (*fo)[0], ptr[7]); */
2874+ window_2 r8/*fo*/,r12/*ptr*/,r0/*acc*/,0/*off*/,0/*mac*/,r2,r3,r4,r5,r6,r7
2875+
2876+
2877+ /* *pcm2-- = SHIFT(MLZ(hi, lo)); */
2878+ lddsp r3, sp[4]
2879+ lddsp r2, sp[0]
2880+ scale r1, r0, r1
2881+ st.w --lr/*pcm_2*/, r1
2882+
2883+ /* ptr += 2*po; */
2884+ add r12/*ptr*/, r12, r3/*po*/<< 1
2885+
2886+ /* ++fo;
2887+ } */
2888+ sub r8/*fo*/, -8*4
2889+
2890+ sub r2, 1
2891+ stdsp sp[0], r2 /* NOTE(review): the brne below relies on stdsp leaving the flags of the sub untouched — confirm */
2892+ brne even_loop
2893+
2894+ /* ptr += 33; */
2895+ sub r12/*ptr*/, -33*2
2896+
2897+ /* ML0(hi, lo, (*fo)[0], ptr[0 + 17]);
2898+ MLA(hi, lo, (*fo)[1], ptr[7 + 17]);
2899+ MLA(hi, lo, (*fo)[2], ptr[6 + 17]);
2900+ MLA(hi, lo, (*fo)[3], ptr[5 + 17]);
2901+ MLA(hi, lo, (*fo)[4], ptr[4 + 17]);
2902+ MLA(hi, lo, (*fo)[5], ptr[3 + 17]);
2903+ MLA(hi, lo, (*fo)[6], ptr[2 + 17]);
2904+ MLA(hi, lo, (*fo)[7], ptr[1 + 17]); */
2905+ window_1 r8/*fo*/,r12/*ptr*/,r0/*acc*/,17/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
2906+
2907+
2908+
2909+synth_end:
2910+ /* *pcm1 = SHIFT(-MLZ(hi, lo)); */
2911+ scale r1, r0, r1
2912+ neg r1
2913+ st.w r10/*pcm_1*/, r1
2914+
2915+ mov r12, r10 /* hand the advanced pcm1 back; the C caller assigns the result to pcm1 */
2916+ sub sp, -8 /* release the two scratch words */
2917+ popm r0-r7, pc
2918+
2919+
2920+
2921+
2922+