summaryrefslogtreecommitdiffstats
path: root/meta-oe/recipes-multimedia
diff options
context:
space:
mode:
author Paul Eggleton <paul.eggleton@linux.intel.com> 2013-04-06 00:35:40 +0100
committer Martin Jansa <Martin.Jansa@gmail.com> 2013-04-09 21:34:17 +0200
commit 23c711ed619a0362167720fe92fcc4dd02a8caea (patch)
tree 7dacf6cce64f2e205e0dfacaf204537f8f22dc37 /meta-oe/recipes-multimedia
parent d5de18c7e9816d3428050f4e3ec6590259e6af00 (diff)
download meta-openembedded-23c711ed619a0362167720fe92fcc4dd02a8caea.tar.gz
libmad: remove
This is largely equivalent to the recipe in OE-Core apart from LICENSE_FLAGS, insignificant patch differences, and an additional patch for avr32 optimisation (and since there appears to be no public layer for an avr32 machine, there's not a great deal of point in preserving the latter).

Signed-off-by: Paul Eggleton <paul.eggleton@linux.intel.com>
Signed-off-by: Martin Jansa <Martin.Jansa@gmail.com>
Diffstat (limited to 'meta-oe/recipes-multimedia')
-rw-r--r-- meta-oe/recipes-multimedia/libmad/files/add-pkgconfig.patch 68
-rw-r--r-- meta-oe/recipes-multimedia/libmad/files/libmad-0.15.1b-avr32-optimization.patch 2922
-rw-r--r-- meta-oe/recipes-multimedia/libmad/files/mad-mips-h-constraint.patch 70
-rw-r--r-- meta-oe/recipes-multimedia/libmad/files/mad.diff 24
-rw-r--r-- meta-oe/recipes-multimedia/libmad/libmad-0.15.1b/obsolete_automake_macros.patch 14
-rw-r--r-- meta-oe/recipes-multimedia/libmad/libmad_0.15.1b.bb 38
6 files changed, 0 insertions, 3136 deletions
diff --git a/meta-oe/recipes-multimedia/libmad/files/add-pkgconfig.patch b/meta-oe/recipes-multimedia/libmad/files/add-pkgconfig.patch
deleted file mode 100644
index 636b27a92..000000000
--- a/meta-oe/recipes-multimedia/libmad/files/add-pkgconfig.patch
+++ /dev/null
@@ -1,68 +0,0 @@
1Here is a patch for adding pkg-config support to libmad.
2It would make life a bit easier for distro maintainers if this was applied.
3In case you didn't know, pkg-config is a tool for providing LDFLAGS and
4CFLAGS for packages using shared libraries. It's on freedesktop.org.
5Debian has already been distributing the pkg-config file mad.pc with
6libmad for some time, and people developing on debian (notably xmms2
7developers) have started relying on this support being present, causing
8some confusion for people installing from source and on some BSDs which
9do not provide mad.pc (google: pkgconfig libmad).
10
11EMH
12
13--h31gzZEtNLTqOjlF
14Content-Type: text/plain; charset=us-ascii
15Content-Disposition: attachment; filename="libmad-0.15.1b-pkgconfig.patch"
16
17diff -Naur libmad-0.15.1b.old/configure.ac libmad-0.15.1b/configure.ac
18--- libmad-0.15.1b.old/configure.ac 2004-01-23 10:41:32.000000000 +0100
19+++ libmad-0.15.1b/configure.ac 2004-08-07 02:25:24.633462168 +0200
20@@ -429,5 +429,5 @@
21 dnl AC_SUBST(LTLIBOBJS)
22
23 AC_CONFIG_FILES([Makefile msvc++/Makefile \
24- libmad.list])
25+ libmad.list mad.pc])
26 AC_OUTPUT
27diff -Naur libmad-0.15.1b.old/mad.pc.in libmad-0.15.1b/mad.pc.in
28--- libmad-0.15.1b.old/mad.pc.in 1970-01-01 01:00:00.000000000 +0100
29+++ libmad-0.15.1b/mad.pc.in 2004-08-07 02:04:59.617692872 +0200
30@@ -0,0 +1,14 @@
31+# libmad pkg-config source file
32+
33+prefix=@prefix@
34+exec_prefix=@exec_prefix@
35+libdir=@libdir@
36+includedir=@includedir@
37+
38+Name: mad
39+Description: MPEG Audio Decoder
40+Version: @VERSION@
41+Requires:
42+Conflicts:
43+Libs: -L${libdir} -lmad -lm
44+Cflags: -I${includedir}
45diff -Naur libmad-0.15.1b.old/Makefile.am libmad-0.15.1b/Makefile.am
46--- libmad-0.15.1b.old/Makefile.am 2004-02-17 03:02:03.000000000 +0100
47+++ libmad-0.15.1b/Makefile.am 2004-08-07 02:03:19.859858368 +0200
48@@ -24,6 +24,9 @@
49 SUBDIRS =
50 DIST_SUBDIRS = msvc++
51
52+pkgconfigdir = $(libdir)/pkgconfig
53+pkgconfig_DATA = mad.pc
54+
55 lib_LTLIBRARIES = libmad.la
56 include_HEADERS = mad.h
57
58@@ -34,7 +37,8 @@
59 minimad_LDADD = libmad.la
60
61 EXTRA_DIST = mad.h.sed \
62- CHANGES COPYRIGHT CREDITS README TODO VERSION
63+ CHANGES COPYRIGHT CREDITS README TODO VERSION \
64+ mad.pc.in
65
66 exported_headers = version.h fixed.h bit.h timer.h stream.h frame.h \
67 synth.h decoder.h
68
diff --git a/meta-oe/recipes-multimedia/libmad/files/libmad-0.15.1b-avr32-optimization.patch b/meta-oe/recipes-multimedia/libmad/files/libmad-0.15.1b-avr32-optimization.patch
deleted file mode 100644
index b74eea322..000000000
--- a/meta-oe/recipes-multimedia/libmad/files/libmad-0.15.1b-avr32-optimization.patch
+++ /dev/null
@@ -1,2922 +0,0 @@
1diff --git a/bit.c b/bit.c
2index c2bfb24..262ce3a 100644
3--- a/bit.c
4+++ b/bit.c
5@@ -25,12 +25,6 @@
6
7 # include "global.h"
8
9-# ifdef HAVE_LIMITS_H
10-# include <limits.h>
11-# else
12-# define CHAR_BIT 8
13-# endif
14-
15 # include "bit.h"
16
17 /*
18@@ -81,6 +75,8 @@ unsigned short const crc_table[256] = {
19
20 # define CRC_POLY 0x8005
21
22+#ifndef FPM_AVR32
23+
24 /*
25 * NAME: bit->init()
26 * DESCRIPTION: initialize bit pointer struct
27@@ -190,6 +186,8 @@ void mad_bit_write(struct mad_bitptr *bitptr, unsigned int len,
28 }
29 # endif
30
31+#endif
32+
33 /*
34 * NAME: bit->crc()
35 * DESCRIPTION: compute CRC-check word
36diff --git a/bit.h b/bit.h
37index 5a51570..70f550a 100644
38--- a/bit.h
39+++ b/bit.h
40@@ -22,6 +22,92 @@
41 # ifndef LIBMAD_BIT_H
42 # define LIBMAD_BIT_H
43
44+# ifdef HAVE_LIMITS_H
45+# include <limits.h>
46+# else
47+# define CHAR_BIT 8
48+# endif
49+
50+#ifdef FPM_AVR32
51+
52+struct mad_bitptr {
53+ unsigned char const *byte;
54+ unsigned int read_bytes;
55+};
56+
57+/*
58+ * NAME: bit->init()
59+ * DESCRIPTION: initialize bit pointer struct
60+ */
61+static void mad_bit_init(struct mad_bitptr *bitptr, unsigned char const *byte)
62+{
63+ bitptr->byte = byte;
64+ bitptr->read_bytes = 0;
65+}
66+
67+/*
68+ * NAME: bit->length()
69+ * DESCRIPTION: return number of bits between start and end points
70+ */
71+static unsigned int mad_bit_length(struct mad_bitptr const *begin,
72+ struct mad_bitptr const *end)
73+{
74+ return (end->read_bytes - begin->read_bytes) +
75+ 8 * (end->byte - begin->byte);
76+}
77+
78+/*
79+ * NAME: bit->nextbyte()
80+ * DESCRIPTION: return pointer to next unprocessed byte
81+ */
82+static unsigned char const *mad_bit_nextbyte(struct mad_bitptr const *bitptr)
83+{
84+ return bitptr->byte + ((bitptr->read_bytes + 0x7) >> 3);
85+}
86+
87+/*
88+ * NAME: bit->skip()
89+ * DESCRIPTION: advance bit pointer
90+ */
91+static void mad_bit_skip(struct mad_bitptr *bitptr, unsigned int len)
92+{
93+ bitptr->read_bytes += len;
94+ bitptr->byte += (bitptr->read_bytes >> 3);
95+ bitptr->read_bytes &= 0x7;
96+}
97+
98+/*
99+ * NAME: bit->read()
100+ * DESCRIPTION: read an arbitrary number of bits and return their UIMSBF value
101+ */
102+static unsigned long mad_bit_read(struct mad_bitptr *bitptr, unsigned int len)
103+{
104+ register unsigned long value;
105+
106+ if (!len)
107+ return 0;
108+
109+ value = *(unsigned int *)bitptr->byte;
110+
111+ value <<= bitptr->read_bytes;
112+ value >>= (32 - len);
113+
114+ bitptr->read_bytes += len;
115+ bitptr->byte += (bitptr->read_bytes >> 3);
116+ bitptr->read_bytes &= 0x7;
117+
118+ return value;
119+}
120+
121+# define mad_bit_finish(bitptr) /* nothing */
122+
123+static unsigned long mad_bit_bitsleft(struct mad_bitptr *bitptr)
124+{
125+ return (8 - (bitptr)->read_bytes);
126+}
127+
128+#else /* #ifdef FPM_AVR32 */
129+
130 struct mad_bitptr {
131 unsigned char const *byte;
132 unsigned short cache;
133@@ -42,6 +128,8 @@ void mad_bit_skip(struct mad_bitptr *, unsigned int);
134 unsigned long mad_bit_read(struct mad_bitptr *, unsigned int);
135 void mad_bit_write(struct mad_bitptr *, unsigned int, unsigned long);
136
137+#endif
138+
139 unsigned short mad_bit_crc(struct mad_bitptr, unsigned int, unsigned short);
140
141 # endif
142diff --git a/configure.ac b/configure.ac
143index 9b79399..063cb9b 100644
144--- a/configure.ac
145+++ b/configure.ac
146@@ -274,13 +274,14 @@ fi
147 AC_MSG_CHECKING(for architecture-specific fixed-point math routines)
148 AC_ARG_ENABLE(fpm, AC_HELP_STRING([--enable-fpm=ARCH],
149 [use ARCH-specific fixed-point math routines
150- (one of: intel, arm, mips, sparc, ppc, 64bit, default)]),
151+ (one of: intel, arm, avr32, mips, sparc, ppc, 64bit, default)]),
152 [
153 case "$enableval" in
154 yes) ;;
155 no|default|approx) FPM="DEFAULT" ;;
156 intel|i?86) FPM="INTEL" ;;
157 arm) FPM="ARM" ;;
158+ avr32) FPM="AVR32" ;;
159 mips) FPM="MIPS" ;;
160 sparc) FPM="SPARC" ;;
161 ppc|powerpc) FPM="PPC" ;;
162@@ -298,6 +299,7 @@ then
163 case "$host" in
164 i?86-*) FPM="INTEL" ;;
165 arm*-*) FPM="ARM" ;;
166+ avr32*-*) FPM="AVR32" ;;
167 mips*-*) FPM="MIPS" ;;
168 sparc*-*) FPM="SPARC" ;;
169 powerpc*-*) FPM="PPC" ;;
170@@ -343,6 +345,11 @@ then
171 ASO="$ASO -DASO_IMDCT"
172 ASO_OBJS="imdct_l_arm.lo"
173 ;;
174+ avr32*-*)
175+ ASO="$ASO -DASO_INTERLEAVE2"
176+ ASO="$ASO -DASO_ZEROCHECK"
177+ ASO_OBJS="dct32_avr32.lo synth_avr32.lo imdct_avr32.lo"
178+ ;;
179 mips*-*)
180 ASO="$ASO -DASO_INTERLEAVE2"
181 ASO="$ASO -DASO_ZEROCHECK"
182diff --git a/configure b/configure
183index ee421cc..7a9f0c8 100755
184--- a/configure
185+++ b/configure
186@@ -1048,7 +1048,7 @@ Optional Features:
187 --enable-speed optimize for speed over accuracy
188 --enable-accuracy optimize for accuracy over speed
189 --enable-fpm=ARCH use ARCH-specific fixed-point math routines (one of:
190- intel, arm, mips, sparc, ppc, 64bit, default)
191+ intel, arm, avr32, mips, sparc, ppc, 64bit, default)
192 --enable-sso use subband synthesis optimization
193 --disable-aso disable architecture-specific optimizations
194 --enable-strict-iso use strict ISO/IEC interpretations
195@@ -21477,6 +21477,7 @@ if test "${enable_fpm+set}" = set; then
196 no|default|approx) FPM="DEFAULT" ;;
197 intel|i?86) FPM="INTEL" ;;
198 arm) FPM="ARM" ;;
199+ avr32) FPM="AVR32" ;;
200 mips) FPM="MIPS" ;;
201 sparc) FPM="SPARC" ;;
202 ppc|powerpc) FPM="PPC" ;;
203@@ -21498,6 +21499,7 @@ then
204 case "$host" in
205 i?86-*) FPM="INTEL" ;;
206 arm*-*) FPM="ARM" ;;
207+ avr32*-*) FPM="AVR32" ;;
208 mips*-*) FPM="MIPS" ;;
209 sparc*-*) FPM="SPARC" ;;
210 powerpc*-*) FPM="PPC" ;;
211@@ -21554,6 +21556,11 @@ then
212 ASO="$ASO -DASO_IMDCT"
213 ASO_OBJS="imdct_l_arm.lo"
214 ;;
215+ avr32*-*)
216+ ASO="$ASO -DASO_INTERLEAVE2"
217+ ASO="$ASO -DASO_ZEROCHECK"
218+ ASO_OBJS="dct32_avr32.lo synth_avr32.lo imdct_avr32.lo"
219+ ;;
220 mips*-*)
221 ASO="$ASO -DASO_INTERLEAVE2"
222 ASO="$ASO -DASO_ZEROCHECK"
223diff --git a/dct32_avr32.S b/dct32_avr32.S
224new file mode 100644
225index 0000000..7513340
226--- /dev/null
227+++ b/dct32_avr32.S
228@@ -0,0 +1,780 @@
229+/*
230+ Optimized 32-point Discrete Cosine Transform (DCT)
231+ Copyright 2003-2006 Atmel Corporation.
232+
233+ Written by Ronny Pedersen, Atmel Norway
234+
235+ This program is free software; you can redistribute it and/or modify
236+ it under the terms of the GNU General Public License as published by
237+ the Free Software Foundation; either version 2 of the License, or
238+ (at your option) any later version.
239+
240+ This program is distributed in the hope that it will be useful,
241+ but WITHOUT ANY WARRANTY; without even the implied warranty of
242+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
243+ GNU General Public License for more details.
244+
245+ You should have received a copy of the GNU General Public License
246+ along with this program; if not, write to the Free Software
247+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
248+
249+#define SHIFT 12
250+#define MAD_F_SCALEBITS 28
251+#define SLOTS 8
252+
253+#define MAD_F(x) ((x + (1 << 15)) >> 16)
254+
255+# define costab1 MAD_F(0x7fd8878e)
256+# define costab2 MAD_F(0x7f62368f)
257+# define costab3 MAD_F(0x7e9d55fc)
258+# define costab4 MAD_F(0x7d8a5f40)
259+# define costab5 MAD_F(0x7c29fbee)
260+# define costab6 MAD_F(0x7a7d055b)
261+# define costab7 MAD_F(0x78848414)
262+# define costab8 MAD_F(0x7641af3d)
263+# define costab9 MAD_F(0x73b5ebd1)
264+# define costab10 MAD_F(0x70e2cbc6)
265+# define costab11 MAD_F(0x6dca0d14)
266+# define costab12 MAD_F(0x6a6d98a4)
267+# define costab13 MAD_F(0x66cf8120)
268+# define costab14 MAD_F(0x62f201ac)
269+# define costab15 MAD_F(0x5ed77c8a)
270+# define costab16 MAD_F(0x5a82799a)
271+# define costab17 MAD_F(0x55f5a4d2)
272+# define costab18 MAD_F(0x5133cc94)
273+# define costab19 MAD_F(0x4c3fdff4)
274+# define costab20 MAD_F(0x471cece7)
275+# define costab21 MAD_F(0x41ce1e65)
276+# define costab22 MAD_F(0x3c56ba70)
277+# define costab23 MAD_F(0x36ba2014)
278+# define costab24 MAD_F(0x30fbc54d)
279+# define costab25 MAD_F(0x2b1f34eb)
280+# define costab26 MAD_F(0x25280c5e)
281+# define costab27 MAD_F(0x1f19f97b)
282+# define costab28 MAD_F(0x18f8b83c)
283+# define costab29 MAD_F(0x12c8106f)
284+# define costab30 MAD_F(0x0c8bd35e)
285+# define costab31 MAD_F(0x0647d97c)
286+
287+
288+ .macro butterfly2_in out1, out2, out3, out4, in, idx_in1, idx_in2, idx_in3, idx_in4, coeff1, coeff2, tmplo, tmphi
289+ mov \tmplo, \coeff1
290+ ld.w \out1, \in[\idx_in1 * 4]
291+ ld.w \out2, \in[\idx_in2 * 4]
292+ ld.w \out3, \in[\idx_in3 * 4]
293+ ld.w \out4, \in[\idx_in4 * 4]
294+ sub \tmphi, \out1, \out2
295+ add \out1, \out2
296+ mulsatrndwh.w \out2, \tmphi, \tmplo:b
297+
298+ sub \tmphi, \out3, \out4
299+ mov \tmplo, \coeff2
300+ add \out3, \out4
301+ mulsatrndwh.w \out4, \tmphi, \tmplo:b
302+ .endm
303+
304+ .macro butterfly2 in1, in2, in3, in4, coeff1, tmplo, tmphi, tmp
305+ mov \tmp, \coeff1
306+ sub \tmphi, \in1, \in2
307+ add \in1, \in2
308+ mulsatrndwh.w \in2, \tmphi, \tmp:b
309+
310+ sub \tmphi, \in3, \in4
311+ add \in3, \in4
312+ mulsatrndwh.w \in4, \tmphi, \tmp:b
313+ .endm
314+
315+ .macro butterfly4 in1, in2, in3, in4, in5, in6, in7, in8, coeff1, tmplo, tmphi, tmp
316+ mov \tmp, \coeff1
317+ sub \tmphi, \in1, \in2
318+ add \in1, \in2
319+ mulsatrndwh.w \in2, \tmphi, \tmp:b
320+
321+ sub \tmphi, \in3, \in4
322+ add \in3, \in4
323+ mulsatrndwh.w \in4, \tmphi, \tmp:b
324+
325+ sub \tmphi, \in5, \in6
326+ add \in5, \in6
327+ mulsatrndwh.w \in6, \tmphi, \tmp:b
328+
329+ sub \tmphi, \in7, \in8
330+ add \in7, \in8
331+ mulsatrndwh.w \in8, \tmphi, \tmp:b
332+ .endm
333+
334+ .macro scale reg
335+ .endm
336+
337+/*void dct32( mad_fixed_t const in[32], unsigned int slot,
338+ mad_fixed_t lo[16][8], mad_fixed_t hi[16][8]) */
339+
340+ .global dct32_avr32
341+dct32_avr32:
342+ stm --sp, r0-r7, r9-r11, lr
343+
344+ sub sp, 32*4
345+
346+/* t0 = in[0] + in[31]; t16 = MUL(in[0] - in[31], costab1);
347+ t1 = in[15] + in[16]; t17 = MUL(in[15] - in[16], costab31); */
348+ butterfly2_in r4/*t0*/, r5/*t16*/, r6/*t1*/, r7/*t17*/, r12, 0, 31, 15, 16, costab1, costab31, r10, r11
349+
350+/* t41 = t16 + t17;
351+ t59 = MUL(t16 - t17, costab2);
352+ t33 = t0 + t1;
353+ t50 = MUL(t0 - t1, costab2);*/
354+ butterfly2 r5/*t41*/, r7/*t59*/, r4/*t33*/, r6/*t50*/, costab2, r10, r11, lr
355+
356+/* t2 = in[7] + in[24]; t18 = MUL(in[7] - in[24], costab15);
357+ t3 = in[8] + in[23]; t19 = MUL(in[8] - in[23], costab17); */
358+ butterfly2_in r0/*t2*/, r1/*t18*/, r2/*t3*/, r3/*t19*/, r12, 7, 24, 8, 23, costab15, costab17, r10, r11
359+
360+/* t42 = t18 + t19;
361+ t60 = MUL(t18 - t19, costab30);
362+ t34 = t2 + t3;
363+ t51 = MUL(t2 - t3, costab30); */
364+ butterfly2 r1/*t42*/, r3/*t60*/, r0/*t34*/, r2/*t51*/, costab30, r10, r11, lr
365+
366+/* t73 = t41 + t42; t94 = MUL(t41 - t42, costab4);
367+ t83 = t59 + t60; t106 = MUL(t59 - t60, costab4); */
368+
369+
370+/* t69 = t33 + t34; t89 = MUL(t33 - t34, costab4);
371+ t78 = t50 + t51; t100 = MUL(t50 - t51, costab4); */
372+ butterfly4 r5/*t73*/, r1/*t94*/, r7/*t83*/, r3/*t106*/,r4/*t69*/, r0/*t89*/, r6/*t78*/, r2/*t100*/, costab4, r10, r11, lr
373+
374+/* Store away the computed butterflies:
375+ sp[0-7] = t83, t78, t73, t69, t106, t100, t94, t89 */
376+ stm sp, r0-r7
377+
378+
379+/* t4 = in[3] + in[28]; t20 = MUL(in[3] - in[28], costab7);
380+ t5 = in[12] + in[19]; t21 = MUL(in[12] - in[19], costab25); */
381+ butterfly2_in r4/*t4*/, r5/*t20*/, r6/*t5*/, r7/*t21*/, r12, 3, 28, 12, 19, costab7, costab25, r10, r11
382+
383+/* t43 = t20 + t21;
384+ t61 = MUL(t20 - t21, costab14);
385+ t35 = t4 + t5;
386+ t52 = MUL(t4 - t5, costab14); */
387+ butterfly2 r5/*t43*/, r7/*t61*/, r4/*t35*/, r6/*t52*/, costab14, r10, r11, lr
388+
389+/* t6 = in[4] + in[27]; t22 = MUL(in[4] - in[27], costab9);
390+ t7 = in[11] + in[20]; t23 = MUL(in[11] - in[20], costab23); */
391+ butterfly2_in r0/*t6*/, r1/*t22*/, r2/*t7*/, r3/*t23*/, r12, 4, 27, 11, 20, costab9, costab23, r10, r11
392+
393+/* t44 = t22 + t23;
394+ t62 = MUL(t22 - t23, costab18);
395+ t36 = t6 + t7;
396+ t53 = MUL(t6 - t7, costab18); */
397+ butterfly2 r1/*t44*/, r3/*t62*/, r0/*t36*/, r2/*t53*/, costab18, r10, r11, lr
398+
399+/* t74 = t43 + t44; t95 = MUL(t43 - t44, costab28);
400+ t84 = t61 + t62; t107 = MUL(t61 - t62, costab28); */
401+
402+/* t70 = t35 + t36; t90 = MUL(t35 - t36, costab28);
403+ t79 = t52 + t53; t101 = MUL(t52 - t53, costab28); */
404+ butterfly4 r5/*t74*/, r1/*t95*/, r7/*t84*/, r3/*t107*/, r4/*t70*/, r0/*t90*/, r6/*t79*/, r2/*t101*/, costab28, r10, r11, lr
405+
406+/* Store away the computed butterflies:
407+ sp[8-15] = t84, t79, t74, t70, t107, t101, t95, t90 */
408+ sub r10, sp, -8*4
409+ stm r10, r0-r7
410+
411+
412+/* t8 = in[1] + in[30]; t24 = MUL(in[1] - in[30], costab3);
413+ t9 = in[14] + in[17]; t25 = MUL(in[14] - in[17], costab29); */
414+ butterfly2_in r4/*t8*/, r5/*t24*/, r6/*t9*/, r7/*t25*/, r12, 1, 30, 14, 17, costab3, costab29, r10, r11
415+
416+
417+/* t45 = t24 + t25;
418+ t63 = MUL(t24 - t25, costab6);
419+ t37 = t8 + t9;
420+ t54 = MUL(t8 - t9, costab6); */
421+ butterfly2 r5/*t45*/, r7/*t63*/, r4/*t37*/, r6/*t54*/, costab6, r10, r11, lr
422+
423+/* t10 = in[6] + in[25]; t26 = MUL(in[6] - in[25], costab13);
424+ t11 = in[9] + in[22]; t27 = MUL(in[9] - in[22], costab19); */
425+ butterfly2_in r0/*t10*/, r1/*t26*/, r2/*t11*/, r3/*t27*/, r12, 6, 25, 9, 22, costab13, costab19, r10, r11
426+
427+/* t46 = t26 + t27;
428+ t64 = MUL(t26 - t27, costab26);
429+ t38 = t10 + t11;
430+ t55 = MUL(t10 - t11, costab26); */
431+ butterfly2 r1/*t46*/, r3/*t64*/, r0/*t38*/, r2/*t55*/, costab26, r10, r11, lr
432+
433+/* t75 = t45 + t46; t96 = MUL(t45 - t46, costab12);
434+ t85 = t63 + t64; t108 = MUL(t63 - t64, costab12); */
435+
436+/* t71 = t37 + t38; t91 = MUL(t37 - t38, costab12);
437+ t80 = t54 + t55; t102 = MUL(t54 - t55, costab12); */
438+ butterfly4 r5/*t75*/, r1/*t96*/, r7/*t85*/, r3/*t108*/, r4/*t71*/, r0/*t91*/, r6/*t80*/, r2/*t102*/, costab12, r10, r11, lr
439+
440+/* Store away the computed butterflies:
441+ sp[16-23] = t85, t80, t75, t71, t108, t102, t96, t91 */
442+ sub r10, sp, -16*4
443+ stm r10, r0-r7
444+
445+/* t12 = in[2] + in[29]; t28 = MUL(in[2] - in[29], costab5);
446+ t13 = in[13] + in[18]; t29 = MUL(in[13] - in[18], costab27); */
447+ butterfly2_in r4/*t12*/, r5/*t28*/, r6/*t13*/, r7/*t29*/, r12, 2, 29, 13, 18, costab5, costab27, r10, r11
448+
449+/* t47 = t28 + t29;
450+ t65 = MUL(t28 - t29, costab10);
451+ t39 = t12 + t13;
452+ t56 = MUL(t12 - t13, costab10); */
453+ butterfly2 r5/*t47*/, r7/*t65*/, r4/*t39*/, r6/*t56*/, costab10, r10, r11, lr
454+
455+/* t14 = in[5] + in[26]; t30 = MUL(in[5] - in[26], costab11);
456+ t15 = in[10] + in[21]; t31 = MUL(in[10] - in[21], costab21);*/
457+ butterfly2_in r0/*t14*/, r1/*t30*/, r2/*t15*/, r3/*t31*/, r12, 5, 26, 10, 21, costab11, costab21, r10, r11
458+
459+/* t48 = t30 + t31;
460+ t66 = MUL(t30 - t31, costab22);
461+ t40 = t14 + t15;
462+ t57 = MUL(t14 - t15, costab22);*/
463+ butterfly2 r1/*t48*/, r3/*t66*/, r0/*t40*/, r2/*t57*/, costab22, r10, r11, lr
464+
465+/* t76 = t47 + t48; t97 = MUL(t47 - t48, costab20);
466+ t86 = t65 + t66; t109 = MUL(t65 - t66, costab20);*/
467+
468+/* t72 = t39 + t40; t92 = MUL(t39 - t40, costab20);
469+ t81 = t56 + t57; t103 = MUL(t56 - t57, costab20);*/
470+ butterfly4 r5/*t76*/, r1/*t97*/, r7/*t86*/, r3/*t109*/,r4/*t72*/, r0/*t92*/, r6/*t81*/, r2/*t103*/, costab20, r10, r11, lr
471+
472+/* Store away the computed butterflies:
473+ sp[24-31] = t86, t81, t76, t72, t109, t103, t97, t92 */
474+ sub r10, sp, -24*4
475+ stm r10, r0-r7
476+
477+/* We now have the following on the stack:
478+
479+ sp[0-7] = t83, t78, t73, t69, t106, t100, t94, t89
480+ sp[8-15] = t84, t79, t74, t70, t107, t101, t95, t90
481+ sp[16-23] = t85, t80, t75, t71, t108, t102, t96, t91
482+ sp[24-31] = t86, t81, t76, t72, t109, t103, t97, t92 */
483+
484+/* Load {r0...r7} = { t72, t76, t71, t75, t70, t74, t69, t73 } */
485+ ld.d r6, sp[2*4]
486+ ld.d r4, sp[10*4]
487+ ld.d r2, sp[18*4]
488+ ld.d r0, sp[26*4]
489+
490+
491+/* t113 = t69 + t70;
492+ t141 = MUL(t69 - t70, costab8);
493+
494+ t115 = t73 + t74;
495+ t144 = MUL(t73 - t74, costab8); */
496+ butterfly2 r6/*t113*/, r4/*t141*/, r7/*t115*/, r5/*t144*/, costab8, r10, r11, lr
497+
498+/* t114 = t71 + t72;
499+ t142 = MUL(t71 - t72, costab24);
500+
501+ t116 = t75 + t76;
502+ t145 = MUL(t75 - t76, costab24); */
503+ butterfly2 r2/*t114*/, r0/*t142*/, r3/*t116*/, r1/*t145*/, costab24, r10, r11, lr
504+
505+
506+/*
507+ t191 = t113 + t114;
508+ t192 = MUL(t113 - t114, costab16)
509+
510+ t32 = t115 + t116;
511+ t177 = MUL(t115 - t116, costab16) ;
512+
513+ t143 = t141 + t142;
514+ t190 = MUL(t141 - t142, costab16) ;
515+
516+ t146 = t144 + t145;
517+ t184 = MUL(t144 - t145, costab16) ; */
518+ butterfly4 r6/*t191*/, r2/*t192*/, r7/*t32*/, r3/*t177*/, r4/*t143*/, r0/*190*/, r5/*t146*/, r1/*t184*/, costab16, r10, r11, lr
519+
520+/* Store away the computed butterflies:
521+ sp[2-3] = t32, t191
522+ sp[10-11] = t146, t143
523+ sp[18-19] = t177, t192
524+ sp[26-27] = t184, t190 */
525+ st.d sp[2*4] , r6
526+ st.d sp[10*4], r4
527+ st.d sp[18*4], r2
528+ st.d sp[26*4], r0
529+
530+/* Load {r0...r7} = { t81, t86, t80, t85, t79, t84, t78, t83 } */
531+ ld.d r6, sp[0*4]
532+ ld.d r4, sp[8*4]
533+ ld.d r2, sp[16*4]
534+ ld.d r0, sp[24*4]
535+
536+
537+/* t118 = t78 + t79;
538+ t148 = MUL(t78 - t79, costab8);
539+
540+ t121 = t83 + t84;
541+ t152 = MUL(t83 - t84, costab8); */
542+ butterfly2 r6/*t118*/, r4/*t148*/, r7/*t121*/, r5/*t152*/, costab8, r10, r11, lr
543+
544+/* t119 = t80 + t81;
545+ t149 = MUL(t80 - t81, costab24);
546+
547+ t122 = t85 + t86;
548+ t153 = MUL(t85 - t86, costab24); */
549+ butterfly2 r2/*t119*/, r0/*t149*/, r3/*t122*/, r1/*t153*/, costab24, r10, r11, lr
550+
551+
552+
553+/* t58 = t118 + t119;
554+ t178 = MUL(t118 - t119, costab16) ;
555+
556+ t67 = t121 + t122;
557+ t179 = MUL(t121 - t122, costab16) ;
558+
559+ t150 = t148 + t149;
560+ t185 = MUL(t148 - t149, costab16) ;
561+
562+ t154 = t152 + t153;
563+ t186 = MUL(t152 - t153, costab16) ; */
564+ butterfly4 r6/*t58*/, r2/*t178*/, r7/*t67*/, r3/*t179*/, r4/*t150*/, r0/*185*/, r5/*t154*/, r1/*t186*/, costab16, r10, r11, lr
565+
566+/* Store away the computed butterflies:
567+ sp[0-1] = t67, t58
568+ sp[8-9] = t154, t150
569+ sp[16-17] = t179, t178
570+ sp[24-25] = t186, t185 */
571+ st.d sp[0*4] , r6
572+ st.d sp[8*4], r4
573+ st.d sp[16*4], r2
574+ st.d sp[24*4], r0
575+
576+/* Load {r0...r7} = { t92, t97, t91, t96, t90, t95, t89, t94 } */
577+ ld.d r6, sp[6*4]
578+ ld.d r4, sp[14*4]
579+ ld.d r2, sp[22*4]
580+ ld.d r0, sp[30*4]
581+
582+
583+/* t125 = t89 + t90;
584+ t157 = MUL(t89 - t90, costab8);
585+
586+ t128 = t94 + t95;
587+ t161 = MUL(t94 - t95, costab8); */
588+ butterfly2 r6/*t125*/, r4/*t157*/, r7/*t128*/, r5/*t161*/, costab8, r10, r11, lr
589+
590+/* t126 = t91 + t92;
591+ t158 = MUL(t91 - t92, costab24);
592+
593+ t129 = t96 + t97;
594+ t162 = MUL(t96 - t97, costab24); */
595+ butterfly2 r2/*t126*/, r0/*t158*/, r3/*t129*/, r1/*t162*/, costab24, r10, r11, lr
596+
597+
598+/*
599+ t93 = t125 + t126;
600+ t180 = MUL(t125 - t126, costab16) ;
601+
602+ t98 = t128 + t129;
603+ t181 = MUL(t128 - t129, costab16) ;
604+
605+ t159 = t157 + t158;
606+ t187 = MUL(t157 - t158, costab16) ;
607+
608+ t163 = t161 + t162;
609+ t188 = MUL(t161 - t162, costab16) ; */
610+ butterfly4 r6/*t93*/, r2/*t180*/, r7/*t98*/, r3/*t181*/, r4/*t159*/, r0/*187*/, r5/*t163*/, r1/*t188*/, costab16, r10, r11, lr
611+
612+
613+/* Store away the computed butterflies:
614+ sp[6-7] = t98, t93
615+ sp[14-15] = t163, t159
616+ sp[22-23] = t181, t180
617+ sp[30-31] = t188, t187 */
618+ st.d sp[6*4] , r6
619+ st.d sp[14*4], r4
620+ st.d sp[22*4], r2
621+ st.d sp[30*4], r0
622+
623+/* Load {r0...r7} = { t103, t109, t102, t108, t101, t107, t100, t106 } */
624+ ld.d r6, sp[4*4]
625+ ld.d r4, sp[12*4]
626+ ld.d r2, sp[20*4]
627+ ld.d r0, sp[28*4]
628+
629+
630+
631+/* t132 = t100 + t101;
632+ t166 = MUL(t100 - t101, costab8);
633+
634+ t136 = t106 + t107;
635+ t171 = MUL(t106 - t107, costab8); */
636+ butterfly2 r6/*t132*/, r4/*t166*/, r7/*t136*/, r5/*t171*/, costab8, r10, r11, lr
637+
638+/* t133 = t102 + t103;
639+ t167 = MUL(t102 - t103, costab24);
640+
641+ t137 = t108 + t109;
642+ t172 = MUL(t108 - t109, costab24);*/
643+ butterfly2 r2/*t133*/, r0/*t167*/, r3/*t137*/, r1/*t172*/, costab24, r10, r11, lr
644+
645+
646+/* t104 = t132 + t133;
647+ t182 = MUL(t132 - t133, costab16) ;
648+
649+ t110 = t136 + t137;
650+ t183 = MUL(t136 - t137, costab16) ;
651+
652+ t168 = t166 + t167;
653+ t189 = MUL(t166 - t167, costab16) ;
654+
655+ t173 = t171 + t172;
656+ t208 = MUL(t171 - t172, costab16) ; */
657+ butterfly4 r6/*t104*/, r2/*t182*/, r7/*t110*/, r3/*t183*/, r4/*t168*/, r0/*189*/, r5/*t173*/, r1/*t208*/, costab16, r10, r11, lr
658+
659+/* Store away the computed butterflies:
660+ sp[4-5] = t110, t104
661+ sp[12-13] = t173, t168
662+ sp[20-21] = t183, t182
663+ sp[28-29] = t208, t189 */
664+ st.d sp[4*4] , r6
665+ st.d sp[12*4], r4
666+ st.d sp[20*4], r2
667+ st.d sp[28*4], r0
668+
669+/* Now we have the following stack
670+
671+ sp[0-7] = t67, t58 , t32, t191, t110, t104, t98, t93
672+ sp[8-15] = t154, t150, t146, t143, t173, t168, t163, t159
673+ sp[16-23] = t179, t178, t177, t192, t183, t182, t181, t180
674+ sp[24-31] = t186, t185, t184, t190, t208, t189, t188, t187
675+*/
676+
677+ /* Get slot, lo and hi from stack */
678+ lddsp lr, sp[32*4 + 4] /*slot*/
679+ lddsp r12, sp[32*4 + 8] /*lo*/
680+ lddsp r11, sp[32*4 + 12] /*hi*/
681+
682+ add r12, r12, lr << 2
683+ add r11, r11, lr << 2
684+
685+
686+/* t49 = -(t67 * 2) + t32;
687+ hi[14][slot] = SHIFT(t32);
688+ t87 = -(t110 * 2) + t67;
689+ t138 = -(t173 * 2) + t110;
690+ t203 = -(t208 * 2) + t173; */
691+
692+ lddsp r0/*t67*/, sp[0]
693+ lddsp r1/*t32*/, sp[2*4]
694+ lddsp r2/*t110*/, sp[4*4]
695+ lddsp r3/*t173*/, sp[12*4]
696+ lddsp r5/*t208*/, sp[28*4]
697+
698+ sub r4/*t49*/, r1, r0 << 1
699+ scale r1
700+ sub r0/*t87*/, r0, r2 << 1
701+ st.w r11[14*SLOTS*4], r1
702+ sub r2/*t138*/, r2, r3 << 1
703+ sub r1/*t203*/, r3, r5 << 1
704+
705+/* Live: r0 = t87, r1= t203, r2= t138, r4 = t49
706+ Free: r3, r5, r6, r7, r8, r9, r10, lr */
707+
708+/* t68 = (t98 * 2) + t49;
709+ hi[12][slot] = SHIFT(-t49);
710+ t130 = -(t163 * 2) + t98;
711+ t201 = -(t188 * 2) + t163;
712+ t200 = -(t186 * 2) + t154;
713+ t111 = (t154 * 2) + t87;
714+ t77 = -(-(t87 * 2) - t68);
715+ t88 = (t146 * 2) + t77;
716+ t199 = -(t184 * 2) + t146;
717+ hi[ 8][slot] = SHIFT(-t77);
718+ hi[10][slot] = SHIFT(t68);*/
719+ lddsp r3/*t98*/, sp[6*4]
720+ lddsp r5/*t163*/, sp[14*4]
721+ lddsp r6/*t188*/, sp[30*4]
722+ lddsp r10/*t186*/, sp[24*4]
723+
724+ add r7/*t68*/, r4, r3 << 1
725+ neg r4
726+ scale r4
727+ lddsp r9/*t154*/, sp[8*4]
728+ sub r3/*t130*/, r3, r5 << 1
729+ st.w r11[12*SLOTS*4], r4
730+ sub r8/*t201*/, r5, r6 << 1
731+ sub r4/*t200*/, r9, r10 << 1
732+ lddsp lr/*t146*/, sp[10*4]
733+ lddsp r6/*t184*/, sp[26*4]
734+ add r10/*t111*/, r0, r9 << 1
735+ add r5/*t77*/,r7, r0 << 1
736+ add r0/*t88*/, r5, lr << 1
737+ sub r6/*t199*/, lr, r6 << 1
738+ neg r5
739+ scale r5
740+ scale r7
741+ st.w r11[8*SLOTS*4], r5
742+ st.w r11[10*SLOTS*4], r7
743+
744+/* Live: r0 = t88, r1= t203, r2= t138, r3 = t130, r4 = t200,
745+ r6 = 199, r8 = t201, r10 = t111
746+ Free: r5, r7, r9, lr */
747+
748+
749+/*
750+ t123 = -(-(t138 * 2) - t111);
751+ t174 = (t183 * 2) + t138;
752+ t99 = -(t111 * 2) + t88;
753+ hi[ 6][slot] = SHIFT(t88); */
754+ lddsp r5/*t183*/, sp[20*4]
755+
756+ add r7/*t123*/, r10, r2 << 1
757+ sub r10/*t99*/, r0, r10 << 1
758+ scale r0
759+ add r2/*t174*/, r2, r5 << 1
760+ st.w r11[6*SLOTS*4], r0
761+
762+/* Live: r1 = t203, r2 = t174, r3 = t130, r4 = t200,
763+ r6 = t199, r7 = t123, r8 = t201, r10 = t99
764+ Free: r0, r5, r9, lr */
765+
766+/* t112 = -(t130 * 2) + t99;
767+ t164 = (t181 * 2) + t130;
768+ hi[ 4][slot] = SHIFT(-t99); */
769+ lddsp r0/*t181*/, sp[22*4]
770+
771+ sub r5/*t112*/, r10, r3 << 1
772+ neg r10
773+ scale r10
774+ add r3/*164*/, r3, r0 << 1
775+ st.w r11[4*SLOTS*4], r10
776+
777+/* Live: r1 = t203, r2 = t174, r3 = t164, r4 = t200,
778+ r5 = t112, r6 = t199, r7 = t123, r8 = t201
779+ Free: r0, r9, r10, lr */
780+
781+
782+/* t117 = -(-(t123 * 2) - t112);
783+ t139 = (t179 * 2) + t123;
784+ hi[ 2][slot] = SHIFT(t112); */
785+ lddsp r0/*t179*/, sp[16*4]
786+
787+ add r9/*t117*/, r5, r7 << 1
788+ scale r5
789+ add r7/*t139*/, r7, r0 << 1
790+ st.w r11[2*SLOTS*4], r5
791+
792+/* Live: r1 = t203, r2 = t174, r3 = t164, r4 = t200,
793+ r6 = t199, r7 = t139, r8 = t201, r9 = t117
794+ Free: r0, r5, r10, lr */
795+
796+/* t155 = -(t174 * 2) + t139;
797+ t204 = -(-(t203 * 2) - t174);
798+ t124 = (t177 * 2) + t117;
799+ hi[ 0][slot] = SHIFT(-t117);
800+ t131 = -(t139 * 2) + t124;
801+ lo[ 1][slot] = SHIFT(t124);*/
802+ lddsp r0/*t177*/, sp[18*4]
803+
804+ sub r5/*t155*/, r7, r2 << 1
805+ add r2/*t204*/, r2, r1 << 1
806+ add r0/*t124*/, r9, r0 << 1
807+ neg r9
808+ scale r9
809+ sub r7/*t131*/, r0, r7 << 1
810+ scale r0
811+ st.w r11[0*SLOTS*4], r9
812+ st.w r12[1*SLOTS*4], r0
813+
814+/* Live: r2 = t204, r3 = t164, r4 = t200,
815+ r5 = t155, r6 = t199, r7 = t131, r8 = t201
816+ Free: r0, r1, r9, r10, lr */
817+
818+/* t140 = (t164 * 2) + t131;
819+ lo[ 3][slot] = SHIFT(-t131);
820+ t202 = -(-(t201 * 2) - t164); */
821+ add r0/*t140*/, r7, r3 << 1
822+ neg r7
823+ scale r7
824+ add r3/*t202*/, r3, r8 << 1
825+ st.w r12[3*SLOTS*4], r7
826+
827+/* Live: r0 = t140, r2 = t204, r3 = t202, r4 = t200,
828+ r5 = t155, r6 = t199
829+ Free: r1, r7, r8, r9, r10, lr */
830+
831+
832+/* t147 = -(-(t155 * 2) - t140);
833+ lo[ 5][slot] = SHIFT(t140);
834+ t175 = -(t200 * 2) + t155;
835+ t156 = -(t199 * 2) + t147;
836+ lo[ 7][slot] = SHIFT(-t147); */
837+ add r1/*t147*/, r0, r5 << 1
838+ scale r0
839+ sub r5/*t175*/, r5, r4 << 1
840+ sub r4/*156*/, r1, r6 << 1
841+ neg r1
842+ scale r1
843+ st.w r12[5*SLOTS*4], r0
844+ st.w r12[7*SLOTS*4], r1
845+
846+/* Live: r2 = t204, r3 = t202,
847+ r4 = t156, r5 = t175
848+ Free: r0, r1, r6, r7, r8, r9, r10, lr */
849+
850+
851+/* t205 = -(-(t204 * 2) - t175);
852+ t165 = -(t175 * 2) + t156;
853+ lo[ 9][slot] = SHIFT(t156);
854+ t176 = -(t202 * 2) + t165;
855+ lo[11][slot] = SHIFT(-t165);
856+ t206 = -(-(t205 * 2) - t176);
857+ lo[15][slot] = SHIFT(-t206)
858+ lo[13][slot] = SHIFT(t176) */
859+ add r0/*t205*/, r5, r2 << 1
860+ sub r1/*t165*/, r4, r5 << 1
861+ scale r4
862+ sub r3/*t176*/, r1, r3 << 1
863+ st.w r12[9*SLOTS*4], r4
864+ neg r1
865+ scale r1
866+ add r6/*t206*/, r3, r0 << 1
867+ neg r6
868+ scale r6
869+ scale r3
870+ st.w r12[11*SLOTS*4], r1
871+ st.w r12[15*SLOTS*4], r6
872+ st.w r12[13*SLOTS*4], r3
873+
874+/* t193 = -((t190 * 2) - t143)
875+ hi[ 7][slot] = SHIFT(t143);
876+ lo[ 8][slot] = SHIFT(-t193);
877+ t82 = -(t104 * 2) + t58;
878+ hi[13][slot] = SHIFT(t58);
879+ t134 = -(t168 * 2) + t104;
880+ t196 = -(t189 * 2) + t168; */
881+
882+ lddsp r0/*t190*/, sp[27*4]
883+ lddsp r1/*t143*/, sp[11*4]
884+ lddsp r2/*t104*/, sp[5*4]
885+ lddsp r3/*t58*/, sp[1*4]
886+ lddsp r4/*t168*/, sp[13*4]
887+ lddsp r5/*t189*/, sp[29*4]
888+ sub r0/*t193*/, r1, r0 << 1
889+ neg r0
890+ scale r1
891+ scale r0
892+ st.w r11[7*SLOTS*4], r1
893+ st.w r12[8*SLOTS*4], r0
894+ sub r0/*t82*/, r3, r2 << 1
895+ scale r3
896+ sub r2/*t134*/, r2, r4 << 1
897+ sub r4/*t196*/, r4, r5 << 1
898+ st.w r11[13*SLOTS*4], r3
899+
900+/* Live: r0 = t82, r2 = t134,
901+ r4 = t196
902+ Free: r1, r3, r5, r6, r7, r8, r9, r10, lr */
903+
904+
905+
906+/*
907+
908+ t207 = -(t185 * 2) + t150;
909+ t105 = (t150 * 2) + t82;
910+ hi[ 9][slot] = SHIFT(-t82);
911+ t120 = -(-(t134 * 2) - t105);
912+ hi[ 5][slot] = SHIFT(t105);
913+ t169 = (t182 * 2) + t134;
914+
915+ t135 = (t178 * 2) + t120;
916+ hi[ 1][slot] = SHIFT(-t120);
917+ t197 = -(-(t196 * 2) - t169);
918+ t151 = -(t169 * 2) + t135;
919+ lo[ 2][slot] = SHIFT(t135); */
920+ lddsp r1/*t185*/, sp[25*4]
921+ lddsp r3/*t150*/, sp[9*4]
922+ lddsp r5/*t182*/, sp[21*4]
923+ lddsp r8/*t178*/, sp[17*4]
924+
925+ sub r6/*t207*/, r3, r1 << 1
926+ add r3/*t105*/, r0, r3 << 1
927+ neg r0
928+ scale r0
929+ add r7/*t120*/, r3, r2 << 1
930+ scale r3
931+ st.w r11[9*SLOTS*4], r0
932+ st.w r11[5*SLOTS*4], r3
933+ add r2/*t169*/, r2, r5 << 1
934+ add r8/*t135*/, r7, r8 << 1
935+ neg r7
936+ scale r7
937+ add r4/*t197*/, r2, r4 << 1
938+ sub r2/*t151*/, r8, r2 << 1
939+ scale r8
940+ st.w r11[1*SLOTS*4], r7
941+ st.w r12[2*SLOTS*4], r8
942+
943+/* Live: r2 = t151, r4 = t197, r6 = t207
944+
945+ Free: r0, r1, r3, r5, r7, r8, r9, r10, lr */
946+
947+
948+
949+/* t170 = -(t207 * 2) + t151;
950+ lo[ 6][slot] = SHIFT(-t151);
951+
952+ t198 = -(-(t197 * 2) - t170);
953+ lo[10][slot] = SHIFT(t170);
954+ lo[14][slot] = SHIFT(-t198);
955+
956+ t127 = -(t159 * 2) + t93;
957+ hi[11][slot] = SHIFT(t93);
958+ t194 = -(t187 * 2) + t159; */
959+ lddsp r0/*t159*/, sp[15*4]
960+ lddsp r1/*t93*/, sp[7*4]
961+ lddsp r3/*t187*/, sp[31*4]
962+ sub r5/*t170*/, r2, r6 << 1
963+ neg r2
964+ scale r2
965+ add r4/*t198*/,r5, r4 << 1
966+ neg r4
967+ scale r5
968+ scale r4
969+ st.w r12[6*SLOTS*4], r2
970+ st.w r12[10*SLOTS*4], r5
971+ st.w r12[14*SLOTS*4], r4
972+ sub r7/*t127*/, r1, r0 << 1
973+ scale r1
974+ sub r0/*t194*/, r0, r3 << 1
975+ st.w r11[11*SLOTS*4], r1
976+
977+
978+/* Live: r0 = t194, r7 = t127
979+ Free: r1, r2, r3, r4, r6, r5, r8, r9, r10, lr */
980+
981+/* t160 = (t180 * 2) + t127;
982+ hi[ 3][slot] = SHIFT(-t127);
983+ t195 = -(-(t194 * 2) - t160);
984+ lo[ 4][slot] = SHIFT(t160);
985+ lo[12][slot] = SHIFT(-t195);
986+
987+ hi[15][slot] = SHIFT(t191);
988+ lo[ 0][slot] = SHIFT(t192); */
989+ lddsp r1/*t180*/, sp[23*4]
990+ lddsp r2/*t191*/, sp[3*4]
991+ lddsp r3/*t192*/, sp[19*4]
992+ add r4/*t160*/, r7, r1 << 1
993+ neg r7
994+ scale r7
995+ add r6/*t195*/, r4, r0 << 1
996+ scale r4
997+ neg r6
998+ scale r6
999+ st.w r11[3*SLOTS*4], r7
1000+ st.w r12[4*SLOTS*4], r4
1001+ st.w r12[12*SLOTS*4], r6
1002+ scale r2
1003+ scale r3
1004+ st.w r11[15*SLOTS*4], r2
1005+ st.w r12[0*SLOTS*4], r3
1006+
1007+ sub sp, -32*4
1008+ ldm sp++,r0-r7, r9-r11, pc
1009diff --git a/fixed.h b/fixed.h
1010index 4b58abf..0a1350a 100644
1011--- a/fixed.h
1012+++ b/fixed.h
1013@@ -237,6 +237,46 @@ mad_fixed_t mad_f_mul_inline(mad_fixed_t x, mad_fixed_t y)
1014 # define MAD_F_SCALEBITS MAD_F_FRACBITS
1015 # endif
1016
1017+/* --- AVR32 ----------------------------------------------------------------- */
1018+
1019+# elif defined(FPM_AVR32)
1020+
1021+typedef signed short mad_coeff_t;
1022+
1023+struct DWstruct {int high, low;};
1024+
1025+typedef union {
1026+ struct DWstruct s;
1027+ long long ll;
1028+} DWunion;
1029+
1030+# define MAD_F_MLX(hi, lo, x, y) \
1031+ { register DWunion __res; \
1032+ __res.ll = (long long)x * (long long)y; \
1033+ /* asm ("muls.d\t%0, %1, %2" : "=r" (__res.ll) : "r" (x), "r" (y));*/ \
1034+ hi = __res.s.high; \
1035+ lo = __res.s.low; }
1036+
1037+# define MAD_F_MLA(hi, lo, x, y) \
1038+ { register DWunion __res; \
1039+ __res.s.high = hi; \
1040+ __res.s.low = lo; \
1041+ __res.ll += (long long)x * (long long)y; \
1042+/* asm ("macs.d\t%0, %1, %2" : "+r" (__res.ll) : "r" (x), "r" (y));*/ \
1043+ hi = __res.s.high; \
1044+ lo = __res.s.low; }
1045+
1046+
1047+# define MAD_F_MLN(hi, lo) \
1048+ asm ("neg %0\n" \
1049+ "acr %1\n" \
1050+ "neg %1" \
1051+ : "+r" (lo), "+r" (hi) \
1052+ :: "cc")
1053+
1054+
1055+# define MAD_F_SCALEBITS MAD_F_FRACBITS
1056+
1057 /* --- ARM ----------------------------------------------------------------- */
1058
1059 # elif defined(FPM_ARM)
1060@@ -433,6 +473,8 @@ mad_fixed_t mad_f_mul_inline(mad_fixed_t x, mad_fixed_t y)
1061 *
1062 * Pre-rounding is required to stay within the limits of compliance.
1063 */
1064+typedef signed int mad_coeff_t;
1065+
1066 # if defined(OPT_SPEED)
1067 # define mad_f_mul(x, y) (((x) >> 12) * ((y) >> 16))
1068 # else
1069diff --git a/imdct_avr32.S b/imdct_avr32.S
1070new file mode 100644
1071index 0000000..d0ee6b4
1072--- /dev/null
1073+++ b/imdct_avr32.S
1074@@ -0,0 +1,789 @@
1075+/*
1076+ Optimized 36-point Inverse Modified Discrete Cosine Transform (IMDCT)
1077+ Copyright 2003-2006 Atmel Corporation.
1078+
1079+ Written by Ronny Pedersen, Atmel Norway
1080+
1081+ This program is free software; you can redistribute it and/or modify
1082+ it under the terms of the GNU General Public License as published by
1083+ the Free Software Foundation; either version 2 of the License, or
1084+ (at your option) any later version.
1085+
1086+ This program is distributed in the hope that it will be useful,
1087+ but WITHOUT ANY WARRANTY; without even the implied warranty of
1088+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1089+ GNU General Public License for more details.
1090+
1091+ You should have received a copy of the GNU General Public License
1092+ along with this program; if not, write to the Free Software
1093+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
1094+
1095+#define MAD_F(x) ((x + (1 << 13)) >> 14)
1096+
1097+ .public imdct36_avr32
1098+
1099+/*
1100+ void imdct36(mad_fixed_t const x[18], mad_fixed_t y[36])
1101+ {
1102+ mad_fixed_t tmp[18];
1103+ int i;
1104+*/
1105+/* DCT-IV */
1106+imdct36_avr32:
1107+ pushm r0-r7,r11,lr
1108+ sub sp, 4*18
1109+/*
1110+ {
1111+ mad_fixed_t tmp2[18];
1112+ int i;
1113+
1114+ /* scale[i] = 2 * cos(PI * (2 * i + 1) / (4 * 18)) */
1115+/*
1116+ static mad_fixed_t const scale[18] = {
1117+ MAD_F(0x1ff833fa), MAD_F(0x1fb9ea93), MAD_F(0x1f3dd120),
1118+ MAD_F(0x1e84d969), MAD_F(0x1d906bcf), MAD_F(0x1c62648b),
1119+ MAD_F(0x1afd100f), MAD_F(0x1963268b), MAD_F(0x1797c6a4),
1120+ MAD_F(0x159e6f5b), MAD_F(0x137af940), MAD_F(0x11318ef3),
1121+ MAD_F(0x0ec6a507), MAD_F(0x0c3ef153), MAD_F(0x099f61c5),
1122+ MAD_F(0x06ed12c5), MAD_F(0x042d4544), MAD_F(0x0165547c)
1123+ };
1124+*/
1125+
1126+ /* scaling */
1127+
1128+/*
1129+ for (i = 0; i < 18; i += 3) {
1130+ tmp2[i + 0] = mad_f_mul(x[i + 0], scale[i + 0]);
1131+ tmp2[i + 1] = mad_f_mul(x[i + 1], scale[i + 1]);
1132+ tmp2[i + 2] = mad_f_mul(x[i + 2], scale[i + 2]);
1133+ }
1134+*/
1135+ /* even input butterfly */
1136+
1137+/*
1138+ for (i = 0; i < 9; i += 3) {
1139+ tmp3[i + 0] = tmp2[i + 0] + tmp2[18 - (i + 0) - 1];
1140+ tmp3[i + 1] = tmp2[i + 1] + tmp2[18 - (i + 1) - 1];
1141+ tmp3[i + 2] = tmp2[i + 2] + tmp2[18 - (i + 2) - 1];
1142+ }
1143+ for (i = 0; i < 9; i += 3) {
1144+ tmp4[i + 0] = tmp2[i + 0] - tmp2[18 - (i + 0) - 1];
1145+ tmp4[i + 1] = tmp2[i + 1] - tmp2[18 - (i + 1) - 1];
1146+ tmp4[i + 2] = tmp2[i + 2] - tmp2[18 - (i + 2) - 1];
1147+ }
1148+*/
1149+
1150+ ld.d r8, r12[0] /*r8 = x[1], r9 = x[0]*/
1151+ ld.d r0, pc[scale_dctIV - .] /*r0 = {scale[2], scale[3]}, r1 = { scale[0], scale[1] }*/
1152+ ld.d r2, r12[2*4] /*r2 = x[3], r3 = x[2]*/
1153+ ld.d r4, pc[scale_dctIV - . + 14*2] /*r4 = {scale[16], scale[17]}, r5 = { scale[14], scale[15] }*/
1154+ mulsatrndwh.w r9/*tmp2[0]*/, r9, r1:t /*tmp2[0] = mad_f_mul(x[0], scale[0]) */
1155+ ld.d r6, r12[16*4] /*r6 = x[17], r7 = x[16]*/
1156+ mulsatrndwh.w r8/*tmp2[1]*/, r8, r1:b /*tmp2[1] = mad_f_mul(x[1], scale[1]) */
1157+ mulsatrndwh.w r3/*tmp2[2]*/, r3, r0:t /*tmp2[2] = mad_f_mul(x[2], scale[2]) */
1158+ mulsatrndwh.w r2/*tmp2[3]*/, r2, r0:b /*tmp2[3] = mad_f_mul(x[3], scale[3]) */
1159+ ld.d r0, r12[14*4] /*r0 = x[15], r1 = x[14]*/
1160+ mulsatrndwh.w r7/*tmp2[16]*/, r7, r4:t /*tmp2[16] = mad_f_mul(x[16], scale[16]) */
1161+ mulsatrndwh.w r6/*tmp2[17]*/, r6, r4:b /*tmp2[17] = mad_f_mul(x[17], scale[17]) */
1162+ mulsatrndwh.w r1/*tmp2[14]*/, r1, r5:t /*tmp2[14] = mad_f_mul(x[14], scale[14]) */
1163+ mulsatrndwh.w r0/*tmp2[15]*/, r0, r5:b /*tmp2[15] = mad_f_mul(x[15], scale[15]) */
1164+
1165+ ld.d r4, r12[4*4] /*r4 = x[5], r5 = x[4]*/
1166+
1167+ sub lr/*tmp4[0]*/, r9, r6
1168+ add r6/*tmp3[0]*/, r9, r6
1169+ sub r10/*tmp4[1]*/, r8, r7
1170+ add r7/*tmp3[1]*/, r8, r7
1171+ sub r9/*tmp4[2]*/, r3, r0
1172+ add r0/*tmp3[2]*/, r3, r0
1173+ sub r8/*tmp4[3]*/, r2, r1
1174+ add r1/*tmp3[3]*/, r2, r1
1175+
1176+ ld.d r2, pc[scale_dctIV - . + 4*2] /*r2 = {scale[6], scale[7]}, r3 = { scale[4], scale[5] }*/
1177+
1178+ stm --sp, r8-r10, lr /*sp[0] = tmp4[0],sp[1] = tmp4[1],
1179+ sp[2] = tmp4[2],sp[3] = tmp4[3] */
1180+
1181+ /* Registers used: r0 = tmp3[2], r1 = tmp3[3], r6 = tmp3[0], r7 = tmp3[1], r12 = x
1182+ Free registers: r2-r5, r8-r11, lr
1183+ */
1184+ ld.d r8, r12[6*4] /*r8 = x[7], r9 = x[6]*/
1185+ ld.d r10, pc[scale_dctIV - . + 10*2] /*r10 = {scale[12], scale[13]}, r11 = { scale[10], scale[11] }*/
1186+ mulsatrndwh.w r5/*tmp2[4]*/, r5, r3:t /*tmp2[4] = mad_f_mul(x[4], scale[4]) */
1187+ mulsatrndwh.w r4/*tmp2[5]*/, r4, r3:b /*tmp2[5] = mad_f_mul(x[5], scale[5]) */
1188+ mulsatrndwh.w r9/*tmp2[6]*/, r9, r2:t /*tmp2[6] = mad_f_mul(x[6], scale[6]) */
1189+ mulsatrndwh.w r8/*tmp2[7]*/, r8, r2:b /*tmp2[7] = mad_f_mul(x[7], scale[7]) */
1190+
1191+ ld.d r2, r12[12*4] /*r2 = x[13], r3 = x[12]*/
1192+ ld.w lr, r12[11*4] /*lr = x[11] */
1193+ mulsatrndwh.w r3/*tmp2[12]*/, r3, r10:t /*tmp2[12] = mad_f_mul(x[12], scale[12]) */
1194+ mulsatrndwh.w r2/*tmp2[13]*/, r2, r10:b /*tmp2[13] = mad_f_mul(x[13], scale[13]) */
1195+ ld.w r10, r12[10*4] /*r10 = x[10] */
1196+ mulsatrndwh.w lr/*tmp2[11]*/, lr, r11:b /*tmp2[11] = mad_f_mul(x[11], scale[11]) */
1197+ mulsatrndwh.w r10/*tmp2[10]*/, r10, r11:t /*tmp2[10] = mad_f_mul(x[10], scale[10]) */
1198+
1199+ sub r11/*tmp4[4]*/, r5, r2
1200+ add r2/*tmp3[4]*/, r5, r2
1201+ sub r5/*tmp4[5]*/, r4, r3
1202+ add r3/*tmp3[5]*/, r4, r3
1203+ sub r4/*tmp4[6]*/, r9, lr
1204+ add lr/*tmp3[6]*/, r9, lr
1205+ sub r9/*tmp4[7]*/, r8, r10
1206+ add r10/*tmp3[7]*/, r8, r10
1207+ lddpc r8, scale_dctIV + 8*2 /*r8 = {scale[8], scale[9]} */
1208+
1209+ stm --sp, r4, r5, r9, r11 /*sp[0] = tmp4[4],sp[1] = tmp4[7],
1210+ sp[2] = tmp4[5],sp[3] = tmp4[6] */
1211+ ld.d r4, r12[8*4] /*r4 = x[9], r5 = x[8]*/
1212+ mulsatrndwh.w r5/*tmp2[8]*/, r5, r8:t /*tmp2[8] = mad_f_mul(x[8], scale[8]) */
1213+ mulsatrndwh.w r4/*tmp2[9]*/, r4, r8:b /*tmp2[9] = mad_f_mul(x[9], scale[9]) */
1214+ sub r9/*tmp4[8]*/, r5, r4
1215+ add r5/*tmp3[8]*/, r5, r4
1216+
1217+ st.w --sp, r9 /* sp[0] = tmp4[8] */
1218+
1219+ /* Registers used:
1220+
1221+ r0=tmp3[2], r1=tmp3[3], r2=tmp3[4], r3=tmp3[5], r5=tmp3[8], r6 = tmp3[0],
1222+ r7 = tmp3[1], r10=tmp3[7], lr=tmp3[6]
1223+ Free registers:
1224+ r4, r8, r9, r11, r12
1225+ */
1226+
1227+
1228+ /* SDCT-II */
1229+/*
1230+
1231+ {
1232+ mad_fixed_t tmp3[9];
1233+ int i;
1234+*/
1235+ /* scale[i] = 2 * cos(PI * (2 * i + 1) / (2 * 18)) */
1236+/*
1237+ static mad_fixed_t const scale[9] = {
1238+ MAD_F(0x1fe0d3b4), MAD_F(0x1ee8dd47), MAD_F(0x1d007930),
1239+ MAD_F(0x1a367e59), MAD_F(0x16a09e66), MAD_F(0x125abcf8),
1240+ MAD_F(0x0d8616bc), MAD_F(0x08483ee1), MAD_F(0x02c9fad7)
1241+ };
1242+*/
1243+ /* divide the 18-point SDCT-II into two 9-point SDCT-IIs */
1244+
1245+
1246+ /* fastdct */
1247+
1248+/*
1249+ {
1250+ mad_fixed_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12;
1251+ mad_fixed_t a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24, a25;
1252+ mad_fixed_t m0, m1, m2, m3, m4, m5, m6, m7;
1253+*/
1254+// enum {
1255+// c0 = MAD_F(0x1f838b8d), /* 2 * cos( 1 * PI / 18) */
1256+// c1 = MAD_F(0x1bb67ae8), /* 2 * cos( 3 * PI / 18) */
1257+// c2 = MAD_F(0x18836fa3), /* 2 * cos( 4 * PI / 18) */
1258+// c3 = MAD_F(0x1491b752), /* 2 * cos( 5 * PI / 18) */
1259+// c4 = MAD_F(0x0af1d43a), /* 2 * cos( 7 * PI / 18) */
1260+// c5 = MAD_F(0x058e86a0), /* 2 * cos( 8 * PI / 18) */
1261+// c6 = -MAD_F(0x1e11f642) /* 2 * cos(16 * PI / 18) */
1262+// };
1263+
1264+/*
1265+ a2 = tmp3[6] + tmp3[2];
1266+ a6 = tmp3[8] + tmp3[0];
1267+ a11 = a2 - a6;
1268+ m5 = mad_f_mul(a11, -c6) ;
1269+ a4 = tmp3[1] + tmp3[7];
1270+
1271+ a18 = tmp3[4] + a4;
1272+ a19 = -2 * tmp3[4] + a4;
1273+
1274+ a0 = tmp3[3] + tmp3[5];
1275+
1276+*/
1277+ add r11/*a4*/, r7, r10
1278+ add r12/*a18*/, r2, r11
1279+ sub r11/*a19*/, r11, r2<<1
1280+
1281+ add r4/*a2*/, lr, r0
1282+ add r8/*a6*/, r5, r6
1283+ sub r9/*a11*/, r4, r8
1284+
1285+ st.d --sp, r0 /* sp[0] = tmp3[3], sp[1] = tmp3[2]*/
1286+
1287+ mov r2, MAD_F(0x1e11f642)
1288+ mulsatrndwh.w r9/*m5*/, r9, r2:b
1289+
1290+ add r2/*a0*/, r1, r3
1291+
1292+ /* Registers used:
1293+
1294+ r2=a0, r3=tmp3[5], r4=a2, r5=tmp3[8], r6 = tmp3[0],
1295+ r7 = tmp3[1], r8=a6, r10=tmp3[7], r9=m5, r11=a19, r12=a18,lr=tmp3[6]
1296+ Free registers:
1297+ r0, r1
1298+ */
1299+
1300+/*
1301+ a8 = a0 + a2;
1302+ a12 = a8 + a6;
1303+ a10 = a0 - a6;
1304+ a9 = a0 - a2;
1305+ m7 = mad_f_mul(a9, -c2) ;
1306+ m6 = mad_f_mul(a10, -c5) ;
1307+*/
1308+
1309+ add r0/*a8*/, r2, r4
1310+ add r0/*a12*/, r8
1311+ rsub r8/*a10*/, r2
1312+ sub r2/*a9*/, r4
1313+ mov r1, -MAD_F(0x18836fa3)
1314+ mulsatrndwh.w r2/*m7*/, r2, r1:b
1315+ mov r1, -MAD_F(0x058e86a0)
1316+ mulsatrndwh.w r8/*m6*/, r8, r1:b
1317+
1318+ /* Registers used:
1319+
1320+ r0=a12, r2=m7, r3=tmp3[5], r5=tmp3[8], r6 = tmp3[0],
1321+ r7 = tmp3[1], r8=m6, r10=tmp3[7], r9=m5, r11=a19, r12=a18,lr=tmp3[6]
1322+ Free registers:
1323+ r1, r4
1324+ */
1325+
1326+
1327+/*
1328+ a21 = -a19 - (m5 << 1);
1329+ tmp[ 8] = a21 - (m6 << 1);
1330+
1331+ a20 = a19 - (m5 << 1);
1332+ tmp[ 4] = (m7 << 1) + a20;
1333+ a22 = -a19 + (m6 << 1);
1334+ tmp[16] = a22 + (m7 << 1);
1335+ tmp[ 0] = a18 + a12;
1336+ tmp[12] = a12 - 2 * a18;
1337+*/
1338+ add r1/*a21*/, r11, r9 << 1
1339+ neg r1
1340+ sub r1/*tmp[8]*/, r1, r8 << 1
1341+ stdsp sp[4*11/*tmp3[..] on the stack*/ + 8*4], r1
1342+ sub r4/*a20*/, r11, r9 << 1
1343+ add r4/*tmp[4]*/, r4, r2 << 1
1344+ stdsp sp[4*11/*tmp3[..] on the stack*/ + 4*4], r4
1345+ neg r11
1346+ add r1/*a22*/, r11, r8 << 1
1347+ add r1/*tmp[16]*/, r1, r2 << 1
1348+ stdsp sp[4*11/*tmp3[..] on the stack*/ + 16*4], r1
1349+ add r4, r12, r0
1350+ sub r1, r0, r12 << 1
1351+ stdsp sp[4*11/*tmp3[..] on the stack*/ + 0*4], r4
1352+ stdsp sp[4*11/*tmp3[..] on the stack*/ + 12*4], r1
1353+
1354+ ld.d r0, sp++
1355+
1356+ /* Registers used:
1357+
1358+ r0 = tmp3[2], r1 = tmp3[3], r3=tmp3[5], r5=tmp3[8], r6 = tmp3[0],
1359+ r7 = tmp3[1], r10=tmp3[7], r11=a19, lr=tmp3[6]
1360+ Free registers:
1361+ r2,r4,r8,r9,r12
1362+ */
1363+
1364+/*
1365+ a5 = tmp3[1] - tmp3[7];
1366+ a7 = tmp3[8] - tmp3[0];
1367+ a3 = tmp3[6] - tmp3[2];
1368+ a1 = tmp3[3] - tmp3[5];
1369+ a13 = a1 - a3;
1370+ a14 = a13 + a7;
1371+ m3 = mad_f_mul(a14, -c1) ;
1372+ m4 = mad_f_mul(a5, -c1) ;
1373+ tmp[ 6] = m3 << 1;
1374+*/
1375+ sub r7/*a5*/, r10
1376+ sub r2/*a7*/, r5, r6
1377+ sub r4/*a3*/, lr, r0
1378+ sub r8/*a1*/, r1, r3
1379+ sub r9/*a13*/, r8, r4
1380+ add r12/*a14*/, r9, r2
1381+ mov r0, -MAD_F(0x1bb67ae8)
1382+ mulsatrndwh.w r12/*m3*/, r12, r0:b
1383+ mulsatrndwh.w r7/*m4*/, r7, r0:b
1384+ lsl r12, 1
1385+ stdsp sp[4*9/*tmp4[..] on the stack*/ + 6*4], r12
1386+
1387+ /* Registers used:
1388+ r2 = a7, r4 = a3, r7 = m4, r8 = a1, r12 = m3
1389+
1390+ Free registers:
1391+ r0, r1, r3, r5, r6, r10, r9, r11, lr
1392+ */
1393+
1394+
1395+/*
1396+ a15 = a3 + a7;
1397+ m2 = mad_f_mul(a15, -c4) ;
1398+ a17 = a1 + a3;
1399+ m0 = mad_f_mul(a17, -c3) ;
1400+ a23 = (m4 << 1) + (m2 << 1);
1401+ tmp[14] = a23 + (m0 << 1); */
1402+ add r0/*a15*/, r4, r2
1403+ mov r1, -MAD_F(0x0af1d43a)
1404+ mulsatrndwh.w r0/*m2*/, r0, r1:b
1405+ mov r3, -MAD_F(0x1491b752)
1406+ add r5/*a17*/, r8, r4
1407+ mulsatrndwh.w r5/*m0*/, r5, r3:b
1408+ lsl r7, 1
1409+ add r6/*a23*/, r7, r0 << 1
1410+ add r6/*tmp[14]*/, r6, r5 << 1
1411+ stdsp sp[4*9/*tmp4[..] on the stack*/ + 14*4], r6
1412+
1413+ /* Registers used:
1414+ r0 = m2, r2 = a7, r5 = m0, r7 = m4, r8 = a1
1415+
1416+ Free registers:
1417+ r1, r3, r4, r6, r10, r9, r11, lr
1418+ */
1419+
1420+/*
1421+ a16 = a1 - a7;
1422+ m1 = mad_f_mul(a16, -c0) ;
1423+ a24 = (m4 << 1) - (m2 << 1);
1424+ tmp[10] = a24 - (m1 << 1);
1425+
1426+ a25 = (m4 << 1) + (m1 << 1);
1427+ tmp[ 2] = (m0 << 1) - a25;
1428+*/
1429+ sub r3/*a16*/, r8, r2
1430+ mov r4, -MAD_F(0x1f838b8d)
1431+ mulsatrndwh.w r3/*m1*/, r3, r4:b
1432+ sub r1/*a24*/, r7, r0 << 1
1433+ sub r1/*tmp[10]*/, r1, r3 << 1
1434+ stdsp sp[4*9/*tmp4[..] on the stack*/ + 10*4], r1
1435+ add r7/*a25*/, r7, r3 << 1
1436+ sub r7, r7, r5 << 1
1437+ neg r7
1438+ stdsp sp[4*9/*tmp4[..] on the stack*/ + 2*4], r7
1439+
1440+
1441+
1442+
1443+ /* output to every other slot for convenience */
1444+
1445+ /*} */
1446+ /* End fastdct */
1447+
1448+ /* odd input butterfly and scaling */
1449+
1450+
1451+ /* On the stack:
1452+ sp[0] = tmp4[8], sp[1] = tmp4[4],sp[2] = tmp4[7], sp[3] = tmp4[5],sp[4] = tmp4[6]
1453+ sp[5] = tmp4[0], sp[6] = tmp4[1],sp[7] = tmp4[2],sp[8] = tmp4[3]
1454+ */
1455+
1456+ /*
1457+ tmp3[0] = mad_f_mul(tmp4[0], scale[0]);
1458+ tmp3[1] = mad_f_mul(tmp4[1], scale[1]) << 1;
1459+ tmp3[2] = mad_f_mul(tmp4[2], scale[2]);
1460+ tmp3[3] = mad_f_mul(tmp4[3], scale[3]) << 1;
1461+ tmp3[4] = mad_f_mul(tmp4[4], scale[4]);
1462+ tmp3[5] = mad_f_mul(tmp4[5], scale[5]);
1463+ tmp3[6] = mad_f_mul(tmp4[6], scale[6]) << 1;
1464+ tmp3[7] = mad_f_mul(tmp4[7], scale[7]);
1465+ tmp3[8] = mad_f_mul(tmp4[8], scale[8]) << 1;
1466+ */
1467+ /* Registers used:
1468+ r1 = tmp4[3], r2 = tmp4[2], r3 = tmp4[1], r4 = tmp4[0], r7 = tmp4[6]
1469+ r10 = tmp4[5], r11 = tmp4[7], r12 = tmp4[4], lr = tmp4[8]
1470+
1471+ Free registers:
1472+ r0, r5, r6, r8, r9
1473+ */
1474+ ld.d r8, pc[ scale_sdctII - . + 4*2] /* r8 = { scale[6], scale[7] }, r9 = { scale[4], scale[5]} */
1475+ ldm sp++, r1, r2, r3, r4, r7, r10, r11, r12, lr
1476+ mov r5, MAD_F(0x02c9fad7) /* r5 = scale[8] */
1477+ mulsatrndwh.w r5/*tmp3[8]*/, lr, r5:b
1478+ mulsatrndwh.w lr/*tmp3[6]*/, r7, r8:t
1479+ ld.d r6, pc[ scale_sdctII - . + 0*2] /* r6 = { scale[2], scale[3] }, r7 = { scale[0], scale[1]} */
1480+ lsl lr, 1
1481+ lsl r5, 1
1482+ mulsatrndwh.w r0/*tmp3[2]*/, r2, r6:t
1483+ mulsatrndwh.w r1/*tmp3[3]*/, r1, r6:b
1484+ mulsatrndwh.w r6/*tmp3[0]*/, r4, r7:t
1485+ mulsatrndwh.w r7/*tmp3[1]*/, r3, r7:b
1486+ mulsatrndwh.w r3/*tmp3[5]*/, r10, r9:b
1487+ mulsatrndwh.w r2/*tmp3[4]*/, r12, r9:t
1488+ mulsatrndwh.w r9/*tmp3[7]*/, r11, r8:b
1489+ lsl r1, 1
1490+ lsl r7, 1
1491+
1492+
1493+ /* fastdct */
1494+
1495+/*
1496+ {
1497+ mad_fixed_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12;
1498+ mad_fixed_t a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24, a25;
1499+ mad_fixed_t m0, m1, m2, m3, m4, m5, m6, m7;
1500+*/
1501+// enum {
1502+// c0 = MAD_F(0x1f838b8d), /* 2 * cos( 1 * PI / 18) */
1503+// c1 = MAD_F(0x1bb67ae8), /* 2 * cos( 3 * PI / 18) */
1504+// c2 = MAD_F(0x18836fa3), /* 2 * cos( 4 * PI / 18) */
1505+// c3 = MAD_F(0x1491b752), /* 2 * cos( 5 * PI / 18) */
1506+// c4 = MAD_F(0x0af1d43a), /* 2 * cos( 7 * PI / 18) */
1507+// c5 = MAD_F(0x058e86a0), /* 2 * cos( 8 * PI / 18) */
1508+// c6 = -MAD_F(0x1e11f642) /* 2 * cos(16 * PI / 18) */
1509+// };
1510+
1511+ /* Registers used:
1512+
1513+ r0=tmp3[2], r1=tmp3[3], r2=tmp3[4], r3=tmp3[5], r5=tmp3[8], r6 = tmp3[0],
1514+ r7 = tmp3[1], r9=tmp3[7], lr=tmp3[6]
1515+ Free registers:
1516+ r4, r8, r10, r11, r12
1517+ */
1518+
1519+/*
1520+ a2 = tmp3[6] + (tmp3[2] << 1);
1521+ a6 = tmp3[8] + (tmp3[0] << 1);
1522+ a11 = a2 - a6;
1523+ m5 = mad_f_mul(a11, c6) ;
1524+ a4 = tmp3[1] + (tmp3[7] << 1);
1525+
1526+ a18 = (tmp3[4] << 1) + a4;
1527+ a19 = -2 * (tmp3[4] << 1) + a4;
1528+
1529+ a0 = tmp3[3] + (tmp3[5] << 1);
1530+
1531+*/
1532+ add r11/*a4*/, r7, r9 << 1
1533+ add r12/*a18*/, r11, r2 << 1
1534+ sub r11/*a19*/, r11, r2 << 2
1535+
1536+ add r4/*a2*/, lr, r0 << 1
1537+ add r8/*a6*/, r5, r6 << 1
1538+ sub r10/*a11*/, r4, r8
1539+
1540+ st.d --sp, r0 /* sp[0] = tmp3[3], sp[1] = tmp3[2]*/
1541+
1542+ mov r2, -MAD_F(0x1e11f642)
1543+ mulsatrndwh.w r10/*m5*/, r10, r2:b
1544+
1545+ add r2/*a0*/, r1, r3 << 1
1546+
1547+ /* Registers used:
1548+
1549+ r2=a0, r3=tmp3[5], r4=a2, r5=tmp3[8], r6 = tmp3[0],
1550+ r7 = tmp3[1], r8=a6, r9=tmp3[7], r10=m5, r11=a19, r12=a18,lr=tmp3[6]
1551+ Free registers:
1552+ r0, r1
1553+ */
1554+
1555+/*
1556+ a8 = a0 + a2;
1557+ a12 = a8 + a6;
1558+ a10 = a0 - a6;
1559+ a9 = a0 - a2;
1560+ m7 = mad_f_mul(a9, -c2) ;
1561+ m6 = mad_f_mul(a10, -c5) ;
1562+*/
1563+
1564+ add r0/*a8*/, r2, r4
1565+ add r0/*a12*/, r8
1566+ rsub r8/*a10*/, r2
1567+ sub r2/*a9*/, r4
1568+ mov r1, -MAD_F(0x18836fa3)
1569+ mulsatrndwh.w r2/*m7*/, r2, r1:b
1570+ mov r1, -MAD_F(0x058e86a0)
1571+ mulsatrndwh.w r8/*m6*/, r8, r1:b
1572+
1573+ /* Registers used:
1574+
1575+ r0=a12, r2=m7, r3=tmp3[5], r5=tmp3[8], r6 = tmp3[0],
1576+ r7 = tmp3[1], r8=m6, r9=tmp3[7], r10=m5, r11=a19, r12=a18,lr=tmp3[6]
1577+ Free registers:
1578+ r1, r4
1579+ */
1580+
1581+
1582+/*
1583+ a21 = -a19 + (m5 << 1);
1584+ tmp[ 9] = a21 - (m6 << 1);
1585+
1586+ a20 = -(-a19 - (m5 << 1));
1587+ tmp[ 5] = (m7 << 1) + a20;
1588+ a22 = -a19 + (m6 << 1);
1589+ tmp[17] = a22 + (m7 << 1);
1590+ tmp[ 1] = a18 + a12;
1591+ tmp[13] = a12 - 2 * a18;
1592+*/
1593+ sub r1/*a21*/, r11, r10 << 1
1594+ neg r1
1595+ sub r1/*tmp[9]*/, r1, r8 << 1
1596+ stdsp sp[4*2/*tmp3[..] on the stack*/ + 9*4], r1
1597+ add r4/*a20*/, r11, r10 << 1
1598+ add r4/*tmp[5]*/, r4, r2 << 1
1599+ stdsp sp[4*2/*tmp3[..] on the stack*/ + 5*4], r4
1600+ neg r11
1601+ add r1/*a22*/, r11, r8 << 1
1602+ add r1/*tmp[17]*/, r1, r2 << 1
1603+ stdsp sp[4*2/*tmp3[..] on the stack*/ + 17*4], r1
1604+ add r4, r12, r0
1605+ sub r1, r0, r12 << 1
1606+ stdsp sp[4*2/*tmp3[..] on the stack*/ + 1*4], r4
1607+ stdsp sp[4*2/*tmp3[..] on the stack*/ + 13*4], r1
1608+
1609+ ld.d r0, sp++
1610+
1611+ /* Registers used:
1612+
1613+ r0 = tmp3[2], r1 = tmp3[3], r3=tmp3[5], r5=tmp3[8], r6 = tmp3[0],
1614+ r7 = tmp3[1], r9=tmp3[7], r11=a19, lr=tmp3[6]
1615+ Free registers:
1616+ r2,r4,r8,r10,r12
1617+ */
1618+
1619+/*
1620+ a5 = tmp3[1] - (tmp3[7] << 1);
1621+ a7 = tmp3[8] - (tmp3[0] << 1);
1622+ a3 = tmp3[6] - (tmp3[2] << 1);
1623+ a1 = tmp3[3] - (tmp3[5] << 1);
1624+ a13 = a1 - a3;
1625+ a14 = a13 + a7;
1626+ m3 = mad_f_mul(a14, -c1) ;
1627+ m4 = mad_f_mul(a5, -c1) ;
1628+ tmp[ 7] = m3 << 1;
1629+*/
1630+ sub r7/*a5*/, r7, r9 << 1
1631+ sub r2/*a7*/, r5, r6 << 1
1632+ sub r4/*a3*/, lr, r0 << 1
1633+ sub r8/*a1*/, r1, r3 << 1
1634+ sub r10/*a13*/, r8, r4
1635+ add r12/*a14*/, r10, r2
1636+ mov r0, -MAD_F(0x1bb67ae8)
1637+ mulsatrndwh.w r12/*m3*/, r12, r0:b
1638+ mulsatrndwh.w r7/*m4*/, r7, r0:b
1639+ lsl r12, 1
1640+ stdsp sp[7*4], r12
1641+
1642+ /* Registers used:
1643+ r2 = a7, r4 = a3, r7 = m4, r8 = a1, r12 = m3
1644+
1645+ Free registers:
1646+ r0, r1, r3, r5, r6, r9, r10, r11, lr
1647+ */
1648+
1649+
1650+/*
1651+ a15 = a3 + a7;
1652+ m2 = mad_f_mul(a15, -c4) ;
1653+ a17 = a1 + a3;
1654+ m0 = mad_f_mul(a17, -c3) ;
1655+ a23 = (m4 << 1) + (m2 << 1);
1656+ tmp[15] = a23 + (m0 << 1); */
1657+ add r0/*a15*/, r4, r2
1658+ mov r1, -MAD_F(0x0af1d43a)
1659+ mulsatrndwh.w r0/*m2*/, r0, r1:b
1660+ mov r3, -MAD_F(0x1491b752)
1661+ add r5/*a17*/, r8, r4
1662+ mulsatrndwh.w r5/*m0*/, r5, r3:b
1663+ lsl r7, 1
1664+ add r6/*a23*/, r7, r0 << 1
1665+ add r6/*tmp[15]*/, r6, r5 << 1
1666+ stdsp sp[15*4], r6
1667+
1668+ /* Registers used:
1669+ r0 = m2, r2 = a7, r5 = m0, r7 = m4, r8 = a1
1670+
1671+ Free registers:
1672+ r1, r3, r4, r6, r9, r10, r11, lr
1673+ */
1674+
1675+/*
1676+ a16 = a1 - a7;
1677+ m1 = mad_f_mul(a16, -c0) ;
1678+ a24 = (m4 << 1) - (m2 << 1);
1679+ tmp[11] = a24 - (m1 << 1);
1680+
1681+ a25 = (m4 << 1) + (m1 << 1);
1682+ tmp[ 3] = (m0 << 1) - a25;
1683+*/
1684+ sub r3/*a16*/, r8, r2
1685+ mov r4, -MAD_F(0x1f838b8d)
1686+ mulsatrndwh.w r3/*m1*/, r3, r4:b
1687+ sub r1/*a24*/, r7, r0 << 1
1688+ sub r1/*tmp[11]*/, r1, r3 << 1
1689+ stdsp sp[11*4], r1
1690+ add r7/*a25*/, r7, r3 << 1
1691+ sub r7, r7, r5 << 1
1692+ neg r7
1693+ lddsp r12, sp[4*18+4] /* Get y from stack */
1694+ stdsp sp[3*4], r7
1695+
1696+
1697+ /* output to every other slot for convenience */
1698+
1699+ /* End fastdct */
1700+
1701+ /* output accumulation */
1702+
1703+/* for (i = 3; i < 18; i += 8) {
1704+ tmp[i + 0] -= tmp[(i + 0) - 2];
1705+ tmp[i + 2] -= tmp[(i + 2) - 2];
1706+ tmp[i + 4] -= tmp[(i + 4) - 2];
1707+ tmp[i + 6] -= tmp[(i + 6) - 2];
1708+ }
1709+ }
1710+*/
1711+
1712+/* End SDCT-II */
1713+
1714+
1715+
1716+ /* scale reduction and output accumulation */
1717+
1718+/*
1719+ for (i = 1; i < 17; i += 4) {
1720+ tmp[i + 0] = tmp[i + 0] - tmp[(i + 0) - 1];
1721+ tmp[i + 1] = tmp[i + 1] - tmp[(i + 1) - 1];
1722+ tmp[i + 2] = tmp[i + 2] - tmp[(i + 2) - 1];
1723+ tmp[i + 3] = tmp[i + 3] - tmp[(i + 3) - 1];
1724+ }
1725+ tmp[17] = tmp[17] - tmp[16];
1726+ }
1727+*/
1728+/* End DCT-IV */
1729+
1730+
1731+ /* convert 18-point DCT-IV to 36-point IMDCT */
1732+
1733+/*
1734+ for (i = 0; i < 9; i += 3) {
1735+ y[i + 0] = tmp[9 + (i + 0)];
1736+ y[i + 1] = tmp[9 + (i + 1)];
1737+ y[i + 2] = tmp[9 + (i + 2)];
1738+ }
1739+ for (i = 9; i < 27; i += 3) {
1740+ y[i + 0] = -tmp[36 - (9 + (i + 0)) - 1];
1741+ y[i + 1] = -tmp[36 - (9 + (i + 1)) - 1];
1742+ y[i + 2] = -tmp[36 - (9 + (i + 2)) - 1];
1743+ }
1744+ for (i = 27; i < 36; i += 3) {
1745+ y[i + 0] = -tmp[(i + 0) - 27];
1746+ y[i + 1] = -tmp[(i + 1) - 27];
1747+ y[i + 2] = -tmp[(i + 2) - 27];
1748+ }
1749+ }
1750+*/
1751+
1752+ /* Registers used:
1753+ r0 = tmp[8], r1 = tmp[7], r2 = tmp[6], r3 = tmp[5], r4 = tmp[4]
1754+ r5 = tmp[3], r6 = tmp[2], r7 = tmp[1], r8 = tmp[0], r12 = y
1755+
1756+ Free registers:
1757+ r9, r10, r11, lr
1758+ */
1759+
1760+ ldm sp++, r0-r8 /* Get tmp[0]-tmp[8] from stack */
1761+ sub r5, r7 /* tmp[3] -= tmp[1]*/
1762+ sub r3, r5 /* tmp[5] -= tmp[3]*/
1763+ sub r1, r3 /* tmp[7] -= tmp[5]*/
1764+
1765+ sub r7, r8 /* tmp[1] -= tmp[0]*/
1766+ sub r6, r7 /* tmp[2] -= tmp[1]*/
1767+ sub r5, r6 /* tmp[3] -= tmp[2]*/
1768+ neg r8
1769+ st.w r12[26*4], r8 /* y[26] = -tmp[0] */
1770+ st.w r12[27*4], r8 /* y[27] = -tmp[0] */
1771+ neg r7
1772+ neg r6
1773+ st.w r12[25*4], r7 /* y[25] = -tmp[1] */
1774+ st.w r12[24*4], r6 /* y[24] = -tmp[2] */
1775+ st.d r12[28*4], r6 /* y[28] = -tmp[1], y[29] = -tmp[2]*/
1776+
1777+ sub r4, r5 /* tmp[4] -= tmp[3]*/
1778+ sub r3, r4 /* tmp[5] -= tmp[4]*/
1779+ neg r5
1780+ neg r4
1781+ st.w r12[23*4], r5 /* y[23] = -tmp[3] */
1782+ st.w r12[22*4], r4 /* y[22] = -tmp[4] */
1783+ st.d r12[30*4], r4 /* y[30] = -tmp[3], y[31] = -tmp[4]*/
1784+
1785+ ldm sp++, r4-r11,lr /* Get tmp[9]-tmp[17] from stack */
1786+
1787+ sub r2, r3 /* tmp[6] -= tmp[5]*/
1788+
1789+ sub lr, r1 /* tmp[9] -= tmp[7]*/
1790+ sub r10, lr /* tmp[11] -= tmp[9]*/
1791+ sub r8, r10 /* tmp[13] -= tmp[11]*/
1792+ sub r6, r8 /* tmp[15] -= tmp[13]*/
1793+ sub r4, r6 /* tmp[17] -= tmp[15]*/
1794+
1795+ sub r1, r2 /* tmp[7] -= tmp[6]*/
1796+ sub r0, r1 /* tmp[8] -= tmp[7]*/
1797+ neg r3
1798+ neg r2
1799+ st.w r12[21*4], r3 /* y[21] = -tmp[5] */
1800+ st.w r12[20*4], r2 /* y[20] = -tmp[6] */
1801+ st.d r12[32*4], r2 /* y[32] = -tmp[5], y[33] = -tmp[6]*/
1802+
1803+ sub lr, r0 /* tmp[9] -= tmp[8]*/
1804+ sub r11, lr /* tmp[10] -= tmp[9]*/
1805+ neg r1
1806+ neg r0
1807+ st.w r12[19*4], r1 /* y[19] = -tmp[7] */
1808+ st.w r12[18*4], r0 /* y[18] = -tmp[8] */
1809+ st.d r12[34*4], r0 /* y[34] = -tmp[7], y[35] = -tmp[8]*/
1810+
1811+ sub r10, r11 /* tmp[11] -= tmp[10]*/
1812+ sub r9, r10 /* tmp[12] -= tmp[11]*/
1813+
1814+ st.w r12[0*4], lr /* y[0] = tmp[9]*/
1815+ neg lr
1816+ st.w r12[17*4], lr /* y[17] = -tmp[9]*/
1817+ st.d r12[1*4], r10 /* y[1] = tmp[10], y[2] = tmp[11] */
1818+ neg r11
1819+ neg r10
1820+ st.w r12[16*4], r11 /* y[16] = -tmp[10] */
1821+ st.w r12[15*4], r10 /* y[15] = -tmp[11] */
1822+
1823+
1824+ sub r8, r9 /* tmp[13] -= tmp[12]*/
1825+ sub r7, r8 /* tmp[14] -= tmp[13]*/
1826+ st.d r12[3*4], r8 /* y[3] = tmp[12], y[4] = tmp[13] */
1827+ neg r9
1828+ neg r8
1829+ st.w r12[14*4], r9 /* y[14] = -tmp[12] */
1830+ st.w r12[13*4], r8 /* y[13] = -tmp[13] */
1831+
1832+ sub r6, r7 /* tmp[15] -= tmp[14]*/
1833+ sub r5, r6 /* tmp[16] -= tmp[15]*/
1834+ sub r4, r5 /* tmp[17] -= tmp[16]*/
1835+
1836+ st.d r12[5*4], r6 /* y[5] = tmp[14], y[6] = tmp[15] */
1837+ neg r7
1838+ neg r6
1839+ st.w r12[12*4], r7 /* y[12] = -tmp[14] */
1840+ st.w r12[11*4], r6 /* y[11] = -tmp[15] */
1841+
1842+ st.d r12[7*4], r4 /* y[7] = tmp[16], y[8] = tmp[17] */
1843+ neg r5
1844+ neg r4
1845+ st.w r12[10*4], r5 /* y[10] = -tmp[16] */
1846+ st.w r12[9*4], r4 /* y[9] = -tmp[17] */
1847+
1848+ popm r0-r7,r11,pc
1849+
1850+ .align 2
1851+scale_dctIV:
1852+ .short MAD_F(0x1ff833fa), MAD_F(0x1fb9ea93), MAD_F(0x1f3dd120)
1853+ .short MAD_F(0x1e84d969), MAD_F(0x1d906bcf), MAD_F(0x1c62648b)
1854+ .short MAD_F(0x1afd100f), MAD_F(0x1963268b), MAD_F(0x1797c6a4)
1855+ .short MAD_F(0x159e6f5b), MAD_F(0x137af940), MAD_F(0x11318ef3)
1856+ .short MAD_F(0x0ec6a507), MAD_F(0x0c3ef153), MAD_F(0x099f61c5)
1857+ .short MAD_F(0x06ed12c5), MAD_F(0x042d4544), MAD_F(0x0165547c)
1858+
1859+ .align 2
1860+scale_sdctII:
1861+ .short MAD_F(0x1fe0d3b4), MAD_F(0x1ee8dd47), MAD_F(0x1d007930)
1862+ .short MAD_F(0x1a367e59), MAD_F(0x16a09e66), MAD_F(0x125abcf8)
1863+ .short MAD_F(0x0d8616bc), MAD_F(0x08483ee1), MAD_F(0x02c9fad7)
1864diff --git a/layer3.c b/layer3.c
1865index 4e5d3fa..dffdab3 100644
1866--- a/layer3.c
1867+++ b/layer3.c
1868@@ -378,6 +378,11 @@ mad_fixed_t const ca[8] = {
1869 -MAD_F(0x003a2847) /* -0.014198569 */, -MAD_F(0x000f27b4) /* -0.003699975 */
1870 };
1871
1872+#ifdef FPM_AVR32
1873+# undef MAD_F
1874+# define MAD_F(x) ((x + (1 << 12)) >> 13)
1875+#endif
1876+
1877 /*
1878 * IMDCT coefficients for short blocks
1879 * derived from section 2.4.3.4.10.2 of ISO/IEC 11172-3
1880@@ -386,7 +391,7 @@ mad_fixed_t const ca[8] = {
1881 * imdct_s[i /odd][k] = cos((PI / 24) * (2 * (6 + (i-1)/2) + 7) * (2 * k + 1))
1882 */
1883 static
1884-mad_fixed_t const imdct_s[6][6] = {
1885+mad_coeff_t const imdct_s[6][6] = {
1886 # include "imdct_s.dat"
1887 };
1888
1889@@ -398,7 +403,7 @@ mad_fixed_t const imdct_s[6][6] = {
1890 * window_l[i] = sin((PI / 36) * (i + 1/2))
1891 */
1892 static
1893-mad_fixed_t const window_l[36] = {
1894+mad_coeff_t const window_l[36] = {
1895 MAD_F(0x00b2aa3e) /* 0.043619387 */, MAD_F(0x0216a2a2) /* 0.130526192 */,
1896 MAD_F(0x03768962) /* 0.216439614 */, MAD_F(0x04cfb0e2) /* 0.300705800 */,
1897 MAD_F(0x061f78aa) /* 0.382683432 */, MAD_F(0x07635284) /* 0.461748613 */,
1898@@ -429,7 +434,7 @@ mad_fixed_t const window_l[36] = {
1899 * window_s[i] = sin((PI / 12) * (i + 1/2))
1900 */
1901 static
1902-mad_fixed_t const window_s[12] = {
1903+mad_coeff_t const window_s[12] = {
1904 MAD_F(0x0216a2a2) /* 0.130526192 */, MAD_F(0x061f78aa) /* 0.382683432 */,
1905 MAD_F(0x09bd7ca0) /* 0.608761429 */, MAD_F(0x0cb19346) /* 0.793353340 */,
1906 MAD_F(0x0ec835e8) /* 0.923879533 */, MAD_F(0x0fdcf549) /* 0.991444861 */,
1907@@ -438,6 +443,11 @@ mad_fixed_t const window_s[12] = {
1908 MAD_F(0x061f78aa) /* 0.382683432 */, MAD_F(0x0216a2a2) /* 0.130526192 */,
1909 };
1910
1911+#ifdef FPM_AVR32
1912+# undef MAD_F
1913+# define MAD_F(x) ((mad_fixed_t) (x##L))
1914+#endif
1915+
1916 /*
1917 * coefficients for intensity stereo processing
1918 * derived from section 2.4.3.4.9.3 of ISO/IEC 11172-3
1919@@ -879,6 +889,42 @@ void III_exponents(struct channel const *channel,
1920 * NAME: III_requantize()
1921 * DESCRIPTION: requantize one (positive) value
1922 */
1923+
1924+#if 0
1925+/*static*/
1926+mad_fixed_t III_requantize(unsigned int value, signed int exp)
1927+{
1928+ register mad_fixed_t tmp2, tmp3;
1929+ long long tmp_d;
1930+
1931+ asm ("asr\t%0, %1, 2\n"
1932+ "ld.w\t%2, %4[%5 << 2]\n"
1933+ "sub\t%1, %1, %0 << 2\n"
1934+ "asr\t%3, %2, 7\n"
1935+ "andl\t%2, 0x7f, COH\n"
1936+ "add\t%0, %2\n"
1937+ "lsl\t%m0,%3,%0\n"
1938+ "neg\t%0\n"
1939+ "asr\t%3,%3,%0\n"
1940+ "add\t%2, %6, %1 << 2\n"
1941+ "ld.w\t%2, %2[12]\n"
1942+ "cp.w\t%0, 0\n"
1943+ "movlt\t%3, %m0\n"
1944+ "muls.d\t%0, %3, %2\n"
1945+ "cp.w\t%1, 0\n"
1946+ "breq\t0f\n"
1947+ "lsr\t%0, %0, 28\n"
1948+ "or\t%3, %0, %m0 << 4\n"
1949+ "0:\n"
1950+ : "=&r"(tmp_d), "+r"(exp), "=&r"(tmp2), "=&r"(tmp3)
1951+ : "r"(&rq_table), "r"(value), "r"(root_table));
1952+
1953+
1954+ return tmp3;
1955+}
1956+
1957+#else
1958+
1959 static
1960 mad_fixed_t III_requantize(unsigned int value, signed int exp)
1961 {
1962@@ -918,6 +964,7 @@ mad_fixed_t III_requantize(unsigned int value, signed int exp)
1963
1964 return frac ? mad_f_mul(requantized, root_table[3 + frac]) : requantized;
1965 }
1966+#endif
1967
1968 /* we must take care that sz >= bits and sz < sizeof(long) lest bits == 0 */
1969 # define MASK(cache, sz, bits) \
1970@@ -2054,27 +2101,42 @@ void imdct36(mad_fixed_t const X[18], mad_fixed_t x[36])
1971 }
1972 # endif
1973
1974+
1975+#ifdef FPM_AVR32
1976+# undef mad_f_mul
1977+# define mad_f_mul(x, y) __builtin_mulsatrndwh_w(x, y)
1978+#endif
1979+
1980 /*
1981 * NAME: III_imdct_l()
1982 * DESCRIPTION: perform IMDCT and windowing for long blocks
1983 */
1984 static
1985-void III_imdct_l(mad_fixed_t const X[18], mad_fixed_t z[36],
1986+void III_imdct_l(mad_fixed_t /*const*/ X[18], mad_fixed_t z[36],
1987 unsigned int block_type)
1988 {
1989 unsigned int i;
1990+ mad_fixed_t *z_ptr;
1991+ mad_coeff_t *w_ptr;
1992
1993 /* IMDCT */
1994
1995+#ifdef FPM_AVR32
1996+ imdct36_avr32(X, z);
1997+#else
1998 imdct36(X, z);
1999+#endif
2000
2001 /* windowing */
2002
2003+ z_ptr = &z[0];
2004+ w_ptr = &window_l[0];
2005+
2006 switch (block_type) {
2007 case 0: /* normal window */
2008 # if defined(ASO_INTERLEAVE1)
2009 {
2010- register mad_fixed_t tmp1, tmp2;
2011+ register mad_coeff_t tmp1, tmp2;
2012
2013 tmp1 = window_l[0];
2014 tmp2 = window_l[1];
2015@@ -2091,15 +2153,16 @@ void III_imdct_l(mad_fixed_t const X[18], mad_fixed_t z[36],
2016 }
2017 # elif defined(ASO_INTERLEAVE2)
2018 {
2019- register mad_fixed_t tmp1, tmp2;
2020+ register mad_fixed_t tmp1;
2021+ register mad_coeff_t tmp2;
2022
2023- tmp1 = z[0];
2024- tmp2 = window_l[0];
2025+ tmp1 = *z_ptr;
2026+ tmp2 = *w_ptr++;
2027
2028 for (i = 0; i < 35; ++i) {
2029- z[i] = mad_f_mul(tmp1, tmp2);
2030- tmp1 = z[i + 1];
2031- tmp2 = window_l[i + 1];
2032+ *z_ptr++ = mad_f_mul(tmp1, tmp2);
2033+ tmp1 = *z_ptr;
2034+ tmp2 = *w_ptr++;
2035 }
2036
2037 z[35] = mad_f_mul(tmp1, tmp2);
2038@@ -2118,23 +2181,28 @@ void III_imdct_l(mad_fixed_t const X[18], mad_fixed_t z[36],
2039
2040 case 1: /* start block */
2041 for (i = 0; i < 18; i += 3) {
2042- z[i + 0] = mad_f_mul(z[i + 0], window_l[i + 0]);
2043- z[i + 1] = mad_f_mul(z[i + 1], window_l[i + 1]);
2044- z[i + 2] = mad_f_mul(z[i + 2], window_l[i + 2]);
2045+ *(z_ptr++) = mad_f_mul(*z_ptr, *w_ptr++);
2046+ *(z_ptr++) = mad_f_mul(*z_ptr, *w_ptr++);
2047+ *(z_ptr++) = mad_f_mul(*z_ptr, *w_ptr++);
2048 }
2049+ z_ptr += 6;
2050+ w_ptr = &window_s[6];
2051 /* (i = 18; i < 24; ++i) z[i] unchanged */
2052- for (i = 24; i < 30; ++i) z[i] = mad_f_mul(z[i], window_s[i - 18]);
2053- for (i = 30; i < 36; ++i) z[i] = 0;
2054+ for (i = 24; i < 30; ++i) *z_ptr++ = mad_f_mul(*z_ptr, *w_ptr++);
2055+ for (i = 30; i < 36; ++i) *z_ptr++ = 0;
2056 break;
2057
2058 case 3: /* stop block */
2059- for (i = 0; i < 6; ++i) z[i] = 0;
2060- for (i = 6; i < 12; ++i) z[i] = mad_f_mul(z[i], window_s[i - 6]);
2061+ w_ptr = &window_s[0];
2062+ for (i = 0; i < 6; ++i) *z_ptr++ = 0;
2063+ for (i = 6; i < 12; ++i) *z_ptr++ = mad_f_mul(*z_ptr, *w_ptr++);
2064 /* (i = 12; i < 18; ++i) z[i] unchanged */
2065+ w_ptr = &window_l[18];
2066+ z_ptr += 6;
2067 for (i = 18; i < 36; i += 3) {
2068- z[i + 0] = mad_f_mul(z[i + 0], window_l[i + 0]);
2069- z[i + 1] = mad_f_mul(z[i + 1], window_l[i + 1]);
2070- z[i + 2] = mad_f_mul(z[i + 2], window_l[i + 2]);
2071+ *z_ptr++ = mad_f_mul(*z_ptr, *w_ptr++ );
2072+ *z_ptr++ = mad_f_mul(*z_ptr, *w_ptr++);
2073+ *z_ptr++ = mad_f_mul(*z_ptr, *w_ptr++);
2074 }
2075 break;
2076 }
2077@@ -2146,10 +2214,10 @@ void III_imdct_l(mad_fixed_t const X[18], mad_fixed_t z[36],
2078 * DESCRIPTION: perform IMDCT and windowing for short blocks
2079 */
2080 static
2081-void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
2082+void III_imdct_s(mad_fixed_t /*const*/ X[18], mad_fixed_t z[36])
2083 {
2084 mad_fixed_t y[36], *yptr;
2085- mad_fixed_t const *wptr;
2086+ mad_coeff_t const *wptr;
2087 int w, i;
2088 register mad_fixed64hi_t hi;
2089 register mad_fixed64lo_t lo;
2090@@ -2159,11 +2227,56 @@ void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
2091 yptr = &y[0];
2092
2093 for (w = 0; w < 3; ++w) {
2094- register mad_fixed_t const (*s)[6];
2095+ register mad_coeff_t const (*s)[6];
2096
2097 s = imdct_s;
2098
2099 for (i = 0; i < 3; ++i) {
2100+#ifdef FPM_AVR32
2101+ register long long int acc, tmp1, tmp2, tmp3, tmp4;
2102+ asm volatile ("ld.d\t%0, %5++\n"
2103+ "ld.d\t%1, %6[0]\n"
2104+ "ld.d\t%2, %6[2*4]\n"
2105+ "ld.d\t%3, %6[4*4]\n"
2106+ "mulwh.d\t%4, %m1, %m0:t\n"
2107+ "macwh.d\t%4, %1, %m0:b\n"
2108+ "ld.w\t%m0, %5++\n"
2109+ "macwh.d\t%4, %m2, %0:t\n"
2110+ "macwh.d\t%4, %2, %0:b\n"
2111+ "macwh.d\t%4, %m3, %m0:t\n"
2112+ "macwh.d\t%4, %3, %m0:b\n"
2113+ "ld.d\t%0, %5++\n"
2114+ "rol\t%4\n"
2115+ "rol\t%m4\n"
2116+ : "=&r"(tmp1), "=&r"(tmp2), "=&r"(tmp3), "=&r"(tmp4),
2117+ "=&r"(acc), "+r"(s)
2118+ : "r"(X));
2119+
2120+ asm volatile ("st.w\t%1[0], %m0\n"
2121+ "neg\t%m0\n"
2122+ "st.w\t%2[5*4], %m0\n"
2123+ : "+r"(acc)
2124+ : "r"(&yptr[i]), "r"(&yptr[-i]));
2125+
2126+ asm volatile ("mulwh.d\t%4, %m1, %m0:t\n"
2127+ "macwh.d\t%4, %1, %m0:b\n"
2128+ "ld.w\t%m0, %5++\n"
2129+ "macwh.d\t%4, %m2, %0:t\n"
2130+ "macwh.d\t%4, %2, %0:b\n"
2131+ "macwh.d\t%4, %m3, %m0:t\n"
2132+ "macwh.d\t%4, %3, %m0:b\n"
2133+ "rol\t%4\n"
2134+ "rol\t%m4\n"
2135+ : "+r"(tmp1), "+r"(tmp2), "+r"(tmp3), "+r"(tmp4),
2136+ "=&r"(acc), "+r"(s)
2137+ : "r"(X));
2138+
2139+ asm volatile ( "st.w\t%1[6*4], %m0\n"
2140+ "st.w\t%2[11*4], %m0\n"
2141+ :: "r"(acc), "r"(&yptr[i]), "r"(&yptr[-i]));
2142+
2143+
2144+#else
2145 MAD_F_ML0(hi, lo, X[0], (*s)[0]);
2146 MAD_F_MLA(hi, lo, X[1], (*s)[1]);
2147 MAD_F_MLA(hi, lo, X[2], (*s)[2]);
2148@@ -2187,6 +2300,7 @@ void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
2149 yptr[11 - i] = yptr[i + 6];
2150
2151 ++s;
2152+#endif
2153 }
2154
2155 yptr += 12;
2156@@ -2198,6 +2312,196 @@ void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
2157 yptr = &y[0];
2158 wptr = &window_s[0];
2159
2160+#ifdef FPM_AVR32
2161+ /* z[0] = 0;
2162+ z[1] = 0;
2163+ z[2] = 0;
2164+ z[3] = 0;
2165+ z[4] = 0;
2166+ z[5] = 0;
2167+ z[30] = 0;
2168+ z[31] = 0;
2169+ z[32] = 0;
2170+ z[33] = 0;
2171+ z[34] = 0;
2172+ z[35] = 0;
2173+ */
2174+ {
2175+ register long long int tmp, tmp2, tmp3, w0123, w4567, w891011;
2176+ asm volatile ("mov\t%m0, 0\n"
2177+ "mov\t%0, %m0\n"
2178+ "st.d\t%1[0], %0\n"
2179+ "st.d\t%1[2*4], %0\n"
2180+ "st.d\t%1[4*4], %0\n"
2181+ "st.d\t%1[30*4], %0\n"
2182+ "st.d\t%1[32*4], %0\n"
2183+ "st.d\t%1[34*4], %0\n"
2184+ : "=&r"(tmp) : "r"(z));
2185+
2186+
2187+
2188+ /*
2189+ z[6] = mad_f_mul(yptr [0], wptr[0]);
2190+ z[7] = mad_f_mul(yptr [1], wptr[1]);
2191+ z[8] = mad_f_mul(yptr [2], wptr[2]);
2192+ z[9] = mad_f_mul(yptr [3], wptr[3]);
2193+ z[10] = mad_f_mul(yptr[4], wptr[4]);
2194+ z[11] = mad_f_mul(yptr[5], wptr[5]);
2195+ z[24] = mad_f_mul(yptr [30], wptr[6]);
2196+ z[25] = mad_f_mul(yptr [31], wptr[7]);
2197+ z[26] = mad_f_mul(yptr [32], wptr[8]);
2198+ z[27] = mad_f_mul(yptr [33], wptr[9]);
2199+ z[28] = mad_f_mul(yptr[34], wptr[10]);
2200+ z[29] = mad_f_mul(yptr[35], wptr[11]);
2201+ */
2202+
2203+
2204+ asm volatile ("ld.d\t%0, %5[0*4]\n"
2205+ "ld.d\t%3, %6[0*4]\n"
2206+ "ld.d\t%1, %5[2*4]\n"
2207+ "ld.d\t%2, %5[4*4]\n"
2208+ "mulsatrndwh.w\t%m3, %m3, %m0:t\n"
2209+ "mulsatrndwh.w\t%3, %3, %m0:b\n"
2210+ "ld.d\t%4, %6[2*4]\n"
2211+ "st.d\t%7[6*4], %3\n"
2212+
2213+ "mulsatrndwh.w\t%m4, %m4, %0:t\n"
2214+ "mulsatrndwh.w\t%4, %4, %0:b\n"
2215+ "ld.d\t%3, %6[4*4]\n"
2216+ "st.d\t%7[8*4], %4\n"
2217+
2218+ "mulsatrndwh.w\t%m3, %m3, %m1:t\n"
2219+ "mulsatrndwh.w\t%3, %3, %m1:b\n"
2220+ "ld.d\t%4, %6[30*4]\n"
2221+ "st.d\t%7[10*4], %3\n"
2222+
2223+ "mulsatrndwh.w\t%m4, %m4, %1:t\n"
2224+ "mulsatrndwh.w\t%4, %4, %1:b\n"
2225+ "ld.d\t%3, %6[32*4]\n"
2226+ "st.d\t%7[24*4], %4\n"
2227+
2228+ "mulsatrndwh.w\t%m3, %m3, %m2:t\n"
2229+ "mulsatrndwh.w\t%3, %3, %m2:b\n"
2230+ "ld.d\t%4, %6[34*4]\n"
2231+ "st.d\t%7[26*4], %3\n"
2232+
2233+ "mulsatrndwh.w\t%m4, %m4, %2:t\n"
2234+ "mulsatrndwh.w\t%4, %4, %2:b\n"
2235+ "st.d\t%7[28*4], %4\n"
2236+
2237+ : "=&r"(w0123), "=&r"(w4567), "=&r"(w891011), "=&r"(tmp), "=&r"(tmp2)
2238+ : "r"(wptr), "r"(yptr), "r"(z));
2239+ /*
2240+ MAD_F_ML0(hi, lo, yptr[6], wptr[6]);
2241+ MAD_F_MLA(hi, lo, yptr[12], wptr[0]);
2242+ z[12] = MAD_F_MLZ(hi, lo);
2243+ MAD_F_ML0(hi, lo, yptr[7], wptr[7]);
2244+ MAD_F_MLA(hi, lo, yptr[13], wptr[1]);
2245+ z[13] = MAD_F_MLZ(hi, lo);
2246+ MAD_F_ML0(hi, lo, yptr[8], wptr[8]);
2247+ MAD_F_MLA(hi, lo, yptr[14], wptr[2]);
2248+ z[14] = MAD_F_MLZ(hi, lo);
2249+ MAD_F_ML0(hi, lo, yptr[9], wptr[9]);
2250+ MAD_F_MLA(hi, lo, yptr[15], wptr[3]);
2251+ z[15] = MAD_F_MLZ(hi, lo);
2252+ MAD_F_ML0(hi, lo, yptr[10], wptr[10]);
2253+ MAD_F_MLA(hi, lo, yptr[16], wptr[4]);
2254+ z[16] = MAD_F_MLZ(hi, lo);
2255+ MAD_F_ML0(hi, lo, yptr[11], wptr[11]);
2256+ MAD_F_MLA(hi, lo, yptr[17], wptr[5]);
2257+ z[17] = MAD_F_MLZ(hi, lo);
2258+
2259+ MAD_F_ML0(hi, lo, yptr[18], wptr[6]);
2260+ MAD_F_MLA(hi, lo, yptr[24], wptr[0]);
2261+ z[18] = MAD_F_MLZ(hi, lo);
2262+ MAD_F_ML0(hi, lo, yptr[19], wptr[7]);
2263+ MAD_F_MLA(hi, lo, yptr[25], wptr[1]);
2264+ z[19] = MAD_F_MLZ(hi, lo);
2265+ MAD_F_ML0(hi, lo, yptr[20], wptr[8]);
2266+ MAD_F_MLA(hi, lo, yptr[26], wptr[2]);
2267+ z[20] = MAD_F_MLZ(hi, lo);
2268+ MAD_F_ML0(hi, lo, yptr[21], wptr[9]);
2269+ MAD_F_MLA(hi, lo, yptr[27], wptr[3]);
2270+ z[21] = MAD_F_MLZ(hi, lo);
2271+ MAD_F_ML0(hi, lo, yptr[22], wptr[10]);
2272+ MAD_F_MLA(hi, lo, yptr[28], wptr[4]);
2273+ z[22] = MAD_F_MLZ(hi, lo);
2274+ MAD_F_ML0(hi, lo, yptr[23], wptr[11]);
2275+ MAD_F_MLA(hi, lo, yptr[29], wptr[5]);
2276+ z[23] = MAD_F_MLZ(hi, lo);*/
2277+
2278+
2279+ asm volatile ("ld.d\t%0, %3[6*4]\n"
2280+ "ld.d\t%1, %3[12*4]\n"
2281+ "mulwh.d\t%2, %m0, %5:t\n"
2282+ "macwh.d\t%2, %m1, %m4:t\n"
2283+ "mulwh.d\t%0, %0, %5:b\n"
2284+ "macwh.d\t%0, %1, %m4:b\n"
2285+ "lsl\t%m2, 1\n"
2286+ "lsl\t%2, %m0, 1\n"
2287+ "st.d\t%6[12*4], %2\n"
2288+
2289+ "ld.d\t%0, %3[18*4]\n"
2290+ "ld.d\t%1, %3[24*4]\n"
2291+ "mulwh.d\t%2, %m0, %5:t\n"
2292+ "macwh.d\t%2, %m1, %m4:t\n"
2293+ "mulwh.d\t%0, %0, %5:b\n"
2294+ "macwh.d\t%0, %1, %m4:b\n"
2295+ "lsl\t%m2, 1\n"
2296+ "lsl\t%2, %m0, 1\n"
2297+ "st.d\t%6[18*4], %2\n"
2298+
2299+ : "=&r"(tmp), "=&r"(tmp2), "=&r"(tmp3)
2300+ : "r"(yptr), "r"(w0123), "r"(w4567), "r"(z));
2301+
2302+ asm volatile ("ld.d\t%0, %3[8*4]\n"
2303+ "ld.d\t%1, %3[14*4]\n"
2304+ "mulwh.d\t%2, %m0, %m5:t\n"
2305+ "macwh.d\t%2, %m1, %4:t\n"
2306+ "mulwh.d\t%0, %0, %m5:b\n"
2307+ "macwh.d\t%0, %1, %4:b\n"
2308+ "lsl\t%m2, 1\n"
2309+ "lsl\t%2, %m0, 1\n"
2310+ "st.d\t%6[14*4], %2\n"
2311+
2312+ "ld.d\t%0, %3[20*4]\n"
2313+ "ld.d\t%1, %3[26*4]\n"
2314+ "mulwh.d\t%2, %m0, %m5:t\n"
2315+ "macwh.d\t%2, %m1, %4:t\n"
2316+ "mulwh.d\t%0, %0, %m5:b\n"
2317+ "macwh.d\t%0, %1, %4:b\n"
2318+ "lsl\t%m2, 1\n"
2319+ "lsl\t%2, %m0, 1\n"
2320+ "st.d\t%6[20*4], %2\n"
2321+
2322+ : "=&r"(tmp), "=&r"(tmp2), "=&r"(tmp3)
2323+ : "r"(yptr), "r"(w0123), "r"(w891011), "r"(z));
2324+
2325+ asm volatile ("ld.d\t%0, %3[10*4]\n"
2326+ "ld.d\t%1, %3[16*4]\n"
2327+ "mulwh.d\t%2, %m0, %5:t\n"
2328+ "macwh.d\t%2, %m1, %m4:t\n"
2329+ "mulwh.d\t%0, %0, %5:b\n"
2330+ "macwh.d\t%0, %1, %m4:b\n"
2331+ "lsl\t%m2, 1\n"
2332+ "lsl\t%2, %m0, 1\n"
2333+ "st.d\t%6[16*4], %2\n"
2334+
2335+ "ld.d\t%0, %3[22*4]\n"
2336+ "ld.d\t%1, %3[28*4]\n"
2337+ "mulwh.d\t%2, %m0, %5:t\n"
2338+ "macwh.d\t%2, %m1, %m4:t\n"
2339+ "mulwh.d\t%0, %0, %5:b\n"
2340+ "macwh.d\t%0, %1, %m4:b\n"
2341+ "lsl\t%m2, 1\n"
2342+ "lsl\t%2, %m0, 1\n"
2343+ "st.d\t%6[22*4], %2\n"
2344+
2345+ : "=&r"(tmp), "=&r"(tmp2), "=&r"(tmp3)
2346+ : "r"(yptr), "r"(w4567), "r"(w891011), "r"(z));
2347+
2348+ }
2349+#else
2350 for (i = 0; i < 6; ++i) {
2351 z[i + 0] = 0;
2352 z[i + 6] = mad_f_mul(yptr[ 0 + 0], wptr[0]);
2353@@ -2218,8 +2522,15 @@ void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36])
2354 ++yptr;
2355 ++wptr;
2356 }
2357+#endif
2358 }
2359
2360+#ifdef FPM_AVR32
2361+# undef mad_f_mul
2362+# define mad_f_mul(x, y) ((((x) + (1L << 11)) >> 12) * \
2363+ (((y) + (1L << 15)) >> 16))
2364+#endif
2365+
2366 /*
2367 * NAME: III_overlap()
2368 * DESCRIPTION: perform overlap-add of windowed IMDCT outputs
2369diff --git a/synth.c b/synth.c
2370index 1d28d43..f42d49b 100644
2371--- a/synth.c
2372+++ b/synth.c
2373@@ -29,20 +29,6 @@
2374 # include "frame.h"
2375 # include "synth.h"
2376
2377-/*
2378- * NAME: synth->init()
2379- * DESCRIPTION: initialize synth struct
2380- */
2381-void mad_synth_init(struct mad_synth *synth)
2382-{
2383- mad_synth_mute(synth);
2384-
2385- synth->phase = 0;
2386-
2387- synth->pcm.samplerate = 0;
2388- synth->pcm.channels = 0;
2389- synth->pcm.length = 0;
2390-}
2391
2392 /*
2393 * NAME: synth->mute()
2394@@ -88,6 +74,10 @@ void mad_synth_mute(struct mad_synth *synth)
2395
2396 /* FPM_DEFAULT without OPT_SSO will actually lose accuracy and performance */
2397
2398+# if defined(FPM_AVR32)
2399+# define OPT_SSO
2400+# endif
2401+
2402 # if defined(FPM_DEFAULT) && !defined(OPT_SSO)
2403 # define OPT_SSO
2404 # endif
2405@@ -522,9 +512,15 @@ void dct32(mad_fixed_t const in[32], unsigned int slot,
2406 # endif
2407 # define ML0(hi, lo, x, y) ((lo) = (x) * (y))
2408 # define MLA(hi, lo, x, y) ((lo) += (x) * (y))
2409-# define MLN(hi, lo) ((lo) = -(lo))
2410-# define MLZ(hi, lo) ((void) (hi), (mad_fixed_t) (lo))
2411-# define SHIFT(x) ((x) >> 2)
2412+# if defined(FPM_AVR32)
2413+# define MLN(hi, lo) MAD_F_MLN((hi), (lo))
2414+# define MLZ(hi, lo) (hi)
2415+# define SHIFT(x) ((x) << 2)
2416+# else
2417+# define MLN(hi, lo) ((lo) = -(lo))
2418+# define MLZ(hi, lo) ((void) (hi), (mad_fixed_t) (lo))
2419+# define SHIFT(x) ((x) >> 2)
2420+# endif
2421 # define PRESHIFT(x) ((MAD_F(x) + (1L << 13)) >> 14)
2422 # else
2423 # define ML0(hi, lo, x, y) MAD_F_ML0((hi), (lo), (x), (y))
2424@@ -541,11 +537,54 @@ void dct32(mad_fixed_t const in[32], unsigned int slot,
2425 # endif
2426 # endif
2427
2428+/*
2429+ * NAME: synth->init()
2430+ * DESCRIPTION: initialize synth struct
2431+ */
2432+
2433+#ifdef FPM_AVR32
2434+short Dmod[17][33];
2435+#endif
2436+
2437 static
2438+#ifdef FPM_AVR32
2439+short const D[17][32] = {
2440+#else
2441 mad_fixed_t const D[17][32] = {
2442+#endif
2443 # include "D.dat"
2444 };
2445
2446+void mad_synth_init(struct mad_synth *synth)
2447+{
2448+
2449+ mad_synth_mute(synth);
2450+
2451+ synth->phase = 0;
2452+
2453+ synth->pcm.samplerate = 0;
2454+ synth->pcm.channels = 0;
2455+ synth->pcm.length = 0;
2456+
2457+#ifdef FPM_AVR32
2458+ {
2459+ int i, j;
2460+ for ( i = 0; i < 17; i++ ){
2461+ for ( j = 0; j < 32; j++ ){
2462+ if ( j & 1 ){
2463+ Dmod[i][17 + (j >> 1)]= D[i][j];
2464+ } else {
2465+ Dmod[i][(j >> 1)]= D[i][j];
2466+ }
2467+ }
2468+
2469+ Dmod[i][16]= Dmod[i][16+8];
2470+ }
2471+ }
2472+#endif
2473+
2474+}
2475+
2476 # if defined(ASO_SYNTH)
2477 void synth_full(struct mad_synth *, struct mad_frame const *,
2478 unsigned int, unsigned int);
2479@@ -560,9 +599,13 @@ void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
2480 {
2481 unsigned int phase, ch, s, sb, pe, po;
2482 mad_fixed_t *pcm1, *pcm2, (*filter)[2][2][16][8];
2483- mad_fixed_t const (*sbsample)[36][32];
2484+ mad_fixed_t /*const*/ (*sbsample)[36][32];
2485 register mad_fixed_t (*fe)[8], (*fx)[8], (*fo)[8];
2486+#ifdef FPM_AVR32
2487+ register short const (*Dptr)[32], *ptr;
2488+#else
2489 register mad_fixed_t const (*Dptr)[32], *ptr;
2490+#endif
2491 register mad_fixed64hi_t hi;
2492 register mad_fixed64lo_t lo;
2493
2494@@ -573,6 +616,20 @@ void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
2495 pcm1 = synth->pcm.samples[ch];
2496
2497 for (s = 0; s < ns; ++s) {
2498+# ifdef FPM_AVR32
2499+/*
2500+ int i;
2501+ for ( i = 0; i < 32; i++ ){
2502+ (*sbsample)[s][i] = ((*sbsample)[s][i] + (1 << 13)) & 0xFFFFC000;
2503+ }
2504+*/
2505+ dct32_avr32((*sbsample)[s], phase >> 1,
2506+ (*filter)[0][phase & 1], (*filter)[1][phase & 1]);
2507+ /* printf("dct32: %d\n", GET_CYCLES);*/
2508+ pcm1 = synth_avr32(phase, (mad_fixed_t *)filter, \
2509+ pcm1, (short *)&Dmod[0]);
2510+ /* printf("synth_window: %d\n", GET_CYCLES);*/
2511+# else
2512 dct32((*sbsample)[s], phase >> 1,
2513 (*filter)[0][phase & 1], (*filter)[1][phase & 1]);
2514
2515@@ -679,6 +736,7 @@ void synth_full(struct mad_synth *synth, struct mad_frame const *frame,
2516 MLA(hi, lo, (*fo)[7], ptr[ 2]);
2517
2518 *pcm1 = SHIFT(-MLZ(hi, lo));
2519+# endif
2520 pcm1 += 16;
2521
2522 phase = (phase + 1) % 16;
2523diff --git a/synth_avr32.S b/synth_avr32.S
2524new file mode 100644
2525index 0000000..701077b
2526--- /dev/null
2527+++ b/synth_avr32.S
2528@@ -0,0 +1,394 @@
2529+/*
2530+ Optimized function for speeding up synthesis filter
2531+ in MPEG Audio Decoding.
2532+ Copyright 2003-2006 Atmel Corporation.
2533+
2534+ Written by Ronny Pedersen and Lars Even Almås, Atmel Norway
2535+
2536+ This program is free software; you can redistribute it and/or modify
2537+ it under the terms of the GNU General Public License as published by
2538+ the Free Software Foundation; either version 2 of the License, or
2539+ (at your option) any later version.
2540+
2541+ This program is distributed in the hope that it will be useful,
2542+ but WITHOUT ANY WARRANTY; without even the implied warranty of
2543+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2544+ GNU General Public License for more details.
2545+
2546+ You should have received a copy of the GNU General Public License
2547+ along with this program; if not, write to the Free Software
2548+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
2549+
2550+
2551+/* *****************
2552+ Defining macros
2553+ ***************** */
2554+
2555+ .macro window_1 f, ptr, acc, ptr_offset, mul, tmp1_lo, tmp1_hi, tmp2_lo, tmp2_hi, tmp3_lo, tmp3_hi
2556+ ld.d \tmp1_lo, \f[0*4] /* tmp1 = { f[0], f[1] } */
2557+ ld.w \tmp2_lo, \ptr[0*2+\ptr_offset*2] /* tmp2_lo = { ptr[0], ptr[1] }*/
2558+ ld.d \tmp3_lo, \f[6*4] /* tmp3 = { f[6], f[7] } */
2559+ ld.w \tmp2_hi, \ptr[6*2+\ptr_offset*2] /* tmp2_hi = { ptr[6], ptr[7] }*/
2560+ .if \mul
2561+ mulwh.d \acc, \tmp1_hi, \tmp2_lo:t /* f[0] * ptr[0]*/
2562+ .else
2563+ macwh.d \acc, \tmp1_hi, \tmp2_lo:t /* f[0] * ptr[0]*/
2564+ .endif
2565+ macwh.d \acc, \tmp3_lo, \tmp2_lo:b /* f[7] * ptr[1]*/
2566+ ld.w \tmp2_lo, \ptr[2*2+\ptr_offset*2] /* tmp2_lo = { ptr[2], ptr[3] }*/
2567+ macwh.d \acc, \tmp1_lo, \tmp2_hi:b /* f[1] * ptr[7]*/
2568+ ld.d \tmp1_lo, \f[2*4] /* tmp1 = { f[2], f[3] } */
2569+
2570+ macwh.d \acc, \tmp3_hi, \tmp2_lo:t /* f[6] * ptr[2]*/
2571+ macwh.d \acc, \tmp1_hi, \tmp2_hi:t /* f[2] * ptr[6]*/
2572+ ld.d \tmp3_lo, \f[4*4] /* tmp3 = { f[4], f[5] } */
2573+ ld.w \tmp2_hi, \ptr[4*2+\ptr_offset*2] /* tmp2_hi = { ptr[4], ptr[5] }*/
2574+ macwh.d \acc, \tmp3_lo, \tmp2_lo:b /* f[5] * ptr[3]*/
2575+
2576+ macwh.d \acc, \tmp1_lo, \tmp2_hi:b /* f[3] * ptr[5]*/
2577+ macwh.d \acc, \tmp3_hi, \tmp2_hi:t /* f[4] * ptr[4]*/
2578+ .endm
2579+
2580+ .macro window_2 f, ptr, acc, ptr_offset, mul, tmp1_lo, tmp1_hi, tmp2_lo, tmp2_hi, tmp3_lo, tmp3_hi
2581+ ld.d \tmp1_lo, \f[0*4] /* tmp1 = { f[0], f[1] } */
2582+ ld.w \tmp2_lo, \ptr[7*2+\ptr_offset*2] /* tmp2_lo = { ptr[7], ptr[8] }*/
2583+ ld.d \tmp3_lo, \f[2*4] /* tmp3 = { f[2], f[3] } */
2584+ ld.w \tmp2_hi, \ptr[9*2+\ptr_offset*2] /* tmp2_hi = { ptr[9], ptr[10] }*/
2585+ .if \mul
2586+ mulwh.d \acc, \tmp1_hi, \tmp2_lo:t /* f[0] * ptr[7]*/
2587+ .else
2588+ macwh.d \acc, \tmp1_hi, \tmp2_lo:t /* f[0] * ptr[7]*/
2589+ .endif
2590+ macwh.d \acc, \tmp1_lo, \tmp2_lo:b /* f[1] * ptr[8]*/
2591+
2592+ ld.d \tmp1_lo, \f[4*4] /* tmp1 = { f[4], f[5] } */
2593+ ld.w \tmp2_lo, \ptr[11*2+\ptr_offset*2] /* tmp2_lo = { ptr[11], ptr[12] }*/
2594+
2595+ macwh.d \acc, \tmp3_hi, \tmp2_hi:t /* f[2] * ptr[9]*/
2596+ macwh.d \acc, \tmp3_lo, \tmp2_hi:b /* f[3] * ptr[10]*/
2597+
2598+ ld.d \tmp3_lo, \f[6*4] /* tmp3 = { f[6], f[7] } */
2599+ ld.w \tmp2_hi, \ptr[13*2+\ptr_offset*2] /* tmp2_hi = { ptr[13], ptr[14] }*/
2600+
2601+ macwh.d \acc, \tmp1_hi, \tmp2_lo:t /* f[4] * ptr[11]*/
2602+ macwh.d \acc, \tmp1_lo, \tmp2_lo:b /* f[5] * ptr[12]*/
2603+ macwh.d \acc, \tmp3_hi, \tmp2_hi:t /* f[6] * ptr[13]*/
2604+ macwh.d \acc, \tmp3_lo, \tmp2_hi:b /* f[7] * ptr[14]*/
2605+ .endm
2606+
2607+ .macro scale res, d_lo, d_hi
2608+ lsl \d_hi, 2
2609+ .endm
2610+
2611+/* **********************
2612+ Starting main function
2613+ ********************** */
2614+
2615+/* Function synth_avr32 is called from synth.c with arguments:
2616+ phase, filter, *pcm1, &Dmod[0] */
2617+
2618+ .global synth_avr32
2619+synth_avr32:
2620+ pushm r0-r7, lr
2621+ sub sp, 8
2622+
2623+ /* R12 = phase, R11 = filter, R10 = pcm1, r9 = D*/
2624+ bld r12, 0
2625+ brcc synth_even
2626+
2627+ /* Filter for odd phases */
2628+
2629+ /* fe = &(*filter)[0][1][0];
2630+ fx = &(*filter)[0][0][0];
2631+ fo = &(*filter)[1][0][0]; */
2632+ sub lr /*fe*/, r11, -16*8*4
2633+ sub r8 /*fo*/, r11, -16*8*4*2
2634+
2635+ /* pe = phase >> 1; */
2636+ lsr r12, 1
2637+ stdsp sp[4], r12
2638+ /* ptr = (short const *)Dmod + pe; */
2639+ add r12, r9, r12 << 1
2640+
2641+ /* ML0(hi, lo, (*fx)[0], ptr[0 + 17]);
2642+ MLA(hi, lo, (*fx)[1], ptr[7 + 17]);
2643+ MLA(hi, lo, (*fx)[2], ptr[6 + 17]);
2644+ MLA(hi, lo, (*fx)[3], ptr[5 + 17]);
2645+ MLA(hi, lo, (*fx)[4], ptr[4 + 17]);
2646+ MLA(hi, lo, (*fx)[5], ptr[3 + 17]);
2647+ MLA(hi, lo, (*fx)[6], ptr[2 + 17]);
2648+ MLA(hi, lo, (*fx)[7], ptr[1 + 17]); */
2649+ window_1 r11/*fx*/,r12/*ptr*/,r0/*acc*/,17/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
2650+
2651+ /* MLN(hi, lo); */
2652+ neg r0
2653+ acr r1
2654+ neg r1
2655+
2656+ /* MLA(hi, lo, (*fe)[0], ptr[0]);
2657+ MLA(hi, lo, (*fe)[1], ptr[7]);
2658+ MLA(hi, lo, (*fe)[2], ptr[6]);
2659+ MLA(hi, lo, (*fe)[3], ptr[5]);
2660+ MLA(hi, lo, (*fe)[4], ptr[4]);
2661+ MLA(hi, lo, (*fe)[5], ptr[3]);
2662+ MLA(hi, lo, (*fe)[6], ptr[2]);
2663+ MLA(hi, lo, (*fe)[7], ptr[1]); */
2664+ window_1 lr/*fe*/,r12/*ptr*/,r0/*acc*/,0/*off*/,0/*mac*/,r2,r3,r4,r5,r6,r7
2665+
2666+ /* *pcm1++ = SHIFT(MLZ(hi, lo));
2667+
2668+ pcm2 = pcm1 + 31; */
2669+ scale r1, r0, r1
2670+ st.w r10/*pcm_1*/++, r1
2671+ sub r11/*pcm2*/, r10, -4*31
2672+
2673+ /* for (sb = 1; sb < 16; ++sb) { */
2674+ mov r2, 15
2675+ stdsp sp[0], r2
2676+odd_loop:
2677+ /* ++fe;
2678+ ptr += 33; */
2679+ sub lr /*fe*/, -8*4
2680+ sub r12, -33*2
2681+
2682+ /* ML0(hi, lo, (*fo)[0], ptr[0 + 17]);
2683+ MLA(hi, lo, (*fo)[1], ptr[7 + 17]);
2684+ MLA(hi, lo, (*fo)[2], ptr[6 + 17]);
2685+ MLA(hi, lo, (*fo)[3], ptr[5 + 17]);
2686+ MLA(hi, lo, (*fo)[4], ptr[4 + 17]);
2687+ MLA(hi, lo, (*fo)[5], ptr[3 + 17]);
2688+ MLA(hi, lo, (*fo)[6], ptr[2 + 17]);
2689+ MLA(hi, lo, (*fo)[7], ptr[1 + 17]); */
2690+ window_1 r8/*fo*/,r12/*ptr*/,r0/*acc*/,17/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
2691+ /* MLN(hi, lo); */
2692+
2693+ neg r0
2694+ acr r1
2695+ neg r1
2696+
2697+ /* MLA(hi, lo, (*fe)[7], ptr[1]);
2698+ MLA(hi, lo, (*fe)[6], ptr[2]);
2699+ MLA(hi, lo, (*fe)[5], ptr[3]);
2700+ MLA(hi, lo, (*fe)[4], ptr[4]);
2701+ MLA(hi, lo, (*fe)[3], ptr[5]);
2702+ MLA(hi, lo, (*fe)[2], ptr[6]);
2703+ MLA(hi, lo, (*fe)[1], ptr[7]);
2704+ MLA(hi, lo, (*fe)[0], ptr[0]); */
2705+ window_1 lr/*fe*/,r12/*ptr*/,r0/*acc*/,0/*off*/,0/*mac*/,r2,r3,r4,r5,r6,r7
2706+
2707+ /* ptr -= 2*pe; */
2708+ lddsp r2, sp[4]
2709+
2710+ /* *pcm1++ = SHIFT(MLZ(hi, lo)); */
2711+
2712+ scale r1, r0, r1
2713+ sub r12/*ptr*/, r12, r2/*pe*/<< 2
2714+ st.w r10/*pcm_1*/++, r1
2715+
2716+
2717+ /* ML0(hi, lo, (*fe)[0], ptr[7 + 17]);
2718+ MLA(hi, lo, (*fe)[1], ptr[8 + 17]);
2719+ MLA(hi, lo, (*fe)[2], ptr[9 + 17]);
2720+ MLA(hi, lo, (*fe)[3], ptr[10 + 17]);
2721+ MLA(hi, lo, (*fe)[4], ptr[11 + 17]);
2722+ MLA(hi, lo, (*fe)[5], ptr[12 + 17]);
2723+ MLA(hi, lo, (*fe)[6], ptr[13 + 17]);
2724+ MLA(hi, lo, (*fe)[7], ptr[14 + 17]); */
2725+ window_2 lr/*fe*/,r12/*ptr*/,r0/*acc*/,17/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
2726+ /* MLA(hi, lo, (*fo)[7], ptr[14]);
2727+ MLA(hi, lo, (*fo)[6], ptr[13]);
2728+ MLA(hi, lo, (*fo)[5], ptr[12]);
2729+ MLA(hi, lo, (*fo)[4], ptr[11]);
2730+ MLA(hi, lo, (*fo)[3], ptr[10]);
2731+ MLA(hi, lo, (*fo)[2], ptr[9]);
2732+ MLA(hi, lo, (*fo)[1], ptr[8]);
2733+ MLA(hi, lo, (*fo)[0], ptr[7]); */
2734+ window_2 r8/*fo*/,r12/*ptr*/,r0/*acc*/,0/*off*/,0/*mac*/,r2,r3,r4,r5,r6,r7
2735+
2736+
2737+ /* *pcm2-- = SHIFT(MLZ(hi, lo)); */
2738+ lddsp r3, sp[4]
2739+ lddsp r2, sp[0]
2740+ scale r1, r0, r1
2741+ st.w --r11/*pcm_2*/, r1
2742+
2743+ /* ptr += 2*pe; */
2744+ add r12/*ptr*/, r12, r3/*pe*/<< 2
2745+
2746+ /* ++fo;
2747+ } */
2748+ sub r8/*fo*/, -8*4
2749+
2750+ sub r2, 1
2751+ stdsp sp[0], r2
2752+ brne odd_loop
2753+
2754+ /* ptr += 33; */
2755+ sub r12/*ptr*/, -33*2
2756+
2757+ /* ML0(hi, lo, (*fo)[0], ptr[0 + 17]);
2758+ MLA(hi, lo, (*fo)[1], ptr[7 + 17]);
2759+ MLA(hi, lo, (*fo)[2], ptr[6 + 17]);
2760+ MLA(hi, lo, (*fo)[3], ptr[5 + 17]);
2761+ MLA(hi, lo, (*fo)[4], ptr[4 + 17]);
2762+ MLA(hi, lo, (*fo)[5], ptr[3 + 17]);
2763+ MLA(hi, lo, (*fo)[6], ptr[2 + 17]);
2764+ MLA(hi, lo, (*fo)[7], ptr[1 + 17]); */
2765+ window_1 r8/*fo*/,r12/*ptr*/,r0/*acc*/,17/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
2766+
2767+ rjmp synth_end
2768+synth_even:
2769+ /* Filter for even phases */
2770+
2771+ /* fe = &(*filter)[0][0][0];
2772+ fx = &(*filter)[0][1][0];
2773+ fo = &(*filter)[1][1][0]; */
2774+ sub lr /*fx*/, r11, -16*8*4
2775+ sub r8 /*fo*/, r11, -(16*8*4*2 + 16*8*4)
2776+
2777+ /* po = ((phase - 1) & 0xF) >> 1; */
2778+ sub r12, 1
2779+ andl r12, 0xe, COH
2780+ stdsp sp[4], r12
2781+ /* ptr = (short const *)Dmod + po; */
2782+ add r12, r9, r12
2783+
2784+ /* ML0(hi, lo, (*fx)[0], ptr[0 + 17]);
2785+ MLA(hi, lo, (*fx)[1], ptr[7 + 17]);
2786+ MLA(hi, lo, (*fx)[2], ptr[6 + 17]);
2787+ MLA(hi, lo, (*fx)[3], ptr[5 + 17]);
2788+ MLA(hi, lo, (*fx)[4], ptr[4 + 17]);
2789+ MLA(hi, lo, (*fx)[5], ptr[3 + 17]);
2790+ MLA(hi, lo, (*fx)[6], ptr[2 + 17]);
2791+ MLA(hi, lo, (*fx)[7], ptr[1 + 17]); */
2792+ window_1 lr/*fx*/,r12/*ptr*/,r0/*acc*/,17/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
2793+
2794+ /* MLN(hi, lo); */
2795+ neg r0
2796+ acr r1
2797+ neg r1
2798+
2799+ /* MLA(hi, lo, (*fe)[0], ptr[0 + 1]);
2800+ MLA(hi, lo, (*fe)[1], ptr[7 + 1]);
2801+ MLA(hi, lo, (*fe)[2], ptr[6 + 1]);
2802+ MLA(hi, lo, (*fe)[3], ptr[5 + 1]);
2803+ MLA(hi, lo, (*fe)[4], ptr[4 + 1]);
2804+ MLA(hi, lo, (*fe)[5], ptr[3 + 1]);
2805+ MLA(hi, lo, (*fe)[6], ptr[2 + 1]);
2806+ MLA(hi, lo, (*fe)[7], ptr[1 + 1]); */
2807+ window_1 r11/*fe*/,r12/*ptr*/,r0/*acc*/,1/*off*/,0/*mac*/,r2,r3,r4,r5,r6,r7
2808+
2809+ /* *pcm1++ = SHIFT(MLZ(hi, lo));
2810+
2811+ pcm2 = pcm1 + 31; */
2812+ scale r1, r0, r1
2813+ st.w r10/*pcm_1*/++, r1
2814+ sub lr/*pcm2*/, r10, -4*31
2815+
2816+ /* for (sb = 1; sb < 16; ++sb) { */
2817+ mov r2, 15
2818+ stdsp sp[0], r2
2819+even_loop:
2820+ /* ++fe;
2821+ ptr += 33; */
2822+ sub r11 /*fe*/, -8*4
2823+ sub r12, -33*2
2824+
2825+ /* ML0(hi, lo, (*fo)[0], ptr[0 + 17]);
2826+ MLA(hi, lo, (*fo)[1], ptr[7 + 17]);
2827+ MLA(hi, lo, (*fo)[2], ptr[6 + 17]);
2828+ MLA(hi, lo, (*fo)[3], ptr[5 + 17]);
2829+ MLA(hi, lo, (*fo)[4], ptr[4 + 17]);
2830+ MLA(hi, lo, (*fo)[5], ptr[3 + 17]);
2831+ MLA(hi, lo, (*fo)[6], ptr[2 + 17]);
2832+ MLA(hi, lo, (*fo)[7], ptr[1 + 17]); */
2833+ window_1 r8/*fo*/,r12/*ptr*/,r0/*acc*/,17/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
2834+ /* MLN(hi, lo); */
2835+ neg r0
2836+ acr r1
2837+ neg r1
2838+
2839+ /* MLA(hi, lo, (*fe)[7], ptr[1 + 1]);
2840+ MLA(hi, lo, (*fe)[6], ptr[2 + 1]);
2841+ MLA(hi, lo, (*fe)[5], ptr[3 + 1]);
2842+ MLA(hi, lo, (*fe)[4], ptr[4 + 1]);
2843+ MLA(hi, lo, (*fe)[3], ptr[5 + 1]);
2844+ MLA(hi, lo, (*fe)[2], ptr[6 + 1]);
2845+ MLA(hi, lo, (*fe)[1], ptr[7 + 1]);
2846+ MLA(hi, lo, (*fe)[0], ptr[0 + 1]); */
2847+ window_1 r11/*fe*/,r12/*ptr*/,r0/*acc*/,1/*off*/,0/*mac*/,r2,r3,r4,r5,r6,r7
2848+
2849+ /* *pcm1++ = SHIFT(MLZ(hi, lo)); */
2850+ lddsp r2, sp[4]
2851+ scale r1, r0, r1
2852+ /* ptr -= 2*po; */
2853+ sub r12/*ptr*/, r12, r2/*po*/<< 1
2854+ st.w r10/*pcm_1*/++, r1
2855+
2856+
2857+ /* ML0(hi, lo, (*fe)[0], ptr[7 + 17 - 1]);
2858+ MLA(hi, lo, (*fe)[1], ptr[8 + 17 - 1]);
2859+ MLA(hi, lo, (*fe)[2], ptr[9 + 17 - 1]);
2860+ MLA(hi, lo, (*fe)[3], ptr[10 + 17 - 1]);
2861+ MLA(hi, lo, (*fe)[4], ptr[11 + 17 - 1]);
2862+ MLA(hi, lo, (*fe)[5], ptr[12 + 17 - 1]);
2863+ MLA(hi, lo, (*fe)[6], ptr[13 + 17 - 1]);
2864+ MLA(hi, lo, (*fe)[7], ptr[14 + 17 - 1]); */
2865+ window_2 r11/*fe*/,r12/*ptr*/,r0/*acc*/,16/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
2866+ /* MLA(hi, lo, (*fo)[7], ptr[14]);
2867+ MLA(hi, lo, (*fo)[6], ptr[13]);
2868+ MLA(hi, lo, (*fo)[5], ptr[12]);
2869+ MLA(hi, lo, (*fo)[4], ptr[11]);
2870+ MLA(hi, lo, (*fo)[3], ptr[10]);
2871+ MLA(hi, lo, (*fo)[2], ptr[9]);
2872+ MLA(hi, lo, (*fo)[1], ptr[8]);
2873+ MLA(hi, lo, (*fo)[0], ptr[7]); */
2874+ window_2 r8/*fo*/,r12/*ptr*/,r0/*acc*/,0/*off*/,0/*mac*/,r2,r3,r4,r5,r6,r7
2875+
2876+
2877+ /* *pcm2-- = SHIFT(MLZ(hi, lo)); */
2878+ lddsp r3, sp[4]
2879+ lddsp r2, sp[0]
2880+ scale r1, r0, r1
2881+ st.w --lr/*pcm_2*/, r1
2882+
2883+ /* ptr += 2*po; */
2884+ add r12/*ptr*/, r12, r3/*po*/<< 1
2885+
2886+ /* ++fo;
2887+ } */
2888+ sub r8/*fo*/, -8*4
2889+
2890+ sub r2, 1
2891+ stdsp sp[0], r2
2892+ brne even_loop
2893+
2894+ /* ptr += 33; */
2895+ sub r12/*ptr*/, -33*2
2896+
2897+ /* ML0(hi, lo, (*fo)[0], ptr[0 + 17]);
2898+ MLA(hi, lo, (*fo)[1], ptr[7 + 17]);
2899+ MLA(hi, lo, (*fo)[2], ptr[6 + 17]);
2900+ MLA(hi, lo, (*fo)[3], ptr[5 + 17]);
2901+ MLA(hi, lo, (*fo)[4], ptr[4 + 17]);
2902+ MLA(hi, lo, (*fo)[5], ptr[3 + 17]);
2903+ MLA(hi, lo, (*fo)[6], ptr[2 + 17]);
2904+ MLA(hi, lo, (*fo)[7], ptr[1 + 17]); */
2905+ window_1 r8/*fo*/,r12/*ptr*/,r0/*acc*/,17/*off*/,1/*mul*/,r2,r3,r4,r5,r6,r7
2906+
2907+
2908+
2909+synth_end:
2910+ /* *pcm1 = SHIFT(-MLZ(hi, lo)); */
2911+ scale r1, r0, r1
2912+ neg r1
2913+ st.w r10/*pcm_1*/, r1
2914+
2915+ mov r12, r10
2916+ sub sp, -8
2917+ popm r0-r7, pc
2918+
2919+
2920+
2921+
2922+
diff --git a/meta-oe/recipes-multimedia/libmad/files/mad-mips-h-constraint.patch b/meta-oe/recipes-multimedia/libmad/files/mad-mips-h-constraint.patch
deleted file mode 100644
index b65555e1f..000000000
--- a/meta-oe/recipes-multimedia/libmad/files/mad-mips-h-constraint.patch
+++ /dev/null
@@ -1,70 +0,0 @@
1diff -ur libmad-0.15.1b-orig/fixed.h libmad-0.15.1b/fixed.h
2--- libmad-0.15.1b-orig/fixed.h 2004-02-17 12:32:03.000000000 +1030
3+++ libmad-0.15.1b/fixed.h 2009-08-05 10:46:30.000000000 +0930
4@@ -299,6 +299,23 @@
5
6 # elif defined(FPM_MIPS)
7
8+/* Test for gcc >= maj.min, as per __GNUC_PREREQ in glibc */
9+#if defined (__GNUC__) && defined (__GNUC_MINOR__)
10+#define __GNUC_PREREQ(maj, min) \
11+ ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min))
12+#else
13+#define __GNUC_PREREQ(maj, min) 0
14+#endif
15+
16+#if __GNUC_PREREQ(4,4)
17+ typedef unsigned int u64_di_t __attribute__ ((mode (DI)));
18+# define MAD_F_MLX(hi, lo, x, y) \
19+ do { \
20+ u64_di_t __ll = (u64_di_t) (x) * (y); \
21+ hi = __ll >> 32; \
22+ lo = __ll; \
23+ } while (0)
24+#else
25 /*
26 * This MIPS version is fast and accurate; the disposition of the least
27 * significant bit depends on OPT_ACCURACY via mad_f_scale64().
28@@ -328,6 +345,7 @@
29 : "%r" ((x) >> 12), "r" ((y) >> 16))
30 # define MAD_F_MLZ(hi, lo) ((mad_fixed_t) (lo))
31 # endif
32+#endif /* __GNUC_PREREQ(4,4) */
33
34 # if defined(OPT_SPEED)
35 # define mad_f_scale64(hi, lo) \
36diff -ur libmad-0.15.1b-orig/mad.h libmad-0.15.1b/mad.h
37--- libmad-0.15.1b-orig/mad.h 2004-02-17 13:25:44.000000000 +1030
38+++ libmad-0.15.1b/mad.h 2009-08-05 10:42:40.000000000 +0930
39@@ -344,6 +344,23 @@
40
41 # elif defined(FPM_MIPS)
42
43+/* Test for gcc >= maj.min, as per __GNUC_PREREQ in glibc */
44+#if defined (__GNUC__) && defined (__GNUC_MINOR__)
45+#define __GNUC_PREREQ(maj, min) \
46+ ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min))
47+#else
48+#define __GNUC_PREREQ(maj, min) 0
49+#endif
50+
51+#if __GNUC_PREREQ(4,4)
52+ typedef unsigned int u64_di_t __attribute__ ((mode (DI)));
53+# define MAD_F_MLX(hi, lo, x, y) \
54+ do { \
55+ u64_di_t __ll = (u64_di_t) (x) * (y); \
56+ hi = __ll >> 32; \
57+ lo = __ll; \
58+ } while (0)
59+#else
60 /*
61 * This MIPS version is fast and accurate; the disposition of the least
62 * significant bit depends on OPT_ACCURACY via mad_f_scale64().
63@@ -373,6 +390,7 @@
64 : "%r" ((x) >> 12), "r" ((y) >> 16))
65 # define MAD_F_MLZ(hi, lo) ((mad_fixed_t) (lo))
66 # endif
67+#endif /* __GNUC_PREREQ(4,4) */
68
69 # if defined(OPT_SPEED)
70 # define mad_f_scale64(hi, lo) \
diff --git a/meta-oe/recipes-multimedia/libmad/files/mad.diff b/meta-oe/recipes-multimedia/libmad/files/mad.diff
deleted file mode 100644
index 851dc0120..000000000
--- a/meta-oe/recipes-multimedia/libmad/files/mad.diff
+++ /dev/null
@@ -1,24 +0,0 @@
1--- /tmp/configure.ac 2008-07-11 10:19:17.000000000 +0200
2+++ libmad-0.15.1b/configure.ac 2008-07-11 10:20:00.313198000 +0200
3@@ -140,21 +140,14 @@
4 case "$optimize" in
5 -O|"-O "*)
6 optimize="-O"
7- optimize="$optimize -fforce-mem"
8- optimize="$optimize -fforce-addr"
9 : #x optimize="$optimize -finline-functions"
10 : #- optimize="$optimize -fstrength-reduce"
11- optimize="$optimize -fthread-jumps"
12- optimize="$optimize -fcse-follow-jumps"
13- optimize="$optimize -fcse-skip-blocks"
14 : #x optimize="$optimize -frerun-cse-after-loop"
15 : #x optimize="$optimize -frerun-loop-opt"
16 : #x optimize="$optimize -fgcse"
17 optimize="$optimize -fexpensive-optimizations"
18- optimize="$optimize -fregmove"
19 : #* optimize="$optimize -fdelayed-branch"
20 : #x optimize="$optimize -fschedule-insns"
21- optimize="$optimize -fschedule-insns2"
22 : #? optimize="$optimize -ffunction-sections"
23 : #? optimize="$optimize -fcaller-saves"
24 : #> optimize="$optimize -funroll-loops"
diff --git a/meta-oe/recipes-multimedia/libmad/libmad-0.15.1b/obsolete_automake_macros.patch b/meta-oe/recipes-multimedia/libmad/libmad-0.15.1b/obsolete_automake_macros.patch
deleted file mode 100644
index b0f5f77e2..000000000
--- a/meta-oe/recipes-multimedia/libmad/libmad-0.15.1b/obsolete_automake_macros.patch
+++ /dev/null
@@ -1,14 +0,0 @@
1Upstream-Status: Submitted (https://sourceforge.net/tracker/?group_id=12349&atid=112349)
2
3Signed-off-by: Marko Lindqvist <cazfi74@gmail.com>
4diff -Nurd libmad-0.15.1b/configure.ac libmad-0.15.1b/configure.ac
5--- libmad-0.15.1b/configure.ac 2004-01-23 11:41:32.000000000 +0200
6+++ libmad-0.15.1b/configure.ac 2013-01-03 08:28:23.718693697 +0200
7@@ -28,7 +28,7 @@
8
9 AM_INIT_AUTOMAKE
10
11-AM_CONFIG_HEADER([config.h])
12+AC_CONFIG_HEADERS([config.h])
13
14 dnl System type.
diff --git a/meta-oe/recipes-multimedia/libmad/libmad_0.15.1b.bb b/meta-oe/recipes-multimedia/libmad/libmad_0.15.1b.bb
deleted file mode 100644
index b5ff6988a..000000000
--- a/meta-oe/recipes-multimedia/libmad/libmad_0.15.1b.bb
+++ /dev/null
@@ -1,38 +0,0 @@
1DESCRIPTION = "MPEG Audio Decoder Library"
2SECTION = "libs"
3
4DEPENDS = "libid3tag"
5
6LICENSE = "GPLv2"
7LIC_FILES_CHKSUM = "file://COPYING;md5=94d55d512a9ba36caa9b7df079bae19f"
8
9PR = "r1"
10
11SRC_URI = "${SOURCEFORGE_MIRROR}/mad/libmad-${PV}.tar.gz \
12 file://add-pkgconfig.patch \
13 file://mad.diff \
14 file://mad-mips-h-constraint.patch \
15 file://obsolete_automake_macros.patch \
16"
17
18SRC_URI_append_avr32 = " file://libmad-0.15.1b-avr32-optimization.patch"
19
20SRC_URI[md5sum] = "1be543bc30c56fb6bea1d7bf6a64e66c"
21SRC_URI[sha256sum] = "bbfac3ed6bfbc2823d3775ebb931087371e142bb0e9bb1bee51a76a6e0078690"
22
23S = "${WORKDIR}/libmad-${PV}"
24
25inherit autotools pkgconfig
26
27EXTRA_OECONF = "-enable-speed --enable-shared"
28# The ASO's don't take any account of thumb...
29EXTRA_OECONF_append_thumb = " --disable-aso --enable-fpm=default"
30EXTRA_OECONF_append_arm = " --enable-fpm=arm"
31
32do_configure_prepend () {
33# damn picky automake...
34 touch NEWS AUTHORS ChangeLog
35}
36
37ARM_INSTRUCTION_SET = "arm"
38